From f7753392c3e5d6e04fc6c1a96445f72a7efea7e4 Mon Sep 17 00:00:00 2001
From: devangpratap <115096812+devangpratap@users.noreply.github.com>
Date: Sat, 2 May 2026 23:25:39 -0400
Subject: [PATCH 1/6] Add AST parser module for Python semantic chunking

Introduces ast_parser.py which uses Python's built-in ast module
to extract functions, classes, and methods as individual chunks.
This provides semantically meaningful code units for embedding
instead of arbitrary line-based splits.
---
 ast_parser.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 ast_parser.py

diff --git a/ast_parser.py b/ast_parser.py
new file mode 100644
index 0000000..ed57798
--- /dev/null
+++ b/ast_parser.py
@@ -0,0 +1,108 @@
+# ast_parser.py
+#
+# AST-based chunking for Python files.
+# Extracts functions, classes, and methods as semantically meaningful chunks
+# instead of splitting code into arbitrary line blocks.
+
+import ast
+from pathlib import Path
+from typing import List, Optional
+
+from indexer import Chunk
+
+
+def parse_python_file(path: Path, source: str) -> Optional[List[Chunk]]:
+    """
+    Parse Python source into one chunk per function, method, and class.
+    A chunk starts at the symbol's first decorator so decorators stay attached.
+
+    Returns None if the source cannot be parsed or defines no such symbols.
+    """
+    try:
+        tree = ast.parse(source, filename=str(path))
+    except (SyntaxError, ValueError):  # ValueError: null bytes on some Pythons
+        return None
+
+    chunks: List[Chunk] = []
+    lines = source.splitlines(keepends=True)
+
+    for node in ast.walk(tree):
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+            # Decorators can sit above node.lineno; start at the earliest one
+            start_line = min([node.lineno] + [d.lineno for d in node.decorator_list])
+            end_line = _get_end_line(node, lines)
+
+            chunk_text = "".join(lines[start_line - 1 : end_line])
+
+            # Add context: prepend the symbol name and type for better embeddings
+            node_type = "class" if isinstance(node, ast.ClassDef) else "function"
+            context_prefix = f"# [{node_type}] {node.name}\n"
+
+            chunks.append(
+                Chunk(
+                    file_path=path,
+                    start_line=start_line,
+                    end_line=end_line,
+                    text=context_prefix + chunk_text,
+                )
+            )
+
+    return chunks if chunks else None
+
+
+def _get_end_line(node: ast.AST, lines: List[str]) -> int:
+    """
+    Return the 1-based number of the last source line covered by ``node``.
+    Prefers node.end_lineno; else the largest lineno under it (lines is unused).
+    """
+    end_lineno = getattr(node, "end_lineno", None)
+    if end_lineno is not None:
+        return end_lineno
+
+    # Fallback: ast.walk yields the node itself and all of its descendants;
+    # entries with a missing or falsy lineno are skipped.
+    line_numbers = [
+        n.lineno for n in ast.walk(node) if getattr(n, "lineno", None)
+    ]
+    return max([node.lineno] + line_numbers)
+
+
+def extract_top_level_code(path: Path, source: str) -> Optional[Chunk]:
+    """
+    Extract module-level code that isn't inside any top-level function or class.
+    This includes imports, constants, and top-level statements; decorator lines
+    are treated as part of the symbol they decorate.
+
+    Returns None if the source cannot be parsed or has no module-level text.
+    """
+    try:
+        tree = ast.parse(source, filename=str(path))
+    except (SyntaxError, ValueError):  # ValueError: null bytes on some Pythons
+        return None
+
+    lines = source.splitlines(keepends=True)
+    total_lines = len(lines)
+
+    # Find all line ranges occupied by top-level functions/classes
+    occupied = set()
+    for node in ast.iter_child_nodes(tree):
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+            # Decorators can sit above node.lineno; start at the earliest one
+            start = min([node.lineno] + [d.lineno for d in node.decorator_list])
+            occupied.update(range(start, _get_end_line(node, lines) + 1))
+
+    # Collect non-occupied lines
+    top_level_lines = [
+        line for ln, line in enumerate(lines, start=1) if ln not in occupied
+    ]
+    text = "".join(top_level_lines).strip()
+
+    if not text:
+        return None
+
+    return Chunk(
+        file_path=path,
+        start_line=1,
+        end_line=total_lines,
+        text=f"# [module-level] {path.name}\n{text}",
+    )

From 109bf016accafabc5b29085156cca3c3922712cd Mon Sep 17 00:00:00 2001
From: devangpratap <115096812+devangpratap@users.noreply.github.com>
Date: Sat, 2 May 2026 23:26:35 -0400
Subject: [PATCH 2/6] Integrate AST chunking into indexer with config toggle

The indexer now attempts AST-based parsing for .py files before
falling back to line-based chunking. Added AST_ENABLED and
AST_EXTENSIONS settings to config for easy control.
---
 config.py  | 14 +++++++++++++-
 indexer.py | 52 +++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/config.py b/config.py
index 07d52fb..6770a07 100644
--- a/config.py
+++ b/config.py
@@ -46,9 +46,21 @@
 # Maximum file size (in bytes) to read.
 MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB
 
-# How many lines to group into one chunk.
+# How many lines to group into one chunk (used as fallback when AST parsing fails).
 CHUNK_LINE_COUNT = 40
 
+# ---- AST chunking settings ----
+
+# Enable AST-based chunking for supported file types.
+# When enabled, functions/classes are extracted as individual chunks.
+# Falls back to line-based chunking if parsing fails.
+AST_ENABLED = True
+
+# File extensions that support AST-based parsing.
+AST_EXTENSIONS = {
+    ".py",
+}
+
 # How many top chunks to keep before sending anything to the LLM.
 CANDIDATE_CHUNK_LIMIT = 20
 
diff --git a/indexer.py b/indexer.py
index 6eb3da4..4c775b4 100644
--- a/indexer.py
+++ b/indexer.py
@@ -1,7 +1,7 @@
 # indexer.py
 #
 # Walk the filesystem, filter useful files, and split them into text chunks.
-# No AI here — this is just smart file parsing.
+# Uses AST-based parsing for supported languages, falls back to line-based chunking.
 
 import os
 from pathlib import Path
@@ -14,6 +14,8 @@
     IGNORE_DIRS,
     MAX_FILE_SIZE_BYTES,
     CHUNK_LINE_COUNT,
+    AST_ENABLED,
+    AST_EXTENSIONS,
 )
 
 
@@ -50,29 +52,65 @@ def iter_files(root: Path | None = None):
 
 def read_file_chunks(path: Path) -> List[Chunk]:
     """
-    Read a file, split into CHUNK_LINE_COUNT-line chunks,
-    attach line numbers for better search results.
+    Read a file and split into chunks.
+    Uses AST-based parsing for supported languages if enabled,
+    falls back to line-based chunking otherwise.
     """
     chunks: List[Chunk] = []
 
     try:
-        # skip massive files (logs, binaries, etc.)
         if path.stat().st_size > MAX_FILE_SIZE_BYTES:
             return chunks
 
         with path.open("r", encoding="utf-8", errors="ignore") as f:
-            lines = f.readlines()
+            source = f.read()
     except Exception:
         return chunks
 
-    if not lines:
+    if not source.strip():
         return chunks
 
+    # Try AST-based chunking for supported file types
+    if AST_ENABLED and path.suffix in AST_EXTENSIONS:
+        ast_chunks = _try_ast_chunking(path, source)
+        if ast_chunks:
+            return ast_chunks
+
+    # Fallback: line-based chunking
+    return _line_based_chunking(path, source)
+
+
+def _try_ast_chunking(path: Path, source: str) -> List[Chunk]:
+    """Attempt AST-based chunking. Returns empty list on failure."""
+    if path.suffix == ".py":
+        from ast_parser import parse_python_file, extract_top_level_code
+
+        chunks = []
+
+        # Get function/class chunks
+        ast_chunks = parse_python_file(path, source)
+        if ast_chunks:
+            chunks.extend(ast_chunks)
+
+        # Get module-level code (imports, constants, etc.)
+        top_level = extract_top_level_code(path, source)
+        if top_level:
+            chunks.append(top_level)
+
+        return chunks
+
+    return []
+
+
+def _line_based_chunking(path: Path, source: str) -> List[Chunk]:
+    """Original line-based chunking as fallback."""
+    chunks: List[Chunk] = []
+    lines = source.splitlines(keepends=True)
     total = len(lines)
     i = 0
 
     while i < total:
-        start = i + 1          # 1-based line numbering
+        start = i + 1
         end_index = min(i + CHUNK_LINE_COUNT, total)
         end = end_index
 

From a6ab1be4c8e6ad8f75d5aa4c1e01aac9969408a3 Mon Sep 17 00:00:00 2001
From: devangpratap <115096812+devangpratap@users.noreply.github.com>
Date: Sat, 2 May 2026 23:27:24 -0400
Subject: [PATCH 3/6] Improve search result display for AST-parsed chunks

Show the symbol type and name (function/class) in search results
when available from AST parsing. Also increased preview from 5
to 8 lines for better context visibility.
---
 search.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/search.py b/search.py
index c1326f6..ac2ac88 100644
--- a/search.py
+++ b/search.py
@@ -105,17 +105,28 @@ def format_results(results: List[SearchResult]) -> str:
 
     for i, result in enumerate(results, 1):
         chunk = result.chunk
-        lines.append(
-            f"\n[{i}] {chunk.file_path} "
-            f"(lines {chunk.start_line}-{chunk.end_line}) "
-            f"[score: {result.score:.4f}]"
-        )
+        # Extract symbol tag if present (from AST chunks)
+        symbol_tag = ""
+        first_line = chunk.text.split("\n", 1)[0]
+        if first_line.startswith("# [") and "]" in first_line:
+            symbol_tag = first_line[2:]  # e.g. "[function] my_func"
+
+        location = f"{chunk.file_path} (lines {chunk.start_line}-{chunk.end_line})"
+        header = f"\n[{i}] {location} [score: {result.score:.4f}]"
+        if symbol_tag:
+            header += f"  {symbol_tag}"
+
+        lines.append(header)
         lines.append("-" * 60)
 
-        # Show a preview of the chunk (first 5 lines)
-        preview_lines = chunk.text.split("\n")[:5]
+        # Show a preview of the chunk (skip the context prefix line for AST chunks)
+        text_lines = chunk.text.split("\n")
+        if text_lines and text_lines[0].startswith("# ["):
+            text_lines = text_lines[1:]
+
+        preview_lines = text_lines[:8]
         preview = "\n".join(preview_lines)
-        if len(chunk.text.split("\n")) > 5:
+        if len(text_lines) > 8:
             preview += "\n..."
 
         lines.append(preview)

From c62aea8d00946eb39603b70277dcffab4d5a2531 Mon Sep 17 00:00:00 2001
From: devangpratap <115096812+devangpratap@users.noreply.github.com>
Date: Sat, 2 May 2026 23:28:16 -0400
Subject: [PATCH 4/6] Update README with AST chunking documentation

Document the new AST-based parsing step and add ast_parser.py
to the project structure section.
---
 README.md | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 9442990..abc12e4 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,12 @@ A local semantic search tool for your codebase. Instead of matching exact string
 
 ## How it works
 
-1. **Indexer** scans your project files and splits them into chunks
-2. **Embedding** generates vector embeddings for each chunk using Ollama
-3. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches
+1. **Indexer** scans your project files and splits them into semantically meaningful chunks
+2. **AST Parser** (for Python files) extracts functions, classes, and methods as individual chunks — no more cutting code in half
+3. **Embedding** generates vector embeddings for each chunk using Ollama
+4. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches
+
+For non-Python files (or files with syntax errors), the indexer falls back to line-based chunking.
 
 ## Requirements
 
@@ -37,8 +40,9 @@ python main.py -d /path/to/project
 
 ## Project structure
 
-- `config.py` — settings (model name, allowed file types, chunk size, etc.)
-- `indexer.py` — file walking and chunking
+- `config.py` — settings (model name, allowed file types, chunk size, AST toggle, etc.)
+- `indexer.py` — file walking, AST-aware chunking with line-based fallback
+- `ast_parser.py` — Python AST extraction (functions, classes, module-level code)
 - `embedding.py` — Ollama embedding generation
 - `search.py` — cosine similarity search and result formatting
 - `main.py` — CLI entry point

From 5a22c33281fb17d7c18c0b4e0ce3c5d4ea01a2d9 Mon Sep 17 00:00:00 2001
From: devangpratap <115096812+devangpratap@users.noreply.github.com>
Date: Sat, 9 May 2026 00:38:16 -0400
Subject: [PATCH 5/6] Add JS/TS semantic chunking support

---
 config.py    |   2 +
 indexer.py   |  17 ++++++-
 js_parser.py | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 js_parser.py

diff --git a/config.py b/config.py
index 6770a07..6fc32be 100644
--- a/config.py
+++ b/config.py
@@ -59,6 +59,8 @@
 # File extensions that support AST-based parsing.
 AST_EXTENSIONS = {
     ".py",
+    ".js",
+    ".ts",
 }
 
 # How many top chunks to keep before sending anything to the LLM.
diff --git a/indexer.py b/indexer.py
index 4c775b4..5f8bd3b 100644
--- a/indexer.py
+++ b/indexer.py
@@ -87,18 +87,31 @@ def _try_ast_chunking(path: Path, source: str) -> List[Chunk]:
 
         chunks = []
 
-        # Get function/class chunks
         ast_chunks = parse_python_file(path, source)
         if ast_chunks:
             chunks.extend(ast_chunks)
 
-        # Get module-level code (imports, constants, etc.)
         top_level = extract_top_level_code(path, source)
         if top_level:
             chunks.append(top_level)
 
         return chunks
 
+    if path.suffix in (".js", ".ts"):
+        from js_parser import parse_js_file, extract_js_top_level
+
+        chunks = []
+
+        js_chunks = parse_js_file(path, source)
+        if js_chunks:
+            chunks.extend(js_chunks)
+
+        top_level = extract_js_top_level(path, source)
+        if top_level:
+            chunks.append(top_level)
+
+        return chunks
+
     return []
 
 
diff --git a/js_parser.py b/js_parser.py
new file mode 100644
index 0000000..e392a3d
--- /dev/null
+++ b/js_parser.py
@@ -0,0 +1,141 @@
+# js_parser.py
+#
+# Regex-based chunking for JavaScript/TypeScript files.
+# Extracts functions, classes, and arrow functions as semantic chunks.
+# Not a full AST parser — uses pattern matching for speed and zero dependencies.
+
+import re
+from pathlib import Path
+from typing import List, Optional
+
+from indexer import Chunk
+
+# Line-anchored patterns for JS/TS declarations; group 1 captures the name.
+PATTERNS = [
+    # class declarations (optionally export / default / abstract)
+    re.compile(r"^(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\s+(\w+)", re.MULTILINE),
+    # named function declarations (optionally export / default / async)
+    re.compile(r"^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(\w+)", re.MULTILINE),
+    # const/let/var bound to an arrow function, then to a function expression
+    re.compile(r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[a-zA-Z_]\w*)\s*=>", re.MULTILINE),
+    re.compile(r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?function", re.MULTILINE),
+]
+
+
+def parse_js_file(path: Path, source: str) -> Optional[List[Chunk]]:
+    """
+    Parse a JS/TS file into chunks based on top-level declarations.
+    Each function or class becomes its own chunk, running from its declaration
+    line to the line before the next declaration (or EOF for the last one).
+    Lines above the first declaration are not part of any chunk returned here.
+    The chunk label (class/function) comes from the pattern that matched.
+
+    Returns None if no declarations are found.
+    """
+    lines = source.splitlines(keepends=True)
+    total_lines = len(lines)
+
+    # Find all declaration start positions
+    declarations = []
+    for index, pattern in enumerate(PATTERNS):
+        # PATTERNS[0] is the class pattern; the rest are function forms.
+        # Labelling by matched pattern avoids misreading "class " in a
+        # comment or string on a function's first line.
+        node_type = "class" if index == 0 else "function"
+        for match in pattern.finditer(source):
+            line_no = source[: match.start()].count("\n") + 1
+            declarations.append((line_no, match.group(1), node_type))
+
+    if not declarations:
+        return None
+
+    # Sort by line number (stable, so the earlier pattern wins a tie)
+    declarations.sort(key=lambda x: x[0])
+
+    # Deduplicate overlapping matches at the same line
+    seen_lines = set()
+    unique = []
+    for declaration in declarations:
+        if declaration[0] not in seen_lines:
+            seen_lines.add(declaration[0])
+            unique.append(declaration)
+    declarations = unique
+
+    chunks: List[Chunk] = []
+
+    for i, (start_line, name, node_type) in enumerate(declarations):
+        # End line is either the line before the next declaration or EOF
+        if i + 1 < len(declarations):
+            end_line = declarations[i + 1][0] - 1
+            # Trim trailing blank lines between declarations
+            while end_line > start_line and not lines[end_line - 1].strip():
+                end_line -= 1
+        else:
+            end_line = total_lines
+
+        chunk_text = "".join(lines[start_line - 1 : end_line])
+
+        # Add context: prepend the symbol type and name to the chunk text
+        context_prefix = f"# [{node_type}] {name}\n"
+
+        chunks.append(
+            Chunk(
+                file_path=path,
+                start_line=start_line,
+                end_line=end_line,
+                text=context_prefix + chunk_text,
+            )
+        )
+
+    return chunks if chunks else None
+
+
+def extract_js_top_level(path: Path, source: str) -> Optional[Chunk]:
+    """
+    Extract top-level code (imports, constants) that isn't inside
+    any detected function or class declaration.
+
+    Each declaration is taken to run from its first line to the line before
+    the next declaration, and the last one to EOF, so in practice only the
+    lines above the first declaration are left over.
+
+    Returns None if no module-level text remains after stripping whitespace.
+    The returned chunk always reports lines 1..total_lines.
+    """
+    lines = source.splitlines(keepends=True)
+    total_lines = len(lines)
+
+    # Start line (1-based) of every declaration; the set drops duplicates when
+    # more than one pattern matches on the same line.
+    declarations = sorted(
+        {
+            source[: match.start()].count("\n") + 1
+            for pattern in PATTERNS
+            for match in pattern.finditer(source)
+        }
+    )
+
+    # Build declaration ranges: each runs up to the next declaration or EOF
+    occupied_lines = set()
+    for i, start in enumerate(declarations):
+        if i + 1 < len(declarations):
+            end = declarations[i + 1] - 1
+        else:
+            end = total_lines
+        occupied_lines.update(range(start, end + 1))
+
+    # Collect non-occupied lines
+    top_level = [
+        line for ln, line in enumerate(lines, start=1) if ln not in occupied_lines
+    ]
+
+    text = "".join(top_level).strip()
+    if not text:
+        return None
+
+    return Chunk(
+        file_path=path,
+        start_line=1,
+        end_line=total_lines,
+        text=f"# [module-level] {path.name}\n{text}",
+    )

From 1d25325042742bcdc70130f4d3029d6ceffd27ac Mon Sep 17 00:00:00 2001
From: devangpratap <115096812+devangpratap@users.noreply.github.com>
Date: Sat, 9 May 2026 00:38:17 -0400
Subject: [PATCH 6/6] Add --stats flag to show chunk breakdown by file type

---
 main.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/main.py b/main.py
index 237ec49..269eb44 100644
--- a/main.py
+++ b/main.py
@@ -30,6 +30,11 @@ def main():
         default=10,
         help="Number of results to show (default: 10)",
     )
+    parser.add_argument(
+        "--stats",
+        action="store_true",
+        help="Show indexing statistics (chunk counts by file type and method)",
+    )
     args = parser.parse_args()
 
     root = Path(args.dir).resolve()
@@ -46,6 +51,10 @@ def main():
         print("No files found to index.")
         return
 
+    if args.stats:
+        print_stats(chunks)
+        return
+
     # If no query given, enter interactive mode
     if args.query is None:
         print("\nNo query provided. Enter a query (or 'quit' to exit):\n")
@@ -65,6 +74,30 @@ def main():
         run_search(args.query, chunks, args.results)
 
 
+def print_stats(chunks):
+    """Print a table of chunk counts per file extension: total, AST, line-based."""
+    from collections import Counter
+
+    totals, tagged, untagged = Counter(), Counter(), Counter()
+    for chunk in chunks:
+        suffix = chunk.file_path.suffix
+        totals[suffix] += 1
+        # Chunks whose text opens with the "# [" tag go in the AST column
+        bucket = tagged if chunk.text.startswith("# [") else untagged
+        bucket[suffix] += 1
+
+    rule = "-" * 40
+    print(f"\n{'Extension':<12} {'Total':<8} {'AST':<8} {'Line-based':<10}")
+    print(rule)
+    for suffix in sorted(totals):
+        n_all, n_ast, n_line = totals[suffix], tagged[suffix], untagged[suffix]
+        print(f"{suffix:<12} {n_all:<8} {n_ast:<8} {n_line:<10}")
+    print(rule)
+    # Footer row: column sums across all extensions
+    n_all, n_ast, n_line = (sum(c.values()) for c in (totals, tagged, untagged))
+    print(f"{'Total':<12} {n_all:<8} {n_ast:<8} {n_line:<10}")
+
+
 def run_search(query: str, chunks, top_k: int):
     """Embed chunks, run semantic search, and display results."""
     print(f"\nEmbedding {len(chunks)} chunks...")