Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ A local semantic search tool for your codebase. Instead of matching exact string

## How it works

1. **Indexer** scans your project files and splits them into chunks
2. **Embedding** generates vector embeddings for each chunk using Ollama
3. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches
1. **Indexer** scans your project files and splits them into semantically meaningful chunks
2. **AST Parser** (for Python files) extracts functions, classes, and methods as individual chunks — no more cutting code in half
3. **Embedding** generates vector embeddings for each chunk using Ollama
4. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches

For non-Python files (or files with syntax errors), the indexer falls back to line-based chunking.

## Requirements

Expand Down Expand Up @@ -37,8 +40,9 @@ python main.py -d /path/to/project

## Project structure

- `config.py` — settings (model name, allowed file types, chunk size, etc.)
- `indexer.py` — file walking and chunking
- `config.py` — settings (model name, allowed file types, chunk size, AST toggle, etc.)
- `indexer.py` — file walking, AST-aware chunking with line-based fallback
- `ast_parser.py` — Python AST extraction (functions, classes, module-level code)
- `embedding.py` — Ollama embedding generation
- `search.py` — cosine similarity search and result formatting
- `main.py` — CLI entry point
108 changes: 108 additions & 0 deletions ast_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# ast_parser.py
#
# AST-based chunking for Python files.
# Extracts functions, classes, and methods as semantically meaningful chunks
# instead of splitting code into arbitrary line blocks.

import ast
from pathlib import Path
from typing import List, Optional

from indexer import Chunk


def parse_python_file(path: Path, source: str) -> Optional[List[Chunk]]:
    """
    Parse a Python file into AST-based chunks.

    Each function, method, and class (including nested ones, since the tree
    is traversed with ast.walk) becomes its own chunk. Decorated definitions
    start at their first decorator line so decorators stay with the code
    they modify.

    Returns None if parsing fails (syntax error) or nothing was extracted.
    """
    try:
        tree = ast.parse(source, filename=str(path))
    except SyntaxError:
        return None

    chunks: List[Chunk] = []
    lines = source.splitlines(keepends=True)

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            # node.lineno points at the `def`/`class` keyword, not at any
            # decorators above it; pull the start up to the first decorator
            # so decorator lines are not lost to the module-level chunk.
            start_line = node.lineno
            if node.decorator_list:
                start_line = min(start_line, node.decorator_list[0].lineno)
            end_line = _get_end_line(node, lines)

            # Build chunk text from the source lines (1-based -> 0-based slice).
            chunk_text = "".join(lines[start_line - 1 : end_line])

            # Add context: prepend the symbol name and type for better embeddings.
            node_type = "class" if isinstance(node, ast.ClassDef) else "function"
            context_prefix = f"# [{node_type}] {node.name}\n"

            chunks.append(
                Chunk(
                    file_path=path,
                    start_line=start_line,
                    end_line=end_line,
                    text=context_prefix + chunk_text,
                )
            )

    return chunks if chunks else None


def _get_end_line(node: ast.AST, lines: List[str]) -> int:
"""
Get the last line of an AST node.
Uses end_lineno if available (Python 3.8+), otherwise walks children.
"""
if hasattr(node, "end_lineno") and node.end_lineno is not None:
return node.end_lineno

# Fallback: find the max line number among all child nodes
max_line = node.lineno
for child in ast.walk(node):
if hasattr(child, "lineno") and child.lineno:
max_line = max(max_line, child.lineno)

return max_line


def extract_top_level_code(path: Path, source: str) -> Optional[Chunk]:
    """
    Extract module-level code that isn't inside any function or class.
    This includes imports, constants, and top-level statements.

    Returns None when the file fails to parse or contains no such code.
    """
    try:
        tree = ast.parse(source, filename=str(path))
    except SyntaxError:
        return None

    lines = source.splitlines(keepends=True)
    total_lines = len(lines)

    # Mark every line occupied by a top-level function/class, including its
    # decorators — node.lineno points at the `def`/`class` keyword, so
    # without this the decorator lines would duplicate into this chunk.
    occupied = set()
    for node in ast.iter_child_nodes(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            start = node.lineno
            if node.decorator_list:
                start = min(start, node.decorator_list[0].lineno)
            occupied.update(range(start, _get_end_line(node, lines) + 1))

    # Everything not occupied is module-level code.
    top_level_lines = [
        lines[i - 1] for i in range(1, total_lines + 1) if i not in occupied
    ]

    text = "".join(top_level_lines).strip()

    if not text:
        return None

    return Chunk(
        file_path=path,
        start_line=1,
        end_line=total_lines,
        text=f"# [module-level] {path.name}\n{text}",
    )
16 changes: 15 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,23 @@
# Maximum file size (in bytes) to read.
MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB

# How many lines to group into one chunk (used as fallback when AST parsing fails).
CHUNK_LINE_COUNT = 40

# ---- AST chunking settings ----

# Enable AST-based chunking for supported file types.
# When enabled, functions/classes are extracted as individual chunks.
# Falls back to line-based chunking if parsing fails.
AST_ENABLED = True

# File extensions that support AST-based parsing.
# NOTE(review): ".js"/".ts" rely on a js_parser module that is not part of
# this change set — confirm it exists before shipping with these enabled.
AST_EXTENSIONS = {
    ".py",
    ".js",
    ".ts",
}

# How many top chunks to keep before sending anything to the LLM.
CANDIDATE_CHUNK_LIMIT = 20

Expand Down
65 changes: 58 additions & 7 deletions indexer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# indexer.py
#
# Walk the filesystem, filter useful files, and split them into text chunks.
# No AI here — this is just smart file parsing.
# Uses AST-based parsing for supported languages, falls back to line-based chunking.

import os
from pathlib import Path
Expand All @@ -14,6 +14,8 @@
IGNORE_DIRS,
MAX_FILE_SIZE_BYTES,
CHUNK_LINE_COUNT,
AST_ENABLED,
AST_EXTENSIONS,
)


Expand Down Expand Up @@ -50,29 +52,78 @@ def iter_files(root: Path | None = None):

def read_file_chunks(path: Path) -> List[Chunk]:
    """
    Read a file and split into chunks.

    Uses AST-based parsing for supported languages when AST_ENABLED is set,
    falling back to line-based chunking otherwise (or when AST parsing
    yields nothing). Returns an empty list for unreadable, oversized, or
    blank files.
    """
    chunks: List[Chunk] = []

    try:
        # Skip massive files (logs, binaries, etc.).
        if path.stat().st_size > MAX_FILE_SIZE_BYTES:
            return chunks

        with path.open("r", encoding="utf-8", errors="ignore") as f:
            source = f.read()
    except Exception:
        # Best-effort indexing: unreadable files are simply skipped.
        return chunks

    if not source.strip():
        return chunks

    # Try AST-based chunking for supported file types.
    if AST_ENABLED and path.suffix in AST_EXTENSIONS:
        ast_chunks = _try_ast_chunking(path, source)
        if ast_chunks:
            return ast_chunks

    # Fallback: line-based chunking.
    return _line_based_chunking(path, source)


def _try_ast_chunking(path: Path, source: str) -> List[Chunk]:
    """
    Attempt AST-based chunking for *path*.

    Returns an empty list on any failure — unsupported extension, missing
    parser module, or nothing extracted — so the caller can fall back to
    line-based chunking instead of crashing the whole index run.
    """
    if path.suffix == ".py":
        from ast_parser import parse_python_file, extract_top_level_code

        chunks: List[Chunk] = []

        ast_chunks = parse_python_file(path, source)
        if ast_chunks:
            chunks.extend(ast_chunks)

        top_level = extract_top_level_code(path, source)
        if top_level:
            chunks.append(top_level)

        return chunks

    if path.suffix in (".js", ".ts"):
        try:
            from js_parser import parse_js_file, extract_js_top_level
        except ImportError:
            # NOTE(review): js_parser is not part of this change set; without
            # this guard, indexing any .js/.ts file raises ImportError instead
            # of falling back to line-based chunking. Confirm when it lands.
            return []

        chunks = []

        js_chunks = parse_js_file(path, source)
        if js_chunks:
            chunks.extend(js_chunks)

        top_level = extract_js_top_level(path, source)
        if top_level:
            chunks.append(top_level)

        return chunks

    return []


def _line_based_chunking(path: Path, source: str) -> List[Chunk]:
"""Original line-based chunking as fallback."""
chunks: List[Chunk] = []
lines = source.splitlines(keepends=True)
total = len(lines)
i = 0

while i < total:
start = i + 1 # 1-based line numbering
start = i + 1
end_index = min(i + CHUNK_LINE_COUNT, total)
end = end_index

Expand Down
Loading