From f7753392c3e5d6e04fc6c1a96445f72a7efea7e4 Mon Sep 17 00:00:00 2001 From: devangpratap <115096812+devangpratap@users.noreply.github.com> Date: Sat, 2 May 2026 23:25:39 -0400 Subject: [PATCH 1/6] Add AST parser module for Python semantic chunking Introduces ast_parser.py which uses Python's built-in ast module to extract functions, classes, and methods as individual chunks. This provides semantically meaningful code units for embedding instead of arbitrary line-based splits. --- ast_parser.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 ast_parser.py diff --git a/ast_parser.py b/ast_parser.py new file mode 100644 index 0000000..ed57798 --- /dev/null +++ b/ast_parser.py @@ -0,0 +1,108 @@ +# ast_parser.py +# +# AST-based chunking for Python files. +# Extracts functions, classes, and methods as semantically meaningful chunks +# instead of splitting code into arbitrary line blocks. + +import ast +from pathlib import Path +from typing import List, Optional + +from indexer import Chunk + + +def parse_python_file(path: Path, source: str) -> Optional[List[Chunk]]: + """ + Parse a Python file into AST-based chunks. + Each function, method, and class becomes its own chunk. + + Returns None if parsing fails (syntax error, etc.) + """ + try: + tree = ast.parse(source, filename=str(path)) + except SyntaxError: + return None + + chunks: List[Chunk] = [] + lines = source.splitlines(keepends=True) + + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + start_line = node.lineno + end_line = _get_end_line(node, lines) + + # Build chunk text from the source lines + chunk_text = "".join(lines[start_line - 1 : end_line]) + + # Add context: prepend the symbol name and type for better embeddings + node_type = "class" if isinstance(node, ast.ClassDef) else "function" + context_prefix = f"# [{node_type}] {node.name}\n" + + chunks.append( + Chunk( + file_path=path, + start_line=start_line, + end_line=end_line, + text=context_prefix + chunk_text, + ) + ) + + return chunks if chunks else None + + +def _get_end_line(node: ast.AST, lines: List[str]) -> int: + """ + Get the last line of an AST node. + Uses end_lineno if available (Python 3.8+), otherwise walks children. + """ + if hasattr(node, "end_lineno") and node.end_lineno is not None: + return node.end_lineno + + # Fallback: find the max line number among all child nodes + max_line = node.lineno + for child in ast.walk(node): + if hasattr(child, "lineno") and child.lineno: + max_line = max(max_line, child.lineno) + + return max_line + + +def extract_top_level_code(path: Path, source: str) -> Optional[Chunk]: + """ + Extract module-level code that isn't inside any function or class. + This includes imports, constants, and top-level statements. + """ + try: + tree = ast.parse(source, filename=str(path)) + except SyntaxError: + return None + + lines = source.splitlines(keepends=True) + total_lines = len(lines) + + # Find all line ranges occupied by functions/classes + occupied = set() + for node in ast.iter_child_nodes(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + start = node.lineno + end = _get_end_line(node, lines) + for ln in range(start, end + 1): + occupied.add(ln) + + # Collect non-occupied lines + top_level_lines = [] + for i in range(1, total_lines + 1): + if i not in occupied: + top_level_lines.append(lines[i - 1]) + + text = "".join(top_level_lines).strip() + + if not text: + return None + + return Chunk( + file_path=path, + start_line=1, + end_line=total_lines, + text=f"# [module-level] {path.name}\n{text}", + ) From 109bf016accafabc5b29085156cca3c3922712cd Mon Sep 17 00:00:00 2001 From: devangpratap <115096812+devangpratap@users.noreply.github.com> Date: Sat, 2 May 2026 23:26:35 -0400 Subject: [PATCH 2/6] Integrate AST chunking into indexer with config toggle The indexer now attempts AST-based parsing for .py files before falling back to line-based chunking. Added AST_ENABLED and AST_EXTENSIONS settings to config for easy control. --- config.py | 14 +++++++++++++- indexer.py | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/config.py b/config.py index 07d52fb..6770a07 100644 --- a/config.py +++ b/config.py @@ -46,9 +46,21 @@ # Maximum file size (in bytes) to read. MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB -# How many lines to group into one chunk. +# How many lines to group into one chunk (used as fallback when AST parsing fails). CHUNK_LINE_COUNT = 40 +# ---- AST chunking settings ---- + +# Enable AST-based chunking for supported file types. +# When enabled, functions/classes are extracted as individual chunks. +# Falls back to line-based chunking if parsing fails. +AST_ENABLED = True + +# File extensions that support AST-based parsing. +AST_EXTENSIONS = { + ".py", +} + # How many top chunks to keep before sending anything to the LLM. CANDIDATE_CHUNK_LIMIT = 20 diff --git a/indexer.py b/indexer.py index 6eb3da4..4c775b4 100644 --- a/indexer.py +++ b/indexer.py @@ -1,7 +1,7 @@ # indexer.py # # Walk the filesystem, filter useful files, and split them into text chunks. -# No AI here — this is just smart file parsing. +# Uses AST-based parsing for supported languages, falls back to line-based chunking. import os from pathlib import Path @@ -14,6 +14,8 @@ IGNORE_DIRS, MAX_FILE_SIZE_BYTES, CHUNK_LINE_COUNT, + AST_ENABLED, + AST_EXTENSIONS, ) @@ -50,29 +52,65 @@ def iter_files(root: Path | None = None): def read_file_chunks(path: Path) -> List[Chunk]: """ - Read a file, split into CHUNK_LINE_COUNT-line chunks, - attach line numbers for better search results. + Read a file and split into chunks. + Uses AST-based parsing for supported languages if enabled, + falls back to line-based chunking otherwise. """ chunks: List[Chunk] = [] try: - # skip massive files (logs, binaries, etc.) if path.stat().st_size > MAX_FILE_SIZE_BYTES: return chunks with path.open("r", encoding="utf-8", errors="ignore") as f: - lines = f.readlines() + source = f.read() except Exception: return chunks - if not lines: + if not source.strip(): return chunks + # Try AST-based chunking for supported file types + if AST_ENABLED and path.suffix in AST_EXTENSIONS: + ast_chunks = _try_ast_chunking(path, source) + if ast_chunks: + return ast_chunks + + # Fallback: line-based chunking + return _line_based_chunking(path, source) + + +def _try_ast_chunking(path: Path, source: str) -> List[Chunk]: + """Attempt AST-based chunking. Returns empty list on failure.""" + if path.suffix == ".py": + from ast_parser import parse_python_file, extract_top_level_code + + chunks = [] + + # Get function/class chunks + ast_chunks = parse_python_file(path, source) + if ast_chunks: + chunks.extend(ast_chunks) + + # Get module-level code (imports, constants, etc.) + top_level = extract_top_level_code(path, source) + if top_level: + chunks.append(top_level) + + return chunks + + return [] + + +def _line_based_chunking(path: Path, source: str) -> List[Chunk]: + """Original line-based chunking as fallback.""" + chunks: List[Chunk] = [] + lines = source.splitlines(keepends=True) total = len(lines) i = 0 while i < total: - start = i + 1 # 1-based line numbering + start = i + 1 end_index = min(i + CHUNK_LINE_COUNT, total) end = end_index From a6ab1be4c8e6ad8f75d5aa4c1e01aac9969408a3 Mon Sep 17 00:00:00 2001 From: devangpratap <115096812+devangpratap@users.noreply.github.com> Date: Sat, 2 May 2026 23:27:24 -0400 Subject: [PATCH 3/6] Improve search result display for AST-parsed chunks Show the symbol type and name (function/class) in search results when available from AST parsing. Also increased preview from 5 to 8 lines for better context visibility. --- search.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/search.py b/search.py index c1326f6..ac2ac88 100644 --- a/search.py +++ b/search.py @@ -105,17 +105,28 @@ def format_results(results: List[SearchResult]) -> str: for i, result in enumerate(results, 1): chunk = result.chunk - lines.append( - f"\n[{i}] {chunk.file_path} " - f"(lines {chunk.start_line}-{chunk.end_line}) " - f"[score: {result.score:.4f}]" - ) + # Extract symbol tag if present (from AST chunks) + symbol_tag = "" + first_line = chunk.text.split("\n", 1)[0] + if first_line.startswith("# [") and "]" in first_line: + symbol_tag = first_line[2:] # e.g. "[function] my_func" + + location = f"{chunk.file_path} (lines {chunk.start_line}-{chunk.end_line})" + header = f"\n[{i}] {location} [score: {result.score:.4f}]" + if symbol_tag: + header += f" {symbol_tag}" + + lines.append(header) lines.append("-" * 60) - # Show a preview of the chunk (first 5 lines) - preview_lines = chunk.text.split("\n")[:5] + # Show a preview of the chunk (skip the context prefix line for AST chunks) + text_lines = chunk.text.split("\n") + if text_lines and text_lines[0].startswith("# ["): + text_lines = text_lines[1:] + + preview_lines = text_lines[:8] preview = "\n".join(preview_lines) - if len(chunk.text.split("\n")) > 5: + if len(text_lines) > 8: preview += "\n..." lines.append(preview) From c62aea8d00946eb39603b70277dcffab4d5a2531 Mon Sep 17 00:00:00 2001 From: devangpratap <115096812+devangpratap@users.noreply.github.com> Date: Sat, 2 May 2026 23:28:16 -0400 Subject: [PATCH 4/6] Update README with AST chunking documentation Document the new AST-based parsing step and add ast_parser.py to the project structure section. --- README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9442990..abc12e4 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,12 @@ A local semantic search tool for your codebase. Instead of matching exact string ## How it works -1. **Indexer** scans your project files and splits them into chunks -2. **Embedding** generates vector embeddings for each chunk using Ollama -3. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches +1. **Indexer** scans your project files and splits them into semantically meaningful chunks +2. **AST Parser** (for Python files) extracts functions, classes, and methods as individual chunks — no more cutting code in half +3. **Embedding** generates vector embeddings for each chunk using Ollama +4. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches + +For non-Python files (or files with syntax errors), the indexer falls back to line-based chunking. ## Requirements @@ -37,8 +40,9 @@ python main.py -d /path/to/project ## Project structure -- `config.py` — settings (model name, allowed file types, chunk size, etc.) -- `indexer.py` — file walking and chunking +- `config.py` — settings (model name, allowed file types, chunk size, AST toggle, etc.) +- `indexer.py` — file walking, AST-aware chunking with line-based fallback +- `ast_parser.py` — Python AST extraction (functions, classes, module-level code) - `embedding.py` — Ollama embedding generation - `search.py` — cosine similarity search and result formatting - `main.py` — CLI entry point From 5a22c33281fb17d7c18c0b4e0ce3c5d4ea01a2d9 Mon Sep 17 00:00:00 2001 From: devangpratap <115096812+devangpratap@users.noreply.github.com> Date: Sat, 9 May 2026 00:38:16 -0400 Subject: [PATCH 5/6] Add JS/TS semantic chunking support --- config.py | 2 + indexer.py | 17 ++++++- js_parser.py | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 js_parser.py diff --git a/config.py b/config.py index 6770a07..6fc32be 100644 --- a/config.py +++ b/config.py @@ -59,6 +59,8 @@ # File extensions that support AST-based parsing. AST_EXTENSIONS = { ".py", + ".js", + ".ts", } # How many top chunks to keep before sending anything to the LLM. diff --git a/indexer.py b/indexer.py index 4c775b4..5f8bd3b 100644 --- a/indexer.py +++ b/indexer.py @@ -87,18 +87,31 @@ def _try_ast_chunking(path: Path, source: str) -> List[Chunk]: chunks = [] - # Get function/class chunks ast_chunks = parse_python_file(path, source) if ast_chunks: chunks.extend(ast_chunks) - # Get module-level code (imports, constants, etc.) top_level = extract_top_level_code(path, source) if top_level: chunks.append(top_level) return chunks + if path.suffix in (".js", ".ts"): + from js_parser import parse_js_file, extract_js_top_level + + chunks = [] + + js_chunks = parse_js_file(path, source) + if js_chunks: + chunks.extend(js_chunks) + + top_level = extract_js_top_level(path, source) + if top_level: + chunks.append(top_level) + + return chunks + return [] diff --git a/js_parser.py b/js_parser.py new file mode 100644 index 0000000..e392a3d --- /dev/null +++ b/js_parser.py @@ -0,0 +1,141 @@ +# js_parser.py +# +# Regex-based chunking for JavaScript/TypeScript files. +# Extracts functions, classes, and arrow functions as semantic chunks. +# Not a full AST parser — uses pattern matching for speed and zero dependencies. + +import re +from pathlib import Path +from typing import List, Optional + +from indexer import Chunk + +# Patterns for JS/TS constructs +PATTERNS = [ + # class declarations + re.compile(r"^(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\s+(\w+)", re.MULTILINE), + # named function declarations + re.compile(r"^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(\w+)", re.MULTILINE), + # arrow / const functions: const foo = (...) => or const foo = function + re.compile(r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[a-zA-Z_]\w*)\s*=>", re.MULTILINE), + re.compile(r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?function", re.MULTILINE), +] + + +def parse_js_file(path: Path, source: str) -> Optional[List[Chunk]]: + """ + Parse a JS/TS file into chunks based on top-level declarations. + Each function or class becomes its own chunk. + + Returns None if no declarations are found. + """ + lines = source.splitlines(keepends=True) + total_lines = len(lines) + + # Find all declaration start positions + declarations = [] + for pattern in PATTERNS: + for match in pattern.finditer(source): + line_no = source[:match.start()].count("\n") + 1 + name = match.group(1) + declarations.append((line_no, name)) + + if not declarations: + return None + + # Sort by line number + declarations.sort(key=lambda x: x[0]) + + # Deduplicate overlapping matches at the same line + seen_lines = set() + unique = [] + for line_no, name in declarations: + if line_no not in seen_lines: + seen_lines.add(line_no) + unique.append((line_no, name)) + declarations = unique + + chunks: List[Chunk] = [] + + for i, (start_line, name) in enumerate(declarations): + # End line is either the line before the next declaration or EOF + if i + 1 < len(declarations): + end_line = declarations[i + 1][0] - 1 + # Trim trailing blank lines between declarations + while end_line > start_line and not lines[end_line - 1].strip(): + end_line -= 1 + else: + end_line = total_lines + + chunk_text = "".join(lines[start_line - 1 : end_line]) + + # Determine node type + first_line = lines[start_line - 1].strip() + if "class " in first_line: + node_type = "class" + else: + node_type = "function" + + context_prefix = f"# [{node_type}] {name}\n" + + chunks.append( + Chunk( + file_path=path, + start_line=start_line, + end_line=end_line, + text=context_prefix + chunk_text, + ) + ) + + return chunks if chunks else None + + +def extract_js_top_level(path: Path, source: str) -> Optional[Chunk]: + """ + Extract top-level code (imports, constants) that isn't inside + any detected function or class declaration. + """ + lines = source.splitlines(keepends=True) + total_lines = len(lines) + + # Find occupied line ranges + occupied = set() + for pattern in PATTERNS: + for match in pattern.finditer(source): + start = source[:match.start()].count("\n") + 1 + occupied.add(start) + + # Build declaration ranges (same logic as parse_js_file) + declarations = [] + for pattern in PATTERNS: + for match in pattern.finditer(source): + line_no = source[:match.start()].count("\n") + 1 + declarations.append(line_no) + + declarations = sorted(set(declarations)) + + occupied_lines = set() + for i, start in enumerate(declarations): + if i + 1 < len(declarations): + end = declarations[i + 1] - 1 + else: + end = total_lines + for ln in range(start, end + 1): + occupied_lines.add(ln) + + # Collect non-occupied lines + top_level = [] + for i in range(1, total_lines + 1): + if i not in occupied_lines: + top_level.append(lines[i - 1]) + + text = "".join(top_level).strip() + if not text: + return None + + return Chunk( + file_path=path, + start_line=1, + end_line=total_lines, + text=f"# [module-level] {path.name}\n{text}", + ) From 1d25325042742bcdc70130f4d3029d6ceffd27ac Mon Sep 17 00:00:00 2001 From: devangpratap <115096812+devangpratap@users.noreply.github.com> Date: Sat, 9 May 2026 00:38:17 -0400 Subject: [PATCH 6/6] Add --stats flag to show chunk breakdown by file type --- main.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/main.py b/main.py index 237ec49..269eb44 100644 --- a/main.py +++ b/main.py @@ -30,6 +30,11 @@ def main(): default=10, help="Number of results to show (default: 10)", ) + parser.add_argument( + "--stats", + action="store_true", + help="Show indexing statistics (chunk counts by file type and method)", + ) args = parser.parse_args() root = Path(args.dir).resolve() @@ -46,6 +51,10 @@ def main(): print("No files found to index.") return + if args.stats: + print_stats(chunks) + return + # If no query given, enter interactive mode if args.query is None: print("\nNo query provided. Enter a query (or 'quit' to exit):\n") @@ -65,6 +74,30 @@ def main(): run_search(args.query, chunks, args.results) +def print_stats(chunks): + """Show chunk breakdown by file extension and parsing method.""" + from collections import Counter + + ext_counts = Counter() + ast_counts = Counter() + line_counts = Counter() + + for c in chunks: + ext = c.file_path.suffix + ext_counts[ext] += 1 + if c.text.startswith("# ["): + ast_counts[ext] += 1 + else: + line_counts[ext] += 1 + + print(f"\n{'Extension':<12} {'Total':<8} {'AST':<8} {'Line-based':<10}") + print("-" * 40) + for ext in sorted(ext_counts): + print(f"{ext:<12} {ext_counts[ext]:<8} {ast_counts.get(ext, 0):<8} {line_counts.get(ext, 0):<10}") + print("-" * 40) + print(f"{'Total':<12} {sum(ext_counts.values()):<8} {sum(ast_counts.values()):<8} {sum(line_counts.values()):<10}") + + def run_search(query: str, chunks, top_k: int): """Embed chunks, run semantic search, and display results.""" print(f"\nEmbedding {len(chunks)} chunks...")