Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ A local semantic search tool for your codebase. Instead of matching exact string

## How it works

1. **Indexer** scans your project files and splits them into chunks
2. **Embedding** generates vector embeddings for each chunk using Ollama
3. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches
1. **Indexer** scans your project files and splits them into semantically meaningful chunks
2. **AST Parser** (for Python files) extracts functions, classes, and methods as individual chunks — no more cutting code in half
3. **Embedding** generates vector embeddings for each chunk using Ollama
4. **Search** compares your natural language query against all chunks using cosine similarity and returns the best matches

For non-Python files (or files with syntax errors), the indexer falls back to line-based chunking.

## Requirements

Expand Down Expand Up @@ -37,8 +40,9 @@ python main.py -d /path/to/project

## Project structure

- `config.py` — settings (model name, allowed file types, chunk size, etc.)
- `indexer.py` — file walking and chunking
- `config.py` — settings (model name, allowed file types, chunk size, AST toggle, etc.)
- `indexer.py` — file walking, AST-aware chunking with line-based fallback
- `ast_parser.py` — Python AST extraction (functions, classes, module-level code)
- `embedding.py` — Ollama embedding generation
- `search.py` — cosine similarity search and result formatting
- `main.py` — CLI entry point
108 changes: 108 additions & 0 deletions ast_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# ast_parser.py
#
# AST-based chunking for Python files.
# Extracts functions, classes, and methods as semantically meaningful chunks
# instead of splitting code into arbitrary line blocks.

import ast
from pathlib import Path
from typing import List, Optional

from indexer import Chunk


def parse_python_file(path: Path, source: str) -> Optional[List[Chunk]]:
    """
    Parse a Python file into AST-based chunks.

    Each function, method, and class (including nested ones, since the tree
    is traversed with ast.walk) becomes its own chunk. Decorated definitions
    start at their first decorator line so decorators stay with the code
    they modify.

    Returns None if parsing fails (syntax error) or nothing was extracted.
    """
    try:
        tree = ast.parse(source, filename=str(path))
    except SyntaxError:
        return None

    chunks: List[Chunk] = []
    lines = source.splitlines(keepends=True)

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            # node.lineno points at the `def`/`class` keyword, not at any
            # decorators above it; pull the start up to the first decorator
            # so decorator lines are not lost to the module-level chunk.
            start_line = node.lineno
            if node.decorator_list:
                start_line = min(start_line, node.decorator_list[0].lineno)
            end_line = _get_end_line(node, lines)

            # Build chunk text from the source lines (1-based -> 0-based slice).
            chunk_text = "".join(lines[start_line - 1 : end_line])

            # Add context: prepend the symbol name and type for better embeddings.
            node_type = "class" if isinstance(node, ast.ClassDef) else "function"
            context_prefix = f"# [{node_type}] {node.name}\n"

            chunks.append(
                Chunk(
                    file_path=path,
                    start_line=start_line,
                    end_line=end_line,
                    text=context_prefix + chunk_text,
                )
            )

    return chunks if chunks else None


def _get_end_line(node: ast.AST, lines: List[str]) -> int:
"""
Get the last line of an AST node.
Uses end_lineno if available (Python 3.8+), otherwise walks children.
"""
if hasattr(node, "end_lineno") and node.end_lineno is not None:
return node.end_lineno

# Fallback: find the max line number among all child nodes
max_line = node.lineno
for child in ast.walk(node):
if hasattr(child, "lineno") and child.lineno:
max_line = max(max_line, child.lineno)

return max_line


def extract_top_level_code(path: Path, source: str) -> Optional[Chunk]:
    """
    Extract module-level code that isn't inside any function or class.
    This includes imports, constants, and top-level statements.

    Returns None when the file fails to parse or contains no such code.
    """
    try:
        tree = ast.parse(source, filename=str(path))
    except SyntaxError:
        return None

    lines = source.splitlines(keepends=True)
    total_lines = len(lines)

    # Mark every line occupied by a top-level function/class, including its
    # decorators — node.lineno points at the `def`/`class` keyword, so
    # without this the decorator lines would duplicate into this chunk.
    occupied = set()
    for node in ast.iter_child_nodes(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            start = node.lineno
            if node.decorator_list:
                start = min(start, node.decorator_list[0].lineno)
            occupied.update(range(start, _get_end_line(node, lines) + 1))

    # Everything not occupied is module-level code.
    top_level_lines = [
        lines[i - 1] for i in range(1, total_lines + 1) if i not in occupied
    ]

    text = "".join(top_level_lines).strip()

    if not text:
        return None

    return Chunk(
        file_path=path,
        start_line=1,
        end_line=total_lines,
        text=f"# [module-level] {path.name}\n{text}",
    )
16 changes: 15 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,23 @@
# Maximum file size (in bytes) to read.
MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB

# How many lines to group into one chunk (used as fallback when AST parsing fails).
CHUNK_LINE_COUNT = 40

# ---- AST chunking settings ----

# Enable AST-based chunking for supported file types.
# When enabled, functions/classes are extracted as individual chunks.
# Falls back to line-based chunking if parsing fails.
AST_ENABLED = True

# File extensions that support AST-based parsing.
# NOTE(review): ".js"/".ts" rely on a js_parser module that is not part of
# this change set — confirm it exists before shipping with these enabled.
AST_EXTENSIONS = {
    ".py",
    ".js",
    ".ts",
}

# How many top chunks to keep before sending anything to the LLM.
CANDIDATE_CHUNK_LIMIT = 20

Expand Down
65 changes: 58 additions & 7 deletions indexer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# indexer.py
#
# Walk the filesystem, filter useful files, and split them into text chunks.
# No AI here — this is just smart file parsing.
# Uses AST-based parsing for supported languages, falls back to line-based chunking.

import os
from pathlib import Path
Expand All @@ -14,6 +14,8 @@
IGNORE_DIRS,
MAX_FILE_SIZE_BYTES,
CHUNK_LINE_COUNT,
AST_ENABLED,
AST_EXTENSIONS,
)


Expand Down Expand Up @@ -50,29 +52,78 @@ def iter_files(root: Path | None = None):

def read_file_chunks(path: Path) -> List[Chunk]:
    """
    Read a file and split into chunks.

    Uses AST-based parsing for supported languages when AST_ENABLED is set,
    falling back to line-based chunking otherwise (or when AST parsing
    yields nothing). Returns an empty list for unreadable, oversized, or
    blank files.
    """
    chunks: List[Chunk] = []

    try:
        # Skip massive files (logs, binaries, etc.).
        if path.stat().st_size > MAX_FILE_SIZE_BYTES:
            return chunks

        with path.open("r", encoding="utf-8", errors="ignore") as f:
            source = f.read()
    except Exception:
        # Best-effort indexing: unreadable files are simply skipped.
        return chunks

    if not source.strip():
        return chunks

    # Try AST-based chunking for supported file types.
    if AST_ENABLED and path.suffix in AST_EXTENSIONS:
        ast_chunks = _try_ast_chunking(path, source)
        if ast_chunks:
            return ast_chunks

    # Fallback: line-based chunking.
    return _line_based_chunking(path, source)


def _try_ast_chunking(path: Path, source: str) -> List[Chunk]:
    """
    Attempt AST-based chunking for *path*.

    Returns an empty list on any failure — unsupported extension, missing
    parser module, or nothing extracted — so the caller can fall back to
    line-based chunking instead of crashing the whole index run.
    """
    if path.suffix == ".py":
        from ast_parser import parse_python_file, extract_top_level_code

        chunks: List[Chunk] = []

        ast_chunks = parse_python_file(path, source)
        if ast_chunks:
            chunks.extend(ast_chunks)

        top_level = extract_top_level_code(path, source)
        if top_level:
            chunks.append(top_level)

        return chunks

    if path.suffix in (".js", ".ts"):
        try:
            from js_parser import parse_js_file, extract_js_top_level
        except ImportError:
            # NOTE(review): js_parser is not part of this change set; without
            # this guard, indexing any .js/.ts file raises ImportError instead
            # of falling back to line-based chunking. Confirm when it lands.
            return []

        chunks = []

        js_chunks = parse_js_file(path, source)
        if js_chunks:
            chunks.extend(js_chunks)

        top_level = extract_js_top_level(path, source)
        if top_level:
            chunks.append(top_level)

        return chunks

    return []


def _line_based_chunking(path: Path, source: str) -> List[Chunk]:
"""Original line-based chunking as fallback."""
chunks: List[Chunk] = []
lines = source.splitlines(keepends=True)
total = len(lines)
i = 0

while i < total:
start = i + 1 # 1-based line numbering
start = i + 1
end_index = min(i + CHUNK_LINE_COUNT, total)
end = end_index

Expand Down
Loading