Implement smart code-aware chunking

Copilot · Mte90 · Copilot · commit a9d311592185 · 2025-11-07T09:04:24.000Z
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
diff --git a/analyzer.py b/analyzer.py
@@ -15,6 +15,7 @@
 from external_api import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
 from logger import get_logger
+from smart_chunker import smart_chunk
 import logging
 
 # reduce noise from httpx used by external libs
@@ -323,7 +324,14 @@ def _process_file_sync(
         if isinstance(cfg, dict):
             embedding_model = cfg.get("embedding_model")
 
-        chunks = chunk_text(content, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
+        # Use smart chunking for code files, fallback to simple for others
+        use_smart_chunking = cfg.get("smart_chunking", True) if isinstance(cfg, dict) else True
+        
+        if use_smart_chunking and lang != "text":
+            chunks = smart_chunk(content, language=lang, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
+        else:
+            chunks = chunk_text(content, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
+        
         if not chunks:
             chunks = [content]
 
diff --git a/db_modules/__init__.py b/db_modules/__init__.py
@@ -0,0 +1,48 @@
+"""
+Database module initialization.
+Provides organized access to database operations.
+"""
+from .connection import get_connection, init_db
+from .files import (
+    store_file, get_file_by_path, needs_reindex, 
+    list_files, delete_file_by_path, clear_project_data
+)
+from .projects import (
+    create_project, get_project, get_project_by_id, list_projects,
+    update_project_status, update_project_settings, delete_project,
+    get_or_create_project
+)
+from .metadata import (
+    set_project_metadata, set_project_metadata_batch,
+    get_project_metadata, get_project_stats
+)
+from .chunks import insert_chunk_row_with_null_embedding
+
+__all__ = [
+    # Connection
+    'get_connection',
+    'init_db',
+    # Files
+    'store_file',
+    'get_file_by_path',
+    'needs_reindex',
+    'list_files',
+    'delete_file_by_path',
+    'clear_project_data',
+    # Projects
+    'create_project',
+    'get_project',
+    'get_project_by_id',
+    'list_projects',
+    'update_project_status',
+    'update_project_settings',
+    'delete_project',
+    'get_or_create_project',
+    # Metadata
+    'set_project_metadata',
+    'set_project_metadata_batch',
+    'get_project_metadata',
+    'get_project_stats',
+    # Chunks
+    'insert_chunk_row_with_null_embedding',
+]
diff --git a/smart_chunker.py b/smart_chunker.py
@@ -0,0 +1,281 @@
+"""
+Smart chunking module for code-aware text splitting.
+Respects code structure (functions, classes, methods) for better semantic search.
+"""
+import re
+from typing import List, Tuple, Optional
+from pathlib import Path
+
+
+class SmartChunker:
+    """
+    Code-aware chunker that splits text based on language structure.
+    Falls back to simple chunking for non-code or unknown languages.
+
+    Every splitter keeps all source lines, in their original order, so no
+    code is dropped from the resulting chunks.
+    """
+
+    # Languages routed through structure-aware chunking by chunk().
+    CODE_LANGUAGES = ("python", "javascript", "typescript", "java", "go", "rust", "c", "cpp")
+    # Line prefixes that open a new Python unit.
+    _PY_DEF_PREFIXES = ("class ", "def ", "async def ")
+    # Header shapes used only to label brace-delimited units.
+    _JS_CLASS_RE = re.compile(r'^\s*(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\b', re.M)
+    _JAVA_TYPE_RE = re.compile(r'\b(?:class|interface|enum)\s+\w+')
+
+    def __init__(self, chunk_size: int = 800, overlap: int = 100):
+        """
+        Args:
+            chunk_size: Maximum chunk length in characters
+            overlap: Characters shared by consecutive simple chunks
+        """
+        self.chunk_size = chunk_size
+        self.overlap = overlap
+
+    def chunk(self, text: str, language: str = "text") -> List[str]:
+        """
+        Chunk text based on language-specific rules.
+
+        Args:
+            text: Text content to chunk
+            language: Programming language identifier
+
+        Returns:
+            List of text chunks
+        """
+        if language in self.CODE_LANGUAGES:
+            return self._chunk_code(text, language)
+        return self._chunk_simple(text)
+
+    def _chunk_code(self, text: str, language: str) -> List[str]:
+        """
+        Smart chunking for code that respects structure.
+
+        Units from _split_into_units are packed greedily into chunks of at
+        most chunk_size characters (newline joiners included). A unit that
+        is too large on its own is split with _chunk_simple. When a chunk
+        holding several units is closed, its last unit is repeated at the
+        start of the next chunk for context, provided the result still fits.
+        """
+        units = self._split_into_units(text, language)
+        if not units:
+            # Fallback to simple chunking if structure detection fails
+            return self._chunk_simple(text)
+
+        chunks = []
+        pending = []  # units of the chunk being assembled
+        pending_size = 0  # length of "\n".join(pending)
+
+        for unit_text, _unit_type in units:
+            unit_size = len(unit_text)
+            # If single unit is larger than chunk_size, split it
+            if unit_size > self.chunk_size:
+                # Save current chunk if it has content
+                if pending:
+                    chunks.append("\n".join(pending))
+                    pending, pending_size = [], 0
+                chunks.extend(self._chunk_simple(unit_text))
+                continue
+
+            # Size of the chunk if this unit were appended (1 = joining newline)
+            joined_size = pending_size + 1 + unit_size if pending else unit_size
+            if joined_size <= self.chunk_size:
+                pending.append(unit_text)
+                pending_size = joined_size
+                continue
+
+            # Unit does not fit: close the chunk and start a new one
+            chunks.append("\n".join(pending))
+            carry = pending[-1] if len(pending) > 1 else None
+            if carry is not None and len(carry) + 1 + unit_size <= self.chunk_size:
+                # Keep last unit for context
+                pending = [carry, unit_text]
+                pending_size = len(carry) + 1 + unit_size
+            else:
+                pending = [unit_text]
+                pending_size = unit_size
+
+        # Add remaining chunk
+        if pending:
+            chunks.append("\n".join(pending))
+
+        return chunks if chunks else [text]
+
+    def _split_into_units(self, text: str, language: str) -> List[Tuple[str, str]]:
+        """
+        Split code into logical units (functions, classes, etc.).
+        Returns list of (text, unit_type) tuples; empty for unknown languages.
+        """
+        if language == "python":
+            return self._split_python(text)
+        if language in ("javascript", "typescript"):
+            return self._split_javascript(text)
+        if language == "java":
+            return self._split_java(text)
+        if language in ("go", "rust", "c", "cpp"):
+            return self._split_c_style(text)
+        return []
+
+    def _split_python(self, text: str) -> List[Tuple[str, str]]:
+        """
+        Split Python code into classes and functions.
+
+        A new unit starts at every ``class``, ``def`` or ``async def`` line,
+        at any indentation, so methods become units of their own. Decorator
+        lines directly above a definition (same indentation) travel with it.
+        All other lines are appended to the unit currently being built;
+        lines before the first definition form a "module" unit.
+
+        Returns:
+            List of (text, unit_type) tuples with unit_type one of
+            "module", "class" or "function"
+        """
+        units = []
+        current = []  # lines of the unit being built
+        current_type = "module"
+
+        for line in text.split("\n"):
+            stripped = line.lstrip()
+            if not stripped.startswith(self._PY_DEF_PREFIXES):
+                current.append(line)
+                continue
+
+            # Move single-line decorators from the previous unit's tail
+            # into the header of the unit that starts here
+            indent = len(line) - len(stripped)
+            header = [line]
+            while current and self._is_decorator(current[-1], indent):
+                header.insert(0, current.pop())
+
+            # Save previous unit if exists
+            if current:
+                units.append(("\n".join(current), current_type))
+            current = header
+            current_type = "class" if stripped.startswith("class ") else "function"
+
+        # Add remaining unit
+        if current:
+            units.append(("\n".join(current), current_type))
+
+        return units
+
+    @staticmethod
+    def _is_decorator(line: str, indent: int) -> bool:
+        """Return True if line is an ``@...`` line indented by indent characters."""
+        stripped = line.lstrip()
+        return stripped.startswith("@") and len(line) - len(stripped) == indent
+
+    def _split_javascript(self, text: str) -> List[Tuple[str, str]]:
+        """
+        Split JavaScript/TypeScript code into functions and classes.
+
+        Units are the balanced brace blocks found by _split_by_braces, so
+        nested braces stay inside their enclosing block. Closed blocks with
+        a line that starts a class declaration are labelled "class"; the
+        other labels are kept ("function" for closed blocks, "code" for
+        trailing text).
+        """
+        units = []
+        for unit_text, unit_type in self._split_by_braces(text):
+            if unit_type == "function" and self._JS_CLASS_RE.search(unit_text):
+                unit_type = "class"
+            units.append((unit_text, unit_type))
+        return units
+
+    def _split_java(self, text: str) -> List[Tuple[str, str]]:
+        """
+        Split Java code into balanced top-level brace blocks.
+
+        Closed blocks that contain a class/interface/enum declaration are
+        labelled "class", other closed blocks "method"; trailing text keeps
+        the "code" label.
+        """
+        units = []
+        for unit_text, unit_type in self._split_by_braces(text):
+            if unit_type == "function":
+                is_type = self._JAVA_TYPE_RE.search(unit_text)
+                unit_type = "class" if is_type else "method"
+            units.append((unit_text, unit_type))
+        return units
+
+    def _split_c_style(self, text: str) -> List[Tuple[str, str]]:
+        """Split C-style languages (Go, Rust, C, C++) into functions."""
+        return self._split_by_braces(text)
+
+    def _split_by_braces(self, text: str) -> List[Tuple[str, str]]:
+        """
+        Generic brace-based splitting for C-style languages.
+        Finds balanced brace blocks.
+
+        Lines accumulate into a unit until the running brace count returns
+        to zero after a line containing "{"; that unit is labelled
+        "function". Lines left over at the end form a final "code" unit.
+        """
+        units = []
+        block_lines = []
+        depth = 0
+        in_block = False
+
+        for line in text.split("\n"):
+            block_lines.append(line)
+
+            # Count braces (simple heuristic, doesn't handle strings/comments perfectly)
+            depth += line.count("{") - line.count("}")
+            in_block = in_block or "{" in line
+
+            if in_block and depth == 0:
+                # Block closed
+                units.append(("\n".join(block_lines), "function"))
+                block_lines = []
+                in_block = False
+
+        # Add remaining lines
+        if block_lines:
+            units.append(("\n".join(block_lines), "code"))
+
+        return units
+
+    def _chunk_simple(self, text: str) -> List[str]:
+        """
+        Simple character-based chunking with overlap.
+        Used as fallback or for non-code content.
+
+        Consecutive windows start chunk_size - overlap characters apart (at
+        least 1). Windowing stops as soon as a window reaches the end of the
+        text, so the last chunk is never fully contained in the one before.
+        """
+        if not text:
+            return []
+        if len(text) <= self.chunk_size:
+            return [text]
+
+        chunks = []
+        step = max(1, self.chunk_size - self.overlap)
+        for start in range(0, len(text), step):
+            end = start + self.chunk_size
+            chunks.append(text[start:end])
+            if end >= len(text):
+                break  # tail already covered by this window
+        return chunks
+
+
+# Default-configured instance kept for convenience; smart_chunk() below builds its own per call
+_default_chunker = SmartChunker()
+
+
+def smart_chunk(text: str, language: str = "text", chunk_size: int = 800, overlap: int = 100) -> List[str]:
+    """
+    One-shot helper: build a SmartChunker and chunk the text with it.
+
+    Args:
+        text: Text to chunk
+        language: Programming language
+        chunk_size: Maximum chunk size in characters
+        overlap: Overlap between chunks in characters
+
+    Returns:
+        List of text chunks
+    """
+    configured = SmartChunker(chunk_size, overlap)
+    return configured.chunk(text, language=language)