Skip to content

Commit 7921f45

Browse files
Copilot and Mte90 committed
Replace AI components with llama-index, remove include_content parameter, show total files
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent d5056b0 commit 7921f45

File tree

10 files changed

+268
-61
lines changed

10 files changed

+268
-61
lines changed

ai/analyzer.py

Lines changed: 16 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,12 @@
1717
insert_chunk_vector_with_retry as _insert_chunk_vector_with_retry,
1818
get_chunk_text as _get_chunk_text,
1919
)
20-
from .openai import call_coding_api, EmbeddingClient
20+
from .openai import call_coding_api
21+
from .llama_embeddings import OpenAICompatibleEmbedding
22+
from .llama_chunker import chunk_with_llama_index
2123
from llama_index.core import Document
2224
from utils.logger import get_logger
23-
from utils import compute_file_hash, chunk_text, norm, cosine
24-
from .smart_chunker import smart_chunk
25+
from utils import compute_file_hash, norm, cosine
2526
import logging
2627

2728
# reduce noise from httpx used by external libs
@@ -63,8 +64,8 @@
6364

6465
logger = get_logger(__name__)
6566

66-
# Initialize EmbeddingClient for structured logging and retry logic
67-
_embedding_client = EmbeddingClient()
67+
# Initialize llama-index embedding client
68+
_embedding_client = OpenAICompatibleEmbedding()
6869

6970
# Thread-local storage to track execution state inside futures
7071
_thread_state = threading.local()
@@ -85,7 +86,8 @@ def _get_embedding_with_semaphore(semaphore: threading.Semaphore, text: str, fil
8586
semaphore.acquire()
8687
try:
8788
_thread_state.stage = "calling_embed_text"
88-
result = _embedding_client.embed_text(text, file_path=file_path, chunk_index=chunk_index)
89+
# Use llama-index embedding client
90+
result = _embedding_client._get_text_embedding(text)
8991
_thread_state.stage = "completed"
9092
return result
9193
except Exception as e:
@@ -170,14 +172,8 @@ def _process_file_sync(
170172
if isinstance(cfg, dict):
171173
embedding_model = cfg.get("embedding_model")
172174

173-
# Use smart chunking for supported code languages
174-
use_smart_chunking = cfg.get("smart_chunking", True) if isinstance(cfg, dict) else True
175-
supported_languages = ["python", "javascript", "typescript", "java", "go", "rust", "c", "cpp"]
176-
177-
if use_smart_chunking and lang in supported_languages:
178-
chunks = smart_chunk(content, language=lang, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
179-
else:
180-
chunks = chunk_text(content, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
175+
# Use llama-index chunking for all content
176+
chunks = chunk_with_llama_index(content, language=lang, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
181177

182178
if not chunks:
183179
chunks = [content]
@@ -439,19 +435,18 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
439435

440436

441437

442-
def search_semantic(query: str, database_path: str, top_k: int = 5, include_content: bool = True):
438+
def search_semantic(query: str, database_path: str, top_k: int = 5):
443439
"""
444-
Uses llama-index with sqlite-vector backend to retrieve best-matching chunks and returns
445-
a list of {file_id, path, chunk_index, score, content (optional)}.
440+
Uses llama-index with sqlite-vector backend to retrieve best-matching chunks.
441+
Always includes content as it's needed for the coding model context.
446442
447443
Args:
448444
query: Search query text
449445
database_path: Path to the SQLite database
450446
top_k: Number of results to return
451-
include_content: Whether to retrieve and include the actual chunk text
452447
453448
Returns:
454-
List of dicts with file_id, path, chunk_index, score, and optionally content
449+
List of dicts with file_id, path, chunk_index, score, and content
455450
"""
456451
try:
457452
# Use llama-index for semantic search
@@ -466,13 +461,9 @@ def search_semantic(query: str, database_path: str, top_k: int = 5, include_cont
466461
"file_id": metadata.get("file_id", 0),
467462
"path": metadata.get("path", ""),
468463
"chunk_index": metadata.get("chunk_index", 0),
469-
"score": metadata.get("score", 0.0)
464+
"score": metadata.get("score", 0.0),
465+
"content": doc.text or "" # Always include content for LLM context
470466
}
471-
472-
# Include content if requested
473-
if include_content:
474-
result["content"] = doc.text or ""
475-
476467
results.append(result)
477468

478469
logger.info(f"llama-index search returned {len(results)} results")

ai/llama_chunker.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""
2+
LlamaIndex-based chunking for code and text.
3+
Replaces smart_chunker.py with llama-index's built-in splitters.
4+
"""
5+
from typing import List
6+
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
7+
from llama_index.core.schema import Document
8+
9+
from utils.logger import get_logger
10+
11+
logger = get_logger(__name__)
12+
13+
14+
def chunk_with_llama_index(
    content: str,
    language: str = "text",
    chunk_size: int = 800,
    chunk_overlap: int = 100
) -> List[str]:
    """
    Chunk text or code using llama-index's splitters.

    Uses CodeSplitter for recognized programming languages and
    SentenceSplitter for plain text or unknown languages. Falls back to
    simple character-based chunking if llama-index splitting fails
    (e.g. a missing tree-sitter grammar).

    Args:
        content: Text or code content to chunk
        language: Programming language (python, javascript, etc.) or
            "text"; None is treated as "text"
        chunk_size: Target size for each chunk in characters
        chunk_overlap: Overlap between chunks in characters (honored on
            the SentenceSplitter path; the CodeSplitter path overlaps by
            lines instead)

    Returns:
        List of text chunks (empty list for empty input)
    """
    # Nothing to split: avoid returning a single empty-string chunk that
    # downstream code would pointlessly try to embed.
    if not content:
        return []

    # Map our language names to llama-index/tree-sitter identifiers.
    language_map = {
        "python": "python",
        "javascript": "js",
        "typescript": "ts",
        "java": "java",
        "go": "go",
        "rust": "rust",
        "c": "c",
        "cpp": "cpp",
        "c++": "cpp",
    }

    try:
        # Normalize first: callers may pass None for unknown file types,
        # and a bare language.lower() would raise AttributeError.
        llama_lang = language_map.get((language or "text").lower())

        if llama_lang:
            # Code: split along syntactic boundaries.
            splitter = CodeSplitter(
                language=llama_lang,
                chunk_lines=40,          # target lines per chunk (approximation)
                chunk_lines_overlap=5,   # CodeSplitter overlap is line-based
                max_chars=chunk_size
            )
            logger.debug(f"Using CodeSplitter for language: {llama_lang}")
        else:
            # Plain text / unsupported language: sentence-aware splitting.
            splitter = SentenceSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                paragraph_separator="\n\n",
                secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?"
            )
            logger.debug(f"Using SentenceSplitter for language: {language}")

        # Wrap the raw content in a Document and split it into nodes.
        nodes = splitter.get_nodes_from_documents([Document(text=content)])

        # Keep only nodes that actually carry text.
        chunks = [node.text for node in nodes if node.text]

        logger.debug(f"Split content into {len(chunks)} chunks")
        return chunks if chunks else [content]

    except Exception as e:
        # Splitting can fail on malformed input or a missing grammar —
        # degrade to naive character chunking rather than drop the file.
        logger.exception(f"Error chunking with llama-index: {e}")
        return simple_chunk(content, chunk_size, chunk_overlap)
82+
83+
84+
def simple_chunk(text: str, chunk_size: int = 800, chunk_overlap: int = 100) -> List[str]:
    """
    Simple character-based chunking fallback.

    Slides a fixed-size window over the text, advancing by
    (chunk_size - chunk_overlap) characters each step; chunks that are
    pure whitespace are dropped.

    Args:
        text: Text to chunk
        chunk_size: Size of each chunk
        chunk_overlap: Overlap between consecutive chunks

    Returns:
        List of text chunks ([] for empty input; [text] if every window
        was whitespace-only)
    """
    if not text:
        return []

    # Guard against overlap >= size, which would otherwise stall the walk.
    stride = max(1, chunk_size - chunk_overlap)
    pieces: List[str] = []
    start = 0

    while start < len(text):
        stop = min(start + chunk_size, len(text))
        window = text[start:stop]
        if window.strip():
            pieces.append(window)
        if stop >= len(text):
            break
        start += stride

    return pieces if pieces else [text]

ai/llama_embeddings.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""
2+
LlamaIndex-compatible embeddings using OpenAI API.
3+
Replaces the custom EmbeddingClient with llama-index's embedding abstraction.
4+
"""
5+
from typing import List, Optional
6+
from llama_index.core.embeddings import BaseEmbedding
7+
from llama_index.core.bridge.pydantic import PrivateAttr
8+
from openai import OpenAI
9+
10+
from utils.config import CFG
11+
from utils.logger import get_logger
12+
13+
logger = get_logger(__name__)
14+
15+
16+
class OpenAICompatibleEmbedding(BaseEmbedding):
    """
    LlamaIndex-compatible embedding model using an OpenAI-compatible API.

    Works with any OpenAI-compatible endpoint (OpenAI, Azure, local
    servers, etc.). API failures are logged and surface as empty
    embedding lists, never as raised exceptions, so callers can detect
    and skip failed items.
    """

    # Runtime-only handles, kept out of the pydantic model schema.
    _client: OpenAI = PrivateAttr()
    _model: str = PrivateAttr()

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        model: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the embedding model.

        Args:
            api_key: OpenAI API key (defaults to config)
            api_base: API base URL (defaults to config)
            model: Model name (defaults to config, then
                "text-embedding-3-small")
        """
        super().__init__(**kwargs)

        # Fall back to the project config for anything not passed in.
        self._client = OpenAI(
            api_key=api_key or CFG.get("api_key"),
            base_url=api_base or CFG.get("api_url")
        )
        self._model = model or CFG.get("embedding_model") or "text-embedding-3-small"

        logger.info(f"Initialized OpenAICompatibleEmbedding with model: {self._model}")

    @classmethod
    def class_name(cls) -> str:
        return "OpenAICompatibleEmbedding"

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Get query embedding asynchronously (delegates to the sync path)."""
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Get text embedding asynchronously (delegates to the sync path)."""
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        """Get embedding for a query (same model as document text)."""
        return self._get_text_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """
        Get embedding for a single text.

        Returns:
            The embedding vector, or an empty list on empty input or
            API failure (errors are logged, never raised).
        """
        try:
            # Newlines can degrade embedding quality on some models.
            text = text.replace("\n", " ").strip()
            if not text:
                logger.warning("Empty text provided for embedding")
                return []

            response = self._client.embeddings.create(
                input=[text],
                model=self._model
            )

            if response.data and len(response.data) > 0:
                embedding = response.data[0].embedding
                logger.debug(f"Generated embedding with dimension: {len(embedding)}")
                return embedding
            else:
                logger.error("No embedding returned from API")
                return []

        except Exception as e:
            logger.exception(f"Failed to generate embedding: {e}")
            return []

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """
        Get embeddings for multiple texts.

        Sends all non-empty texts in a single batched API call (one
        round-trip instead of one per text). Empty texts map to empty
        embeddings. If the batch call fails, falls back to per-text
        requests so partial results are still produced.

        Returns:
            One embedding per input text, in input order; an empty list
            entry marks a text that could not be embedded.
        """
        if not texts:
            return []

        cleaned = [t.replace("\n", " ").strip() for t in texts]
        results: List[List[float]] = [[] for _ in texts]

        # Only send non-empty texts: empty strings are rejected by the API.
        to_send = [(i, t) for i, t in enumerate(cleaned) if t]
        if not to_send:
            logger.warning("No non-empty texts provided for embedding")
            return results

        try:
            response = self._client.embeddings.create(
                input=[t for _, t in to_send],
                model=self._model
            )
            # Each returned item carries its position in the request;
            # sort by it and map back to the original text positions.
            for (orig_idx, _), item in zip(
                to_send, sorted(response.data, key=lambda d: d.index)
            ):
                results[orig_idx] = item.embedding
            return results
        except Exception as e:
            logger.exception(f"Batch embedding failed, retrying per text: {e}")
            # Per-text path has its own error handling and returns [] on failure.
            return [self._get_text_embedding(t) for t in texts]

ai/llama_integration.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,18 @@
22
LlamaIndex integration for document retrieval.
33
Provides RAG functionality using llama-index with sqlite-vector backend.
44
"""
5-
from typing import List, Optional
5+
from typing import List
66
from llama_index.core import Document
77
from llama_index.core.vector_stores.types import VectorStoreQuery
88

9-
from .openai import EmbeddingClient
9+
from .llama_embeddings import OpenAICompatibleEmbedding
1010
from .llama_vector_store import SQLiteVectorStore
1111
from utils.logger import get_logger
1212

1313
logger = get_logger(__name__)
1414

1515
# Create a module-level embedding client instance
16-
_embedding_client = EmbeddingClient()
16+
_embedding_client = OpenAICompatibleEmbedding()
1717

1818

1919
def llama_index_search(query: str, database_path: str, top_k: int = 5) -> List[Document]:
@@ -29,8 +29,8 @@ def llama_index_search(query: str, database_path: str, top_k: int = 5) -> List[D
2929
List of Document objects with chunk text and metadata
3030
"""
3131
try:
32-
# Get query embedding
33-
q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
32+
# Get query embedding using llama-index embedding client
33+
q_emb = _embedding_client._get_query_embedding(query)
3434
if not q_emb:
3535
logger.warning("Failed to generate query embedding")
3636
return []

db/models.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ class QueryRequest(BaseModel):
1919
project_id: str
2020
query: str
2121
top_k: Optional[int] = 5
22-
include_content: Optional[bool] = True # Whether to include file content in results
2322

endpoints/project_endpoints.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,26 +104,43 @@ def api_get_project(project_id: str):
104104

105105
# Add indexing statistics if project has a database
106106
db_path = project.get("database_path")
107+
project_path = project.get("path")
108+
107109
if db_path and os.path.exists(db_path):
108110
try:
109111
from db.operations import get_project_stats
110112
stats = get_project_stats(db_path)
113+
114+
# Count total files in project directory for progress tracking
115+
total_files = 0
116+
if project_path and os.path.exists(project_path):
117+
try:
118+
for root, dirs, files in os.walk(project_path):
119+
# Skip common ignored directories
120+
dirs[:] = [d for d in dirs if not d.startswith('.') and d not in ['node_modules', '__pycache__', 'venv', '.venv']]
121+
total_files += len([f for f in files if not f.startswith('.')])
122+
except Exception as e:
123+
logger.warning(f"Could not count total files: {e}")
124+
111125
project["indexing_stats"] = {
112126
"file_count": stats.get("file_count", 0),
113127
"embedding_count": stats.get("embedding_count", 0),
128+
"total_files": total_files,
114129
"is_indexed": stats.get("file_count", 0) > 0
115130
}
116131
except Exception as e:
117132
logger.warning(f"Could not get stats for project {project_id}: {e}")
118133
project["indexing_stats"] = {
119134
"file_count": 0,
120135
"embedding_count": 0,
136+
"total_files": 0,
121137
"is_indexed": False
122138
}
123139
else:
124140
project["indexing_stats"] = {
125141
"file_count": 0,
126142
"embedding_count": 0,
143+
"total_files": 0,
127144
"is_indexed": False
128145
}
129146

0 commit comments

Comments
 (0)