
Commit 293eeaa

Remove analyses table and update code to work directly with projects

Copilot and Mte90 committed
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 48d10ef · commit 293eeaa

File tree: analyzer.py, db.py, main.py

3 files changed: +61 additions, -146 deletions


analyzer.py

Lines changed: 7 additions & 39 deletions
@@ -10,7 +10,7 @@
 import concurrent.futures
 import threading
 
-from db import create_analysis, store_file, update_analysis_status
+from db import store_file
 from external_api import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
 from logger import get_logger
@@ -268,7 +268,6 @@ def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optio
 def _process_file_sync(
     semaphore: threading.Semaphore,
     database_path: str,
-    analysis_id: int,
     full_path: str,
     rel_path: str,
     cfg: Optional[Dict[str, Any]],
@@ -295,9 +294,9 @@ def _process_file_sync(
 
     # store file (synchronous DB writer)
     try:
-        fid = store_file(database_path, analysis_id, rel_path, content, lang)
+        fid = store_file(database_path, rel_path, content, lang)
     except Exception:
-        logger.exception("Failed to store file %s for analysis %s", rel_path, analysis_id)
+        logger.exception("Failed to store file %s", rel_path)
         return {"stored": False, "embedded": False}
 
     _ = Document(text=content, extra_info={"path": rel_path, "lang": lang})
@@ -396,14 +395,10 @@ def analyze_local_path_sync(
 ):
     """
     Synchronous implementation of the analysis pipeline.
-    Submits per-file tasks to a shared ThreadPoolExecutor and updates DB counts/status synchronously.
+    Submits per-file tasks to a shared ThreadPoolExecutor.
     """
-    aid = None
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
     try:
-        name = os.path.basename(os.path.abspath(local_path)) or local_path
-        aid = create_analysis(database_path, name, local_path, "running")
-
         file_count = 0
         emb_count = 0
         file_paths: List[Dict[str, str]] = []
@@ -431,7 +426,6 @@
                 _process_file_sync,
                 semaphore,
                 database_path,
-                aid,
                 f["full"],
                 f["rel"],
                 cfg,
@@ -447,7 +441,7 @@
                 if r.get("embedded"):
                     emb_count += 1
             except Exception:
-                logger.exception("A per-file task failed for analysis %s", aid)
+                logger.exception("A per-file task failed")
 
         # store uv_detected.json metadata if possible
         uv_info = None
@@ -459,7 +453,6 @@
             try:
                 store_file(
                     database_path,
-                    aid,
                     "uv_detected.json",
                     json.dumps(uv_info, indent=2),
                     "meta",
@@ -468,34 +461,9 @@
                 try:
                     print("Failed to store uv_detected.json in DB")
                 except Exception:
-                    logger.exception("Failed to write uv_detected meta error to disk for analysis %s", aid)
-
-        # final counts & status
-        try:
-            # update_analysis_counts may be defined elsewhere; call if present
-            try:
-                update_analysis_counts  # type: ignore
-                update_analysis_counts(database_path, aid, file_count, emb_count)  # type: ignore
-            except NameError:
-                # function not present; skip
-                pass
-        except Exception:
-            logger.exception("Failed to update analysis counts for %s", aid)
-
-        try:
-            update_analysis_status(database_path, aid, "completed")
-        except Exception:
-            logger.exception("Failed to set analysis status to completed for %s", aid)
+                    logger.exception("Failed to write uv_detected meta error")
 
     except Exception:
-        try:
-            if aid:
-                try:
-                    update_analysis_status(database_path, aid, "failed")
-                except Exception:
-                    pass
-        except Exception:
-            pass
         traceback.print_exc()
@@ -534,7 +502,7 @@ def cosine(a, b):
     return sum(x * y for x, y in zip(a, b)) / (na * nb)
 
 
-def search_semantic(query: str, database_path: str, analysis_id: int, top_k: int = 5):
+def search_semantic(query: str, database_path: str, top_k: int = 5):
     """
     Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
     a list of {file_id, path, chunk_index, score}.
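
With this commit the per-file helper and the search entry point lose their analysis_id parameter; a project database is the whole scope. A minimal before/after sketch of the call sites, using the signatures from the diff above (the database path, file content, and query string are placeholders, not part of the commit):

from db import store_file
from analyzer import search_semantic

DB_PATH = "projects/example.sqlite"  # placeholder per-project database path

# Before this commit the calls carried an analysis id:
#   store_file(DB_PATH, analysis_id, "src/app.py", content, "python")
#   search_semantic("where is the config loaded?", DB_PATH, analysis_id, top_k=5)

# After this commit the project database itself is the scope:
file_id = store_file(DB_PATH, "src/app.py", "print('hello')", "python")
hits = search_semantic("where is the config loaded?", DB_PATH, top_k=5)
for hit in hits:
    # each hit is {file_id, path, chunk_index, score} per the docstring above
    print(hit["path"], hit["score"])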

db.py

Lines changed: 45 additions & 94 deletions
@@ -173,43 +173,27 @@ def init_db(database_path: str) -> None:
     """
     Initialize database schema. Safe to call multiple times.
     Creates:
-      - analyses (embedding_count column kept for backward compat but not used as source of truth)
-      - files
+      - files (stores full content of indexed files)
       - chunks (with embedding BLOB column for sqlite-vector)
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # analyses table: embedding_count column kept for compatibility but will be computed live
-        cur.execute(
-            """
-            CREATE TABLE IF NOT EXISTS analyses (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                name TEXT NOT NULL,
-                path TEXT NOT NULL,
-                status TEXT NOT NULL,
-                embedding_count INTEGER DEFAULT 0,
-                created_at TEXT DEFAULT (datetime('now'))
-            )
-            """
-        )
-
+
         # files table (stores full content, used to reconstruct chunks)
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS files (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
-                analysis_id INTEGER NOT NULL,
                 path TEXT NOT NULL,
                 content TEXT,
                 language TEXT,
                 snippet TEXT,
-                created_at TEXT DEFAULT (datetime('now')),
-                FOREIGN KEY (analysis_id) REFERENCES analyses(id) ON DELETE CASCADE
+                created_at TEXT DEFAULT (datetime('now'))
             )
             """
         )
-        cur.execute("CREATE INDEX IF NOT EXISTS idx_files_analysis ON files(analysis_id);")
+        cur.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);")
 
         # chunks table: metadata for chunked documents; includes embedding BLOB column
         cur.execute(
@@ -231,39 +215,15 @@ def init_db(database_path: str) -> None:
         conn.close()
 
 
-def create_analysis(database_path: str, name: str, path: str, status: str = "pending") -> int:
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute(
-            "INSERT INTO analyses (name, path, status) VALUES (?, ?, ?)",
-            (name, path, status),
-        )
-        conn.commit()
-        return int(cur.lastrowid)
-    finally:
-        conn.close()
-
-
-def update_analysis_status(database_path: str, analysis_id: int, status: str) -> None:
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute("UPDATE analyses SET status = ? WHERE id = ?", (status, analysis_id))
-        conn.commit()
-    finally:
-        conn.close()
-
-
-def store_file(database_path, analysis_id, path, content, language):
+def store_file(database_path, path, content, language):
     """
     Insert a file record into the DB using a queued single-writer to avoid
     sqlite 'database is locked' errors in multithreaded scenarios.
     Returns lastrowid (same as the previous store_file implementation).
     """
     snippet = (content[:512] if content else "")
-    sql = "INSERT INTO files (analysis_id, path, content, language, snippet) VALUES (?, ?, ?, ?, ?)"
-    params = (analysis_id, path, content, language, snippet)
+    sql = "INSERT INTO files (path, content, language, snippet) VALUES (?, ?, ?, ?)"
+    params = (path, content, language, snippet)
 
     writer = _get_writer(database_path)
     # We wait for the background writer to complete the insert and then return the row id.
@@ -289,75 +249,66 @@ def insert_chunk_row_with_null_embedding(database_path: str, file_id: int, path:
         conn.close()
 
 
-def list_analyses(database_path: str) -> List[Dict[str, Any]]:
+def get_project_stats(database_path: str) -> Dict[str, Any]:
     """
-    Return analyses with computed file_count and computed embedding_count (from chunks.embedding).
-    This ensures the UI shows accurate, up-to-date counts based on actual rows.
+    Get statistics for a project database.
+    Returns file_count and embedding_count.
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        rows = cur.execute(
-            """
-            SELECT
-                a.id,
-                a.name,
-                a.path,
-                a.status,
-                (SELECT COUNT(*) FROM files f WHERE f.analysis_id = a.id) AS file_count,
-                (SELECT COUNT(*) FROM chunks ch JOIN files f2 ON ch.file_id = f2.id
-                 WHERE f2.analysis_id = a.id AND ch.embedding IS NOT NULL) AS embedding_count,
-                a.created_at
-            FROM analyses a
-            ORDER BY a.id DESC
-            """
-        ).fetchall()
-        results: List[Dict[str, Any]] = []
-        for r in rows:
-            results.append(
-                {
-                    "id": r["id"],
-                    "name": r["name"],
-                    "path": r["path"],
-                    "status": r["status"],
-                    "file_count": int(r["file_count"]),
-                    "embedding_count": int(r["embedding_count"]),
-                    "created_at": r["created_at"],
-                }
-            )
-        return results
+
+        # Count files
+        cur.execute("SELECT COUNT(*) FROM files")
+        file_count = cur.fetchone()[0]
+
+        # Count embeddings
+        cur.execute("SELECT COUNT(*) FROM chunks WHERE embedding IS NOT NULL")
+        embedding_count = cur.fetchone()[0]
+
+        return {
+            "file_count": int(file_count),
+            "embedding_count": int(embedding_count)
+        }
     finally:
         conn.close()
 
 
-def list_files_for_analysis(database_path: str, analysis_id: int) -> List[Dict[str, Any]]:
+def list_files(database_path: str) -> List[Dict[str, Any]]:
+    """
+    List all files in a project database.
+    """
     conn = _get_connection(database_path)
     try:
         rows = conn.execute(
-            "SELECT id, path, snippet FROM files WHERE analysis_id = ? ORDER BY id DESC", (analysis_id,)
+            "SELECT id, path, snippet, language, created_at FROM files ORDER BY id DESC"
         ).fetchall()
-        return [{"id": r["id"], "path": r["path"], "snippet": r["snippet"]} for r in rows]
+        return [
+            {
+                "id": r["id"],
+                "path": r["path"],
+                "snippet": r["snippet"],
+                "language": r["language"],
+                "created_at": r["created_at"]
+            }
+            for r in rows
+        ]
     finally:
         conn.close()
 
 
-def delete_analysis(database_path: str, analysis_id: int) -> None:
+def clear_project_data(database_path: str) -> None:
     """
-    Delete an analysis and cascade-delete associated files / chunks.
-    Foreign key enforcement varies by SQLite build; do explicit deletes for safety.
+    Clear all files and chunks from a project database.
+    Used when re-indexing a project.
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # delete chunks for files in analysis
-        cur.execute(
-            "DELETE FROM chunks WHERE file_id IN (SELECT id FROM files WHERE analysis_id = ?)",
-            (analysis_id,),
-        )
-        # delete files
-        cur.execute("DELETE FROM files WHERE analysis_id = ?", (analysis_id,))
-        # delete analysis row
-        cur.execute("DELETE FROM analyses WHERE id = ?", (analysis_id,))
+        # Delete chunks first due to foreign key
+        cur.execute("DELETE FROM chunks")
+        # Delete files
+        cur.execute("DELETE FROM files")
         conn.commit()
     finally:
         conn.close()
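
With the analyses table gone, db.py treats one SQLite file as one project. A rough sketch of the resulting lifecycle, assuming the caller already knows its per-project database path (the path and sample file below are illustrative, not part of the commit):

from db import init_db, store_file, get_project_stats, list_files, clear_project_data

DB_PATH = "projects/example.sqlite"  # illustrative; the real path comes from the projects registry

init_db(DB_PATH)                     # creates the files/chunks tables if missing
clear_project_data(DB_PATH)          # wipe chunks and files before re-indexing
store_file(DB_PATH, "README.md", "# Example\n", "markdown")

stats = get_project_stats(DB_PATH)   # {"file_count": ..., "embedding_count": ...}
print(stats["file_count"], "files,", stats["embedding_count"], "embeddings")

for f in list_files(DB_PATH):
    print(f["id"], f["path"], f["language"])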

main.py

Lines changed: 9 additions & 13 deletions
@@ -9,7 +9,7 @@
 from typing import Optional
 from datetime import datetime
 
-from db import init_db, list_analyses
+from db import init_db, get_project_stats
 from analyzer import analyze_local_path_background, search_semantic, call_coding_model
 from config import CFG
 from projects import (
@@ -158,15 +158,13 @@ def api_query(request: QueryRequest):
 
     db_path = project["database_path"]
 
-    # Get the first analysis ID from the project database
-    analyses = list_analyses(db_path)
-    if not analyses:
+    # Check if project has been indexed
+    stats = get_project_stats(db_path)
+    if stats["file_count"] == 0:
         return JSONResponse({"error": "Project not indexed yet"}, status_code=400)
 
-    analysis_id = analyses[0]["id"]
-
     # Perform semantic search
-    results = search_semantic(request.query, db_path, analysis_id=analysis_id, top_k=request.top_k)
+    results = search_semantic(request.query, db_path, top_k=request.top_k)
 
     return JSONResponse({
         "results": results,
@@ -298,12 +296,10 @@ def code_endpoint(request: Request):
 
         database_path = project["database_path"]
 
-        # Get the first analysis from this project's database
-        analyses = list_analyses(database_path)
-        if not analyses:
+        # Check if project has been indexed
+        stats = get_project_stats(database_path)
+        if stats["file_count"] == 0:
             return JSONResponse({"error": "Project not indexed yet. Please run indexing first."}, status_code=400)
-
-        analysis_id = analyses[0]["id"]
     except Exception as e:
         logger.exception(f"Error getting project: {e}")
         return JSONResponse({"error": "Failed to retrieve project"}, status_code=500)
@@ -319,7 +315,7 @@ def code_endpoint(request: Request):
     # If RAG requested, perform semantic search and build context
     if use_rag:
         try:
-            retrieved = search_semantic(prompt, database_path, analysis_id=int(analysis_id), top_k=top_k)
+            retrieved = search_semantic(prompt, database_path, top_k=top_k)
             # Build context WITHOUT including snippets: only include file references and scores
             context_parts = []
             total_len = len(combined_context)
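
Both endpoints now share the same guard: ask get_project_stats whether anything has been indexed before searching. A condensed sketch of that pattern with the FastAPI routing omitted (the query_project wrapper is illustrative, not part of the commit; the error payload mirrors the diff):

from fastapi.responses import JSONResponse

from db import get_project_stats
from analyzer import search_semantic

def query_project(db_path: str, query: str, top_k: int = 5):
    # Refuse to query a project that has never been indexed.
    stats = get_project_stats(db_path)
    if stats["file_count"] == 0:
        return JSONResponse({"error": "Project not indexed yet"}, status_code=400)

    # analysis_id is gone: the project database is the unit of search.
    results = search_semantic(query, db_path, top_k=top_k)
    return JSONResponse({"results": results})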

0 commit comments
