
Commit 293eeaa

Remove analyses table and update code to work directly with projects

Copilot and Mte90 committed
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 48d10ef · commit 293eeaa

File tree: analyzer.py, db.py, main.py

3 files changed: +61 additions, -146 deletions


analyzer.py

Lines changed: 7 additions & 39 deletions
@@ -10,7 +10,7 @@
 import concurrent.futures
 import threading
 
-from db import create_analysis, store_file, update_analysis_status
+from db import store_file
 from external_api import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
 from logger import get_logger
@@ -268,7 +268,6 @@ def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optio
 def _process_file_sync(
     semaphore: threading.Semaphore,
     database_path: str,
-    analysis_id: int,
     full_path: str,
     rel_path: str,
     cfg: Optional[Dict[str, Any]],
@@ -295,9 +294,9 @@ def _process_file_sync(
 
     # store file (synchronous DB writer)
     try:
-        fid = store_file(database_path, analysis_id, rel_path, content, lang)
+        fid = store_file(database_path, rel_path, content, lang)
     except Exception:
-        logger.exception("Failed to store file %s for analysis %s", rel_path, analysis_id)
+        logger.exception("Failed to store file %s", rel_path)
         return {"stored": False, "embedded": False}
 
     _ = Document(text=content, extra_info={"path": rel_path, "lang": lang})
@@ -396,14 +395,10 @@ def analyze_local_path_sync(
 ):
     """
     Synchronous implementation of the analysis pipeline.
-    Submits per-file tasks to a shared ThreadPoolExecutor and updates DB counts/status synchronously.
+    Submits per-file tasks to a shared ThreadPoolExecutor.
     """
-    aid = None
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
     try:
-        name = os.path.basename(os.path.abspath(local_path)) or local_path
-        aid = create_analysis(database_path, name, local_path, "running")
-
         file_count = 0
         emb_count = 0
         file_paths: List[Dict[str, str]] = []
@@ -431,7 +426,6 @@
                 _process_file_sync,
                 semaphore,
                 database_path,
-                aid,
                 f["full"],
                 f["rel"],
                 cfg,
@@ -447,7 +441,7 @@
                 if r.get("embedded"):
                     emb_count += 1
             except Exception:
-                logger.exception("A per-file task failed for analysis %s", aid)
+                logger.exception("A per-file task failed")
 
         # store uv_detected.json metadata if possible
         uv_info = None
@@ -459,7 +453,6 @@
             try:
                 store_file(
                     database_path,
-                    aid,
                     "uv_detected.json",
                     json.dumps(uv_info, indent=2),
                     "meta",
@@ -468,34 +461,9 @@
                 try:
                     print("Failed to store uv_detected.json in DB")
                 except Exception:
-                    logger.exception("Failed to write uv_detected meta error to disk for analysis %s", aid)
-
-        # final counts & status
-        try:
-            # update_analysis_counts may be defined elsewhere; call if present
-            try:
-                update_analysis_counts  # type: ignore
-                update_analysis_counts(database_path, aid, file_count, emb_count)  # type: ignore
-            except NameError:
-                # function not present; skip
-                pass
-        except Exception:
-            logger.exception("Failed to update analysis counts for %s", aid)
-
-        try:
-            update_analysis_status(database_path, aid, "completed")
-        except Exception:
-            logger.exception("Failed to set analysis status to completed for %s", aid)
+                    logger.exception("Failed to write uv_detected meta error")
 
     except Exception:
-        try:
-            if aid:
-                try:
-                    update_analysis_status(database_path, aid, "failed")
-                except Exception:
-                    pass
-        except Exception:
-            pass
         traceback.print_exc()
@@ -534,7 +502,7 @@ def cosine(a, b):
     return sum(x * y for x, y in zip(a, b)) / (na * nb)
 
 
-def search_semantic(query: str, database_path: str, analysis_id: int, top_k: int = 5):
+def search_semantic(query: str, database_path: str, top_k: int = 5):
     """
     Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
     a list of {file_id, path, chunk_index, score}.
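
With this commit the per-file helper and the search entry point lose their analysis_id parameter; a project database is the whole scope. A minimal before/after sketch of the call sites, using the signatures from the diff above (the database path, file content, and query string are placeholders, not part of the commit):

from db import store_file
from analyzer import search_semantic

DB_PATH = "projects/example.sqlite"  # placeholder per-project database path

# Before this commit the calls carried an analysis id:
#   store_file(DB_PATH, analysis_id, "src/app.py", content, "python")
#   search_semantic("where is the config loaded?", DB_PATH, analysis_id, top_k=5)

# After this commit the project database itself is the scope:
file_id = store_file(DB_PATH, "src/app.py", "print('hello')", "python")
hits = search_semantic("where is the config loaded?", DB_PATH, top_k=5)
for hit in hits:
    # each hit is {file_id, path, chunk_index, score} per the docstring above
    print(hit["path"], hit["score"])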

db.py

Lines changed: 45 additions & 94 deletions
@@ -173,43 +173,27 @@ def init_db(database_path: str) -> None:
     """
     Initialize database schema. Safe to call multiple times.
     Creates:
-      - analyses (embedding_count column kept for backward compat but not used as source of truth)
-      - files
+      - files (stores full content of indexed files)
       - chunks (with embedding BLOB column for sqlite-vector)
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # analyses table: embedding_count column kept for compatibility but will be computed live
-        cur.execute(
-            """
-            CREATE TABLE IF NOT EXISTS analyses (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                name TEXT NOT NULL,
-                path TEXT NOT NULL,
-                status TEXT NOT NULL,
-                embedding_count INTEGER DEFAULT 0,
-                created_at TEXT DEFAULT (datetime('now'))
-            )
-            """
-        )
-
+
         # files table (stores full content, used to reconstruct chunks)
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS files (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
-                analysis_id INTEGER NOT NULL,
                 path TEXT NOT NULL,
                 content TEXT,
                 language TEXT,
                 snippet TEXT,
-                created_at TEXT DEFAULT (datetime('now')),
-                FOREIGN KEY (analysis_id) REFERENCES analyses(id) ON DELETE CASCADE
+                created_at TEXT DEFAULT (datetime('now'))
             )
             """
         )
-        cur.execute("CREATE INDEX IF NOT EXISTS idx_files_analysis ON files(analysis_id);")
+        cur.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);")
 
         # chunks table: metadata for chunked documents; includes embedding BLOB column
         cur.execute(
@@ -231,39 +215,15 @@ def init_db(database_path: str) -> None:
         conn.close()
 
 
-def create_analysis(database_path: str, name: str, path: str, status: str = "pending") -> int:
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute(
-            "INSERT INTO analyses (name, path, status) VALUES (?, ?, ?)",
-            (name, path, status),
-        )
-        conn.commit()
-        return int(cur.lastrowid)
-    finally:
-        conn.close()
-
-
-def update_analysis_status(database_path: str, analysis_id: int, status: str) -> None:
-    conn = _get_connection(database_path)
-    try:
-        cur = conn.cursor()
-        cur.execute("UPDATE analyses SET status = ? WHERE id = ?", (status, analysis_id))
-        conn.commit()
-    finally:
-        conn.close()
-
-
-def store_file(database_path, analysis_id, path, content, language):
+def store_file(database_path, path, content, language):
     """
     Insert a file record into the DB using a queued single-writer to avoid
     sqlite 'database is locked' errors in multithreaded scenarios.
     Returns lastrowid (same as the previous store_file implementation).
     """
     snippet = (content[:512] if content else "")
-    sql = "INSERT INTO files (analysis_id, path, content, language, snippet) VALUES (?, ?, ?, ?, ?)"
-    params = (analysis_id, path, content, language, snippet)
+    sql = "INSERT INTO files (path, content, language, snippet) VALUES (?, ?, ?, ?)"
+    params = (path, content, language, snippet)
 
     writer = _get_writer(database_path)
     # We wait for the background writer to complete the insert and then return the row id.
@@ -289,75 +249,66 @@ def insert_chunk_row_with_null_embedding(database_path: str, file_id: int, path:
         conn.close()
 
 
-def list_analyses(database_path: str) -> List[Dict[str, Any]]:
+def get_project_stats(database_path: str) -> Dict[str, Any]:
     """
-    Return analyses with computed file_count and computed embedding_count (from chunks.embedding).
-    This ensures the UI shows accurate, up-to-date counts based on actual rows.
+    Get statistics for a project database.
+    Returns file_count and embedding_count.
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        rows = cur.execute(
-            """
-            SELECT
-                a.id,
-                a.name,
-                a.path,
-                a.status,
-                (SELECT COUNT(*) FROM files f WHERE f.analysis_id = a.id) AS file_count,
-                (SELECT COUNT(*) FROM chunks ch JOIN files f2 ON ch.file_id = f2.id
-                 WHERE f2.analysis_id = a.id AND ch.embedding IS NOT NULL) AS embedding_count,
-                a.created_at
-            FROM analyses a
-            ORDER BY a.id DESC
-            """
-        ).fetchall()
-        results: List[Dict[str, Any]] = []
-        for r in rows:
-            results.append(
-                {
-                    "id": r["id"],
-                    "name": r["name"],
-                    "path": r["path"],
-                    "status": r["status"],
-                    "file_count": int(r["file_count"]),
-                    "embedding_count": int(r["embedding_count"]),
-                    "created_at": r["created_at"],
-                }
-            )
-        return results
+
+        # Count files
+        cur.execute("SELECT COUNT(*) FROM files")
+        file_count = cur.fetchone()[0]
+
+        # Count embeddings
+        cur.execute("SELECT COUNT(*) FROM chunks WHERE embedding IS NOT NULL")
+        embedding_count = cur.fetchone()[0]
+
+        return {
+            "file_count": int(file_count),
+            "embedding_count": int(embedding_count)
+        }
     finally:
         conn.close()
 
 
-def list_files_for_analysis(database_path: str, analysis_id: int) -> List[Dict[str, Any]]:
+def list_files(database_path: str) -> List[Dict[str, Any]]:
+    """
+    List all files in a project database.
+    """
     conn = _get_connection(database_path)
     try:
         rows = conn.execute(
-            "SELECT id, path, snippet FROM files WHERE analysis_id = ? ORDER BY id DESC", (analysis_id,)
+            "SELECT id, path, snippet, language, created_at FROM files ORDER BY id DESC"
         ).fetchall()
-        return [{"id": r["id"], "path": r["path"], "snippet": r["snippet"]} for r in rows]
+        return [
+            {
+                "id": r["id"],
+                "path": r["path"],
+                "snippet": r["snippet"],
+                "language": r["language"],
+                "created_at": r["created_at"]
+            }
+            for r in rows
+        ]
     finally:
         conn.close()
 
 
-def delete_analysis(database_path: str, analysis_id: int) -> None:
+def clear_project_data(database_path: str) -> None:
     """
-    Delete an analysis and cascade-delete associated files / chunks.
-    Foreign key enforcement varies by SQLite build; do explicit deletes for safety.
+    Clear all files and chunks from a project database.
+    Used when re-indexing a project.
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()
-        # delete chunks for files in analysis
-        cur.execute(
-            "DELETE FROM chunks WHERE file_id IN (SELECT id FROM files WHERE analysis_id = ?)",
-            (analysis_id,),
-        )
-        # delete files
-        cur.execute("DELETE FROM files WHERE analysis_id = ?", (analysis_id,))
-        # delete analysis row
-        cur.execute("DELETE FROM analyses WHERE id = ?", (analysis_id,))
+        # Delete chunks first due to foreign key
+        cur.execute("DELETE FROM chunks")
+        # Delete files
+        cur.execute("DELETE FROM files")
         conn.commit()
     finally:
         conn.close()
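
With the analyses table gone, db.py treats one SQLite file as one project. A rough sketch of the resulting lifecycle, assuming the caller already knows its per-project database path (the path and sample file below are illustrative, not part of the commit):

from db import init_db, store_file, get_project_stats, list_files, clear_project_data

DB_PATH = "projects/example.sqlite"  # illustrative; the real path comes from the projects registry

init_db(DB_PATH)                     # creates the files/chunks tables if missing
clear_project_data(DB_PATH)          # wipe chunks and files before re-indexing
store_file(DB_PATH, "README.md", "# Example\n", "markdown")

stats = get_project_stats(DB_PATH)   # {"file_count": ..., "embedding_count": ...}
print(stats["file_count"], "files,", stats["embedding_count"], "embeddings")

for f in list_files(DB_PATH):
    print(f["id"], f["path"], f["language"])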

main.py

Lines changed: 9 additions & 13 deletions
@@ -9,7 +9,7 @@
 from typing import Optional
 from datetime import datetime
 
-from db import init_db, list_analyses
+from db import init_db, get_project_stats
 from analyzer import analyze_local_path_background, search_semantic, call_coding_model
 from config import CFG
 from projects import (
@@ -158,15 +158,13 @@ def api_query(request: QueryRequest):
 
     db_path = project["database_path"]
 
-    # Get the first analysis ID from the project database
-    analyses = list_analyses(db_path)
-    if not analyses:
+    # Check if project has been indexed
+    stats = get_project_stats(db_path)
+    if stats["file_count"] == 0:
         return JSONResponse({"error": "Project not indexed yet"}, status_code=400)
 
-    analysis_id = analyses[0]["id"]
-
     # Perform semantic search
-    results = search_semantic(request.query, db_path, analysis_id=analysis_id, top_k=request.top_k)
+    results = search_semantic(request.query, db_path, top_k=request.top_k)
 
     return JSONResponse({
         "results": results,
@@ -298,12 +296,10 @@ def code_endpoint(request: Request):
 
         database_path = project["database_path"]
 
-        # Get the first analysis from this project's database
-        analyses = list_analyses(database_path)
-        if not analyses:
+        # Check if project has been indexed
+        stats = get_project_stats(database_path)
+        if stats["file_count"] == 0:
             return JSONResponse({"error": "Project not indexed yet. Please run indexing first."}, status_code=400)
-
-        analysis_id = analyses[0]["id"]
     except Exception as e:
         logger.exception(f"Error getting project: {e}")
         return JSONResponse({"error": "Failed to retrieve project"}, status_code=500)
@@ -319,7 +315,7 @@ def code_endpoint(request: Request):
     # If RAG requested, perform semantic search and build context
     if use_rag:
         try:
-            retrieved = search_semantic(prompt, database_path, analysis_id=int(analysis_id), top_k=top_k)
+            retrieved = search_semantic(prompt, database_path, top_k=top_k)
             # Build context WITHOUT including snippets: only include file references and scores
             context_parts = []
             total_len = len(combined_context)
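
Both endpoints now share the same guard: ask get_project_stats whether anything has been indexed before searching. A condensed sketch of that pattern with the FastAPI routing omitted (the query_project wrapper is illustrative, not part of the commit; the error payload mirrors the diff):

from fastapi.responses import JSONResponse

from db import get_project_stats
from analyzer import search_semantic

def query_project(db_path: str, query: str, top_k: int = 5):
    # Refuse to query a project that has never been indexed.
    stats = get_project_stats(db_path)
    if stats["file_count"] == 0:
        return JSONResponse({"error": "Project not indexed yet"}, status_code=400)

    # analysis_id is gone: the project database is the unit of search.
    results = search_semantic(query, db_path, top_k=top_k)
    return JSONResponse({"results": results})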

0 commit comments
