Skip to content

Commit 760f8b2

Browse files
Copilot authored and Mte90 committed
Add detailed progress logging for indexing embeddings
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent df6ff33 commit 760f8b2

File tree

1 file changed

+34
-4
lines changed

1 file changed

+34
-4
lines changed

ai/analyzer.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,8 @@ def _process_file_sync(
333333
rel_path: str,
334334
cfg: Optional[Dict[str, Any]],
335335
incremental: bool = True,
336+
processed_count: Optional[list] = None,
337+
total_files: int = 0,
336338
):
337339
"""
338340
Synchronous implementation of per-file processing.
@@ -364,8 +366,12 @@ def _process_file_sync(
364366
logger.debug(f"Skipping unchanged file: {rel_path}")
365367
return {"stored": False, "embedded": False, "skipped": True}
366368

367-
# Log file processing
368-
logger.info(f"Processing file: {rel_path}")
369+
# Log file processing with progress
370+
if processed_count is not None and total_files > 0:
371+
current = len(processed_count)
372+
logger.info(f"Processing file ({current}/{total_files}): {rel_path}")
373+
else:
374+
logger.info(f"Processing file: {rel_path}")
369375

370376
# store file (synchronous DB writer) with metadata
371377
try:
@@ -409,8 +415,13 @@ def _process_file_sync(
409415
chunk_tasks.append((idx, chunk_doc))
410416

411417
# Process embeddings in parallel batches for better throughput
412-
for batch_start in range(0, len(chunk_tasks), EMBEDDING_BATCH_SIZE):
418+
num_batches = (len(chunk_tasks) + EMBEDDING_BATCH_SIZE - 1) // EMBEDDING_BATCH_SIZE
419+
for batch_num, batch_start in enumerate(range(0, len(chunk_tasks), EMBEDDING_BATCH_SIZE), 1):
413420
batch = chunk_tasks[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
421+
422+
# Log batch processing start
423+
logger.info(f"Generating embeddings for {rel_path}: batch {batch_num}/{num_batches} ({len(batch)} chunks)")
424+
414425
embedding_futures = []
415426

416427
for idx, chunk_doc in batch:
@@ -424,6 +435,7 @@ def _process_file_sync(
424435
raise
425436

426437
# Wait for batch to complete and store results
438+
saved_count = 0
427439
for idx, chunk_doc, future in embedding_futures:
428440
try:
429441
emb = future.result() # This will re-raise any exception from the worker
@@ -439,6 +451,7 @@ def _process_file_sync(
439451
try:
440452
_load_sqlite_vector_extension(conn2)
441453
_insert_chunk_vector_with_retry(conn2, fid, rel_path, idx, emb)
454+
saved_count += 1
442455
finally:
443456
conn2.close()
444457
embedded_any = True
@@ -454,6 +467,9 @@ def _process_file_sync(
454467
print(err_content)
455468
except Exception:
456469
logger.exception("Failed to write empty-embedding error to disk for %s chunk %d", rel_path, idx)
470+
471+
# Log batch completion
472+
logger.info(f"Saved {saved_count}/{len(batch)} embeddings for {rel_path} batch {batch_num}/{num_batches}")
457473

458474
return {"stored": True, "embedded": embedded_any, "skipped": False}
459475
except Exception:
@@ -514,7 +530,12 @@ def analyze_local_path_sync(
514530
continue
515531
file_paths.append({"full": full, "rel": rel})
516532

517-
logger.info(f"Found {len(file_paths)} files to process")
533+
total_files = len(file_paths)
534+
logger.info(f"Found {total_files} files to process")
535+
536+
# Thread-safe counter for progress tracking
537+
processed_count = []
538+
processed_lock = threading.Lock()
518539

519540
# Process files in chunks to avoid too many futures at once.
520541
CHUNK_SUBMIT = 256
@@ -530,19 +551,28 @@ def analyze_local_path_sync(
530551
f["rel"],
531552
cfg,
532553
incremental,
554+
processed_count,
555+
total_files,
533556
)
534557
futures.append(fut)
535558

536559
for fut in concurrent.futures.as_completed(futures):
537560
try:
538561
r = fut.result()
562+
with processed_lock:
563+
processed_count.append(1)
539564
if isinstance(r, dict):
540565
if r.get("stored"):
541566
file_count += 1
542567
if r.get("embedded"):
543568
emb_count += 1
544569
if r.get("skipped"):
545570
skipped_count += 1
571+
572+
# Log periodic progress updates (every 10 files)
573+
current_processed = len(processed_count)
574+
if current_processed % 10 == 0:
575+
logger.info(f"Progress: {current_processed}/{total_files} files processed ({file_count} stored, {emb_count} with embeddings, {skipped_count} skipped)")
546576
except Exception:
547577
logger.exception("A per-file task failed")
548578

0 commit comments

Comments (0)