@@ -333,6 +333,8 @@ def _process_file_sync(
333333 rel_path : str ,
334334 cfg : Optional [Dict [str , Any ]],
335335 incremental : bool = True ,
336+ processed_count : Optional [list ] = None ,
337+ total_files : int = 0 ,
336338):
337339 """
338340 Synchronous implementation of per-file processing.
@@ -364,8 +366,12 @@ def _process_file_sync(
364366 logger .debug (f"Skipping unchanged file: { rel_path } " )
365367 return {"stored" : False , "embedded" : False , "skipped" : True }
366368
367- # Log file processing
368- logger .info (f"Processing file: { rel_path } " )
369+ # Log file processing with progress
370+ if processed_count is not None and total_files > 0 :
371+ current = len (processed_count )
372+ logger .info (f"Processing file ({ current } /{ total_files } ): { rel_path } " )
373+ else :
374+ logger .info (f"Processing file: { rel_path } " )
369375
370376 # store file (synchronous DB writer) with metadata
371377 try :
@@ -409,8 +415,13 @@ def _process_file_sync(
409415 chunk_tasks .append ((idx , chunk_doc ))
410416
411417 # Process embeddings in parallel batches for better throughput
412- for batch_start in range (0 , len (chunk_tasks ), EMBEDDING_BATCH_SIZE ):
418+ num_batches = (len (chunk_tasks ) + EMBEDDING_BATCH_SIZE - 1 ) // EMBEDDING_BATCH_SIZE
419+ for batch_num , batch_start in enumerate (range (0 , len (chunk_tasks ), EMBEDDING_BATCH_SIZE ), 1 ):
413420 batch = chunk_tasks [batch_start :batch_start + EMBEDDING_BATCH_SIZE ]
421+
422+ # Log batch processing start
423+ logger .info (f"Generating embeddings for { rel_path } : batch { batch_num } /{ num_batches } ({ len (batch )} chunks)" )
424+
414425 embedding_futures = []
415426
416427 for idx , chunk_doc in batch :
@@ -424,6 +435,7 @@ def _process_file_sync(
424435 raise
425436
426437 # Wait for batch to complete and store results
438+ saved_count = 0
427439 for idx , chunk_doc , future in embedding_futures :
428440 try :
429441 emb = future .result () # This will re-raise any exception from the worker
@@ -439,6 +451,7 @@ def _process_file_sync(
439451 try :
440452 _load_sqlite_vector_extension (conn2 )
441453 _insert_chunk_vector_with_retry (conn2 , fid , rel_path , idx , emb )
454+ saved_count += 1
442455 finally :
443456 conn2 .close ()
444457 embedded_any = True
@@ -454,6 +467,9 @@ def _process_file_sync(
454467 print (err_content )
455468 except Exception :
456469 logger .exception ("Failed to write empty-embedding error to disk for %s chunk %d" , rel_path , idx )
470+
471+ # Log batch completion
472+ logger .info (f"Saved { saved_count } /{ len (batch )} embeddings for { rel_path } batch { batch_num } /{ num_batches } " )
457473
458474 return {"stored" : True , "embedded" : embedded_any , "skipped" : False }
459475 except Exception :
@@ -514,7 +530,12 @@ def analyze_local_path_sync(
514530 continue
515531 file_paths .append ({"full" : full , "rel" : rel })
516532
517- logger .info (f"Found { len (file_paths )} files to process" )
533+ total_files = len (file_paths )
534+ logger .info (f"Found { total_files } files to process" )
535+
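536+ # Progress counter: the completion loop appends one entry per successfully completed future (under processed_lock); its length is the number of files finished so far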
537+ processed_count = []
538+ processed_lock = threading .Lock ()
518539
519540 # Process files in chunks to avoid too many futures at once.
520541 CHUNK_SUBMIT = 256
@@ -530,19 +551,28 @@ def analyze_local_path_sync(
530551 f ["rel" ],
531552 cfg ,
532553 incremental ,
554+ processed_count ,
555+ total_files ,
533556 )
534557 futures .append (fut )
535558
536559 for fut in concurrent .futures .as_completed (futures ):
537560 try :
538561 r = fut .result ()
562+ with processed_lock :
563+ processed_count .append (1 )
539564 if isinstance (r , dict ):
540565 if r .get ("stored" ):
541566 file_count += 1
542567 if r .get ("embedded" ):
543568 emb_count += 1
544569 if r .get ("skipped" ):
545570 skipped_count += 1
571+
572+ # Log periodic progress updates (every 10 files)
573+ current_processed = len (processed_count )
574+ if current_processed % 10 == 0 :
575+ logger .info (f"Progress: { current_processed } /{ total_files } files processed ({ file_count } stored, { emb_count } with embeddings, { skipped_count } skipped)" )
546576 except Exception :
547577 logger .exception ("A per-file task failed" )
548578
0 commit comments