@@ -326,7 +326,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:
326326
327327
328328def download_dca_databases (
329- experiment_name : str , pat : str , projects : List [Project ]
329+ language : str ,
330+ experiment_name : str ,
331+ pat : str ,
332+ projects : List [Project ],
330333) -> List [tuple [Project , str | None ]]:
331334 """
332335 Download databases from a DCA experiment.
@@ -337,7 +340,6 @@ def download_dca_databases(
337340 Returns:
338341 List of (project, database_dir) pairs, where database_dir is None if the download failed.
339342 """
340- database_results = {}
341343 print ("\n === Finding projects ===" )
342344 response = get_json_from_github (
343345 f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{ experiment_name } /reports/downloads.json" ,
@@ -363,7 +365,7 @@ def download_dca_databases(
363365
364366 artifact_map [pretty_name ] = analyzed_database
365367
366- def download (item : tuple [str , dict ]) -> str :
368+ def download_and_extract (item : tuple [str , dict ]) -> str :
367369 pretty_name , analyzed_database = item
368370 artifact_name = analyzed_database ["artifact_name" ]
369371 repository = analyzed_database ["repository" ]
@@ -391,16 +393,19 @@ def download(item: tuple[str, dict]) -> str:
391393 # And then we extract it to build_dir/artifact_name
392394 zip_ref .extractall (artifact_unzipped_location )
393395 # And then we iterate over the contents of the extracted directory
394- # and extract the tar.gz files inside it
395- for entry in os .listdir (artifact_unzipped_location ):
396- artifact_tar_location = os .path .join (artifact_unzipped_location , entry )
397- with tarfile .open (artifact_tar_location , "r:gz" ) as tar_ref :
398- # And we just untar it to the same directory as the zip file
399- tar_ref .extractall (artifact_unzipped_location )
400- return os .path .join (artifact_unzipped_location , remove_extension (entry ))
396+ # and extract the language tar.gz file inside it
397+ artifact_tar_location = os .path .join (
398+ artifact_unzipped_location , f"{ language } .tar.gz"
399+ )
400+ with tarfile .open (artifact_tar_location , "r:gz" ) as tar_ref :
401+ # And we just untar it to the same directory as the zip file
402+ tar_ref .extractall (artifact_unzipped_location )
403+ ret = os .path .join (artifact_unzipped_location , language )
404+ print (f"Extraction complete: { ret } " )
405+ return ret
401406
402407 results = run_in_parallel (
403- download ,
408+ download_and_extract ,
404409 list (artifact_map .items ()),
405410 on_error = lambda item , exc : print (
406411 f"ERROR: Failed to download database for { item [0 ]} : { exc } "
@@ -410,7 +415,7 @@ def download(item: tuple[str, dict]) -> str:
410415 ),
411416 )
412417
413- print (f"\n === Extracted { len (database_results )} databases ===" )
418+ print (f"\n === Extracted { len (results )} databases ===" )
414419
415420 return [(project_map [n ], r ) for n , r in zip (artifact_map , results )]
416421
@@ -463,7 +468,9 @@ def main(config, args) -> None:
463468 case "repo" :
464469 extractor_options = config .get ("extractor_options" , [])
465470 database_results = build_databases_from_projects (
466- language , extractor_options , projects
471+ language ,
472+ extractor_options ,
473+ projects ,
467474 )
468475 case "dca" :
469476 experiment_name = args .dca
@@ -480,7 +487,10 @@ def main(config, args) -> None:
480487 with open (args .pat , "r" ) as f :
481488 pat = f .read ().strip ()
482489 database_results = download_dca_databases (
483- experiment_name , pat , projects
490+ language ,
491+ experiment_name ,
492+ pat ,
493+ projects ,
484494 )
485495
486496 # Generate models for all projects
0 commit comments