Bulk generator script tweaks

paldepind · paldepind · commit 1682d9bcecd6 · 2026-01-07T14:03:05.000+01:00
diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -44,6 +44,15 @@ def missing_module(module_name: str) -> None:
 build_dir = pathlib.Path(gitroot, "mad-generation-build")
 
 
+def database_dir_for_project(name: str) -> pathlib.Path:
+    return build_dir / f"{name}-db"
+
+
+def database_for_project_exists(name: str) -> bool:
+    path = database_dir_for_project(name)
+    return path.exists()
+
+
 # A project to generate models for
 Project = TypedDict(
     "Project",
@@ -127,7 +136,7 @@ def run_in_parallel[T, U](
     if not items:
         return []
     max_workers = min(max_workers, len(items))
-    results = [None for _ in range(len(items))]
+    results: List[Optional[U]] = [None for _ in range(len(items))]
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         # Start cloning tasks and keep track of them
         futures = {
@@ -175,7 +184,7 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
 
 def build_database(
     language: str, extractor_options, project: Project, project_dir: str
-) -> str | None:
+) -> bool:
     """
     Build a CodeQL database for a project.
 
@@ -186,12 +195,12 @@ def build_database(
         project_dir: Path to the CodeQL database.
 
     Returns:
-        The path to the created database directory.
+        True if the build was successful, False otherwise.
     """
     name = project["name"]
 
     # Create database directory path
-    database_dir = build_dir / f"{name}-db"
+    database_dir = database_dir_for_project(name)
 
     # Only build the database if it doesn't already exist
     if not database_dir.exists():
@@ -214,16 +223,16 @@ def build_database(
             print(f"Successfully created database at {database_dir}")
         except subprocess.CalledProcessError as e:
             print(f"Failed to create database for {name}: {e}")
-            return None
+            return False
     else:
         print(
             f"Skipping database creation for {name} as it already exists at {database_dir}"
         )
 
-    return database_dir
+    return True
 
 
-def generate_models(config, args, project: Project, database_dir: str) -> None:
+def generate_models(config, args, project: Project) -> None:
     """
     Generate models for a project.
 
@@ -235,6 +244,7 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
     name = project["name"]
     language = config["language"]
 
+    print("\n--- Generating models for project: " + name + " ---")
     generator = mad.Generator(language)
     generator.with_sinks = should_generate_sinks(project)
     generator.with_sources = should_generate_sources(project)
@@ -245,13 +255,13 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
         generator.single_file = name
     else:
         generator.folder = name
-    generator.setenvironment(database=database_dir)
+    generator.setenvironment(database=database_dir_for_project(name))
     generator.run()
 
 
 def build_databases_from_projects(
     language: str, extractor_options, projects: List[Project]
-) -> List[tuple[Project, str | None]]:
+) -> List[tuple[Project, bool]]:
     """
     Build databases for all projects in parallel.
 
@@ -261,7 +271,7 @@ def build_databases_from_projects(
         projects: List of projects to build databases for.
 
     Returns:
-        List of (project_name, database_dir) pairs, where database_dir is None if the build failed.
+        List of (project, success) pairs, where success is False if the build failed.
     """
     # Clone projects in parallel
     print("=== Cloning projects ===")
@@ -333,19 +343,20 @@ def download_dca_databases(
     experiment_names: list[str],
     pat: str,
     projects: List[Project],
-) -> List[tuple[Project, str | None]]:
+):  # -> List[tuple[Project, bool]]:
     """
     Download databases from a DCA experiment.
     Args:
         experiment_names: The names of the DCA experiments to download databases from.
         pat: Personal Access Token for GitHub API authentication.
         projects: List of projects to download databases for.
     Returns:
-        List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
+        None; each downloaded database is moved to the path given by database_dir_for_project.
     """
     print("\n=== Finding projects ===")
     project_map = {project["name"]: project for project in projects}
-    analyzed_databases = {n: None for n in project_map}
+
+    analyzed_databases = {}
     for experiment_name in experiment_names:
         response = get_json_from_github(
             f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -358,26 +369,28 @@ def download_dca_databases(
             artifact_name = analyzed_database["artifact_name"]
             pretty_name = pretty_name_from_artifact_name(artifact_name)
 
-            if not pretty_name in analyzed_databases:
+            if not pretty_name in project_map:
                 print(f"Skipping {pretty_name} as it is not in the list of projects")
                 continue
 
-            if analyzed_databases[pretty_name] is not None:
+            if pretty_name in analyzed_databases:
                 print(
                     f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
                 )
 
             analyzed_databases[pretty_name] = analyzed_database
 
-    not_found = [name for name, db in analyzed_databases.items() if db is None]
+    not_found = [name for name in project_map if name not in analyzed_databases]
     if not_found:
         print(
             f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}"
         )
         sys.exit(1)
 
-    def download_and_decompress(analyzed_database: dict) -> str:
+    def download_and_decompress(analyzed_database: dict) -> bool:
         artifact_name = analyzed_database["artifact_name"]
+        pretty_name = pretty_name_from_artifact_name(artifact_name)
+        database_location = database_dir_for_project(pretty_name)
         repository = analyzed_database["repository"]
         run_id = analyzed_database["run_id"]
         print(f"=== Finding artifact: {artifact_name} ===")
@@ -398,33 +411,38 @@ def download_and_decompress(analyzed_database: dict) -> str:
         # First we open the zip file
         with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
             artifact_unzipped_location = build_dir / artifact_name
+
             # clean up any remnants of previous runs
             shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
+            shutil.rmtree(database_location, ignore_errors=True)
+
             # And then we extract it to build_dir/artifact_name
             zip_ref.extractall(artifact_unzipped_location)
             # And then we extract the language tar.gz file inside it
             artifact_tar_location = artifact_unzipped_location / f"{language}.tar.gz"
             with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                 # And we just untar it to the same directory as the zip file
                 tar_ref.extractall(artifact_unzipped_location)
-        ret = artifact_unzipped_location / language
-        print(f"Decompression complete: {ret}")
-        return ret
+        # Move the database to the canonical location
+        shutil.move(artifact_unzipped_location / language, database_location)
+
+        print(f"Decompression complete: {database_location}")
+        return True
 
-    results = run_in_parallel(
+    run_in_parallel(
         download_and_decompress,
         list(analyzed_databases.values()),
         on_error=lambda db, exc: print(
-            f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
+            f"ERROR: Failed to download and decompress {db['artifact_name']}: {exc}"
         ),
         error_summary=lambda failures: print(
             f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
         ),
     )
 
-    print(f"\n=== Fetched {len(results)} databases ===")
+    print(f"\n=== Fetched {len(analyzed_databases.values())} databases ===")
 
-    return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]
+    # return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]
 
 
 def clean_up_mad_destination_for_project(config, name: str):
@@ -460,55 +478,50 @@ def main(config, args) -> None:
     # Create build directory if it doesn't exist
     build_dir.mkdir(parents=True, exist_ok=True)
 
-    database_results = []
-    match get_strategy(config):
-        case "repo":
-            extractor_options = config.get("extractor_options", [])
-            database_results = build_databases_from_projects(
-                language,
-                extractor_options,
-                projects,
-            )
-        case "dca":
-            experiment_names = args.dca
-            if experiment_names is None:
-                print("ERROR: --dca argument is required for DCA strategy")
-                sys.exit(1)
-
-            if args.pat is None:
-                print("ERROR: --pat argument is required for DCA strategy")
-                sys.exit(1)
-            if not args.pat.exists():
-                print(f"ERROR: Personal Access Token file '{pat}' does not exist.")
-                sys.exit(1)
-            with open(args.pat, "r") as f:
-                pat = f.read().strip()
-                database_results = download_dca_databases(
+    # Skip database creation only when --reuse-databases is given and every project already has a database
+    skip_database_creation = args.reuse_databases and all(
+        database_for_project_exists(project["name"]) for project in projects
+    )
+
+    if not skip_database_creation:
+        match get_strategy(config):
+            case "repo":
+                extractor_options = config.get("extractor_options", [])
+                build_databases_from_projects(
                     language,
-                    experiment_names,
-                    pat,
+                    extractor_options,
                     projects,
                 )
-
-    # Generate models for all projects
-    print("\n=== Generating models ===")
-
-    failed_builds = [
-        project["name"] for project, db_dir in database_results if db_dir is None
-    ]
-    if failed_builds:
-        print(
-            f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
-        )
-        sys.exit(1)
+            case "dca":
+                experiment_names = args.dca
+                if experiment_names is None:
+                    print("ERROR: --dca argument is required for DCA strategy")
+                    sys.exit(1)
+
+                if args.pat is None:
+                    print("ERROR: --pat argument is required for DCA strategy")
+                    sys.exit(1)
+                if not args.pat.exists():
+                    print(f"ERROR: Personal Access Token file '{args.pat}' does not exist.")
+                    sys.exit(1)
+                with open(args.pat, "r") as f:
+                    pat = f.read().strip()
+                    download_dca_databases(
+                        language,
+                        experiment_names,
+                        pat,
+                        projects,
+                    )
 
     # clean up existing MaD data for the projects
-    for project, _ in database_results:
+    for project in projects:
         clean_up_mad_destination_for_project(config, project["name"])
 
-    for project, database_dir in database_results:
-        if database_dir is not None:
-            generate_models(config, args, project, database_dir)
+    # Generate models for all projects
+    print("\n=== Generating models ===")
+
+    for project in projects:
+        generate_models(config, args, project)
 
 
 if __name__ == "__main__":
@@ -543,6 +556,11 @@ def main(config, args) -> None:
         help="What `--threads` value to pass to `codeql` (default %(default)s)",
         default=0,
     )
+    parser.add_argument(
+        "--reuse-databases",
+        action="store_true",
+        help="Whether to reuse existing databases instead of rebuilding/redownloading them",
+    )
     args = parser.parse_args()
 
     # Load config file