From cb0b566588d6967740429c5fe37fb47d4d3b5901 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Thu, 29 May 2025 16:29:19 +0100
Subject: [PATCH 01/22] C++: Put autogenerated models in the same folder
 structure as Rust.

---
 cpp/ql/lib/ext/generated/{ => openssl}/openssl.model.yml | 0
 cpp/ql/lib/ext/generated/{ => sqlite}/sqlite.model.yml   | 0
 cpp/ql/lib/qlpack.yml                                    | 2 +-
 3 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/ql/lib/ext/generated/{ => openssl}/openssl.model.yml (100%)
 rename cpp/ql/lib/ext/generated/{ => sqlite}/sqlite.model.yml (100%)

diff --git a/cpp/ql/lib/ext/generated/openssl.model.yml b/cpp/ql/lib/ext/generated/openssl/openssl.model.yml
similarity index 100%
rename from cpp/ql/lib/ext/generated/openssl.model.yml
rename to cpp/ql/lib/ext/generated/openssl/openssl.model.yml
diff --git a/cpp/ql/lib/ext/generated/sqlite.model.yml b/cpp/ql/lib/ext/generated/sqlite/sqlite.model.yml
similarity index 100%
rename from cpp/ql/lib/ext/generated/sqlite.model.yml
rename to cpp/ql/lib/ext/generated/sqlite/sqlite.model.yml
diff --git a/cpp/ql/lib/qlpack.yml b/cpp/ql/lib/qlpack.yml
index e15623e2ddb9..ef2d81c4f84c 100644
--- a/cpp/ql/lib/qlpack.yml
+++ b/cpp/ql/lib/qlpack.yml
@@ -17,7 +17,7 @@ dependencies:
   codeql/xml: ${workspace}
 dataExtensions:
   - ext/*.model.yml
-  - ext/generated/*.model.yml
+  - ext/generated/**/*.model.yml
   - ext/deallocation/*.model.yml
   - ext/allocation/*.model.yml
 warnOnImplicitThis: true

From 40d937a2eb7a3fce4f47c4f736af948f2c19d717 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Thu, 29 May 2025 17:07:25 +0100
Subject: [PATCH 02/22] Bulk generator: Some imports we will need.

---
 misc/scripts/models-as-data/rust_bulk_generate_mad.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
index 76d67b1fba15..c9ed1e2540e3 100644
--- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
@@ -10,6 +10,12 @@
 from typing import NotRequired, TypedDict, List
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import time
+import argparse
+import json
+import requests
+import zipfile
+import tarfile
+from functools import cmp_to_key
 
 import generate_mad as mad
 

From b87ba31c434f7fd59dd094053d291153c7baddf7 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Thu, 29 May 2025 17:08:57 +0100
Subject: [PATCH 03/22] Bulk generator: Get rid of the hardcoded project list
 and move it into a configuration file.

---
 .../models-as-data/rust_bulk_generate_mad.py  | 70 -------------------
 rust/misc/bulk_generation_targets.json        | 69 ++++++++++++++++++
 2 files changed, 69 insertions(+), 70 deletions(-)
 create mode 100644 rust/misc/bulk_generation_targets.json

diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
index c9ed1e2540e3..48c5c362fec5 100644
--- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
@@ -47,76 +47,6 @@ class Project(TypedDict):
     git_tag: NotRequired[str]
 
 
-# List of Rust projects to generate models for.
-projects: List[Project] = [
-    {
-        "name": "libc",
-        "git_repo": "https://github.com/rust-lang/libc",
-        "git_tag": "0.2.172",
-    },
-    {
-        "name": "log",
-        "git_repo": "https://github.com/rust-lang/log",
-        "git_tag": "0.4.27",
-    },
-    {
-        "name": "memchr",
-        "git_repo": "https://github.com/BurntSushi/memchr",
-        "git_tag": "2.7.4",
-    },
-    {
-        "name": "once_cell",
-        "git_repo": "https://github.com/matklad/once_cell",
-        "git_tag": "v1.21.3",
-    },
-    {
-        "name": "rand",
-        "git_repo": "https://github.com/rust-random/rand",
-        "git_tag": "0.9.1",
-    },
-    {
-        "name": "smallvec",
-        "git_repo": "https://github.com/servo/rust-smallvec",
-        "git_tag": "v1.15.0",
-    },
-    {
-        "name": "serde",
-        "git_repo": "https://github.com/serde-rs/serde",
-        "git_tag": "v1.0.219",
-    },
-    {
-        "name": "tokio",
-        "git_repo": "https://github.com/tokio-rs/tokio",
-        "git_tag": "tokio-1.45.0",
-    },
-    {
-        "name": "reqwest",
-        "git_repo": "https://github.com/seanmonstar/reqwest",
-        "git_tag": "v0.12.15",
-    },
-    {
-        "name": "rocket",
-        "git_repo": "https://github.com/SergioBenitez/Rocket",
-        "git_tag": "v0.5.1",
-    },
-    {
-        "name": "actix-web",
-        "git_repo": "https://github.com/actix/actix-web",
-        "git_tag": "web-v4.11.0",
-    },
-    {
-        "name": "hyper",
-        "git_repo": "https://github.com/hyperium/hyper",
-        "git_tag": "v1.6.0",
-    },
-    {
-        "name": "clap",
-        "git_repo": "https://github.com/clap-rs/clap",
-        "git_tag": "v4.5.38",
-    },
-]
-
-
 def clone_project(project: Project) -> str:
     """
     Shallow clone a project into the build directory.
diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json
new file mode 100644
index 000000000000..e7efddfe5b85
--- /dev/null
+++ b/rust/misc/bulk_generation_targets.json
@@ -0,0 +1,69 @@
+{
+    "targets": [
+        {
+            "name": "libc",
+            "git_repo": "https://github.com/rust-lang/libc",
+            "git_tag": "0.2.172"
+        },
+        {
+            "name": "log",
+            "git_repo": "https://github.com/rust-lang/log",
+            "git_tag": "0.4.27"
+        },
+        {
+            "name": "memchr",
+            "git_repo": "https://github.com/BurntSushi/memchr",
+            "git_tag": "2.7.4"
+        },
+        {
+            "name": "once_cell",
+            "git_repo": "https://github.com/matklad/once_cell",
+            "git_tag": "v1.21.3"
+        },
+        {
+            "name": "rand",
+            "git_repo": "https://github.com/rust-random/rand",
+            "git_tag": "0.9.1"
+        },
+        {
+            "name": "smallvec",
+            "git_repo": "https://github.com/servo/rust-smallvec",
+            "git_tag": "v1.15.0"
+        },
+        {
+            "name": "serde",
+            "git_repo": "https://github.com/serde-rs/serde",
+            "git_tag": "v1.0.219"
+        },
+        {
+            "name": "tokio",
+            "git_repo": "https://github.com/tokio-rs/tokio",
+            "git_tag": "tokio-1.45.0"
+        },
+        {
+            "name": "reqwest",
+            "git_repo": "https://github.com/seanmonstar/reqwest",
+            "git_tag": "v0.12.15"
+        },
+        {
+            "name": "rocket",
+            "git_repo": "https://github.com/SergioBenitez/Rocket",
+            "git_tag": "v0.5.1"
+        },
+        {
+            "name": "actix-web",
+            "git_repo": "https://github.com/actix/actix-web",
+            "git_tag": "web-v4.11.0"
+        },
+        {
+            "name": "hyper",
+            "git_repo": "https://github.com/hyperium/hyper",
+            "git_tag": "v1.6.0"
+        },
+        {
+            "name": "clap",
+            "git_repo": "https://github.com/clap-rs/clap",
+            "git_tag": "v4.5.38"
+        }
+    ]
+}
\ No newline at end of file

From 6ff2bebbc2e939d82917a2b00a9d10ffef4f14f2 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Thu, 29 May 2025 17:11:20 +0100
Subject: [PATCH 04/22] Bulk generator: Add command-line arguments.

---
 .../models-as-data/rust_bulk_generate_mad.py  | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
index 48c5c362fec5..48d7bf68d418 100644
--- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
@@ -268,4 +268,24 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True)
+    parser.add_argument("--lang", type=str, help="The language to generate models for", required=True)
+    parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False)
+    parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False)
+    parser.add_argument("--with-summaries", action="store_true", help="Generate sinks", required=False)
+    args = parser.parse_args()
+
+    # Load config file
+    config = {}
+    if not os.path.exists(args.config):
+        print(f"ERROR: Config file '{args.config}' does not exist.")
+        sys.exit(1)
+    try:
+        with open(args.config, "r") as f:
+            config = json.load(f)
+    except json.JSONDecodeError as e:
+        print(f"ERROR: Failed to parse JSON file {args.config}: {e}")
+        sys.exit(1)
+
+    main(config, args)

From e721fc07aaef1ef91bff825dc95aa5c18176e3b4 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Thu, 29 May 2025 17:17:15 +0100
Subject: [PATCH 05/22] Bulk generator: Prepare for adding DCA support. This
 commit just generalizes the existing functionality to be independent of Rust
 and instead depend on the configuration file and the command-line arguments.

---
 .../models-as-data/rust_bulk_generate_mad.py  | 103 +++++++++++-------
 rust/misc/bulk_generation_targets.json        |   5 +
 2 files changed, 68 insertions(+), 40 deletions(-)

diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
index 48d7bf68d418..0524a7179758 100644
--- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
@@ -26,15 +26,10 @@
 )
 build_dir = os.path.join(gitroot, "mad-generation-build")
 
-
-def path_to_mad_directory(language: str, name: str) -> str:
-    return os.path.join(gitroot, f"{language}/ql/lib/ext/generated/{name}")
-
-
 # A project to generate models for
 class Project(TypedDict):
     """
-    Type definition for Rust projects to model.
+    Type definition for projects (acquired via a GitHub repo) to model.
 
     Attributes:
         name: The name of the project
@@ -139,13 +134,15 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
     return project_dirs
 
 
-def build_database(project: Project, project_dir: str) -> str | None:
+def build_database(language: str, extractor_options, project: Project, project_dir: str) -> str | None:
     """
     Build a CodeQL database for a project.
 
     Args:
+        language: The language for which to build the database (e.g., "rust").
+        extractor_options: Additional options for the extractor.
         project: A dictionary containing project information with 'name' and 'git_repo' keys.
-        project_dir: The directory containing the project source code.
+        project_dir: Path to the CodeQL database.
 
     Returns:
         The path to the created database directory.
@@ -158,17 +155,17 @@ def build_database(project: Project, project_dir: str) -> str | None:
     # Only build the database if it doesn't already exist
     if not os.path.exists(database_dir):
         print(f"Building CodeQL database for {name}...")
+        extractor_options = [option for x in extractor_options for option in ("-O", x)]
         try:
             subprocess.check_call(
                 [
                     "codeql",
                     "database",
                     "create",
-                    "--language=rust",
+                    f"--language={language}",
                     "--source-root=" + project_dir,
                     "--overwrite",
-                    "-O",
-                    "cargo_features='*'",
+                    *extractor_options,
                     "--",
                     database_dir,
                 ]
@@ -184,40 +181,72 @@ def build_database(project: Project, project_dir: str) -> str | None:
 
     return database_dir
 
-
-def generate_models(project: Project, database_dir: str) -> None:
+def generate_models(args, name: str, database_dir: str) -> None:
     """
     Generate models for a project.
 
     Args:
-        project: A dictionary containing project information with 'name' and 'git_repo' keys.
-        project_dir: The directory containing the project source code.
+        args: Command line arguments passed to this script.
+        name: The name of the project.
+        database_dir: Path to the CodeQL database.
     """
-    name = project["name"]
 
-    generator = mad.Generator("rust")
-    generator.generateSinks = True
-    generator.generateSources = True
-    generator.generateSummaries = True
+    generator = mad.Generator(args.lang)
+    generator.generateSinks = args.with_sinks
+    generator.generateSources = args.with_sources
+    generator.generateSummaries = args.with_summaries
     generator.setenvironment(database=database_dir, folder=name)
     generator.run()
 
+def build_databases_from_projects(language: str, extractor_options, projects: List[Project]) -> List[tuple[str, str | None]]:
+    """
+    Build databases for all projects in parallel.
+
+    Args:
+        language: The language for which to build the databases (e.g., "rust").
+        extractor_options: Additional options for the extractor.
+        projects: List of projects to build databases for.
+
+    Returns:
+        List of (project_name, database_dir) pairs, where database_dir is None if the build failed.
+    """
+    # Phase 1: Clone projects in parallel
+    print("=== Phase 1: Cloning projects ===")
+    project_dirs = clone_projects(projects)
+
+    # Phase 2: Build databases for all projects
+    print("\n=== Phase 2: Building databases ===")
+    database_results = [
+        (project["name"], build_database(language, extractor_options, project, project_dir))
+        for project, project_dir in project_dirs
+    ]
+    return database_results
+      
+def get_destination_for_project(config, name: str) -> str:
+    return os.path.join(config["destination"], name)
+
+def get_strategy(config) -> str:
+    return config["strategy"].lower()
 
-def main() -> None:
+def main(config, args) -> None:
     """
-    Process all projects in three distinct phases:
-    1. Clone projects (in parallel)
-    2. Build databases for projects
-    3. Generate models for successful database builds
+    Main function to handle the bulk generation of MaD models.
+    Args:
+        config: Configuration dictionary containing project details and other settings.
+        args: Command line arguments passed to this script.
     """
 
+    projects = config["targets"]
+    destination = config["destination"]
+    language = args.lang
+
     # Create build directory if it doesn't exist
     if not os.path.exists(build_dir):
         os.makedirs(build_dir)
 
     # Check if any of the MaD directories contain working directory changes in git
     for project in projects:
-        mad_dir = path_to_mad_directory("rust", project["name"])
+        mad_dir = get_destination_for_project(config, project["name"])
         if os.path.exists(mad_dir):
             git_status_output = subprocess.check_output(
                 ["git", "status", "-s", mad_dir], text=True
@@ -232,22 +261,17 @@ def main() -> None:
                 )
                 sys.exit(1)
 
-    # Phase 1: Clone projects in parallel
-    print("=== Phase 1: Cloning projects ===")
-    project_dirs = clone_projects(projects)
-
-    # Phase 2: Build databases for all projects
-    print("\n=== Phase 2: Building databases ===")
-    database_results = [
-        (project, build_database(project, project_dir))
-        for project, project_dir in project_dirs
-    ]
+    database_results = []
+    match get_strategy(config):
+        case "repo":
+            extractor_options = config.get("extractor_options", [])
+            database_results = build_databases_from_projects(language, extractor_options, projects)
 
     # Phase 3: Generate models for all projects
     print("\n=== Phase 3: Generating models ===")
 
     failed_builds = [
-        project["name"] for project, db_dir in database_results if db_dir is None
+        project for project, db_dir in database_results if db_dir is None
     ]
     if failed_builds:
         print(
@@ -257,15 +281,14 @@ def main() -> None:
 
     # Delete the MaD directory for each project
     for project, database_dir in database_results:
-        mad_dir = path_to_mad_directory("rust", project["name"])
+        mad_dir = get_destination_for_project(config, project)
         if os.path.exists(mad_dir):
             print(f"Deleting existing MaD directory at {mad_dir}")
             subprocess.check_call(["rm", "-rf", mad_dir])
 
     for project, database_dir in database_results:
         if database_dir is not None:
-            generate_models(project, database_dir)
-
+            generate_models(args, project, database_dir)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json
index e7efddfe5b85..ca30b76eb12e 100644
--- a/rust/misc/bulk_generation_targets.json
+++ b/rust/misc/bulk_generation_targets.json
@@ -1,4 +1,5 @@
 {
+    "strategy": "repo",
     "targets": [
         {
             "name": "libc",
@@ -65,5 +66,9 @@
             "git_repo": "https://github.com/clap-rs/clap",
             "git_tag": "v4.5.38"
         }
+    ],
+    "destination": "rust/ql/lib/ext/generated",
+    "extractor_options": [
+        "cargo_features='*'"
     ]
 }
\ No newline at end of file

From 5051790e24d2fceb7902fbf4dac06d16f4afd4aa Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Thu, 29 May 2025 17:20:18 +0100
Subject: [PATCH 06/22] Bulk generator: Add DCA support.

---
 cpp/misc/bulk_generation_targets.json         |   8 ++
 .../models-as-data/rust_bulk_generate_mad.py  | 122 +++++++++++++++++-
 2 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 cpp/misc/bulk_generation_targets.json

diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json
new file mode 100644
index 000000000000..5f74b094d35a
--- /dev/null
+++ b/cpp/misc/bulk_generation_targets.json
@@ -0,0 +1,8 @@
+{
+  "strategy": "dca",
+  "targets": [
+      { "name": "openssl" },
+      { "name": "sqlite" }
+  ],
+  "destination": "cpp/ql/lib/ext/generated"
+}
\ No newline at end of file
diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
index 0524a7179758..922deaa7627f 100644
--- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py
@@ -1,7 +1,5 @@
 """
 Experimental script for bulk generation of MaD models based on a list of projects.
-
-Currently the script only targets Rust.
 """
 
 import os.path
@@ -221,6 +219,114 @@ def build_databases_from_projects(language: str, extractor_options, projects: Li
         for project, project_dir in project_dirs
     ]
     return database_results
+
+def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
+    """
+    Download a JSON file from GitHub using a personal access token (PAT).
+    Args:
+        url: The URL to download the JSON file from.
+        pat: Personal Access Token for GitHub API authentication.
+        extra_headers: Additional headers to include in the request.
+    Returns:
+        The JSON response as a dictionary.
+    """
+    headers = { "Authorization": f"token {pat}" } | extra_headers
+    response = requests.get(url, headers=headers)
+    if response.status_code != 200:
+        print(f"Failed to download JSON: {response.status_code} {response.text}")
+        sys.exit(1)
+    else:
+        return response.json()
+
+def download_artifact(url: str, artifact_name: str, pat: str) -> str:
+    """
+    Download a GitHub Actions artifact from a given URL.
+    Args:
+        url: The URL to download the artifact from.
+        artifact_name: The name of the artifact (used for naming the downloaded file).
+        pat: Personal Access Token for GitHub API authentication.
+    Returns:
+        The path to the downloaded artifact file.
+    """
+    headers = { "Authorization": f"token {pat}", "Accept": "application/vnd.github+json" }
+    response = requests.get(url, stream=True, headers=headers)
+    zipName = artifact_name + ".zip"
+    if response.status_code == 200:
+        target_zip = os.path.join(build_dir, zipName)
+        with open(target_zip, "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                file.write(chunk)
+        print(f"Download complete: {target_zip}")
+        return target_zip
+    else:
+        print(f"Failed to download file. Status code: {response.status_code}")
+        sys.exit(1)
+
+def remove_extension(filename: str) -> str:
+    while "." in filename:
+        filename, _ = os.path.splitext(filename)
+    return filename
+
+def pretty_name_from_artifact_name(artifact_name: str) -> str:
+    return artifact_name.split("___")[1]
+
+def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tuple[str, str | None]]:
+    """
+    Download databases from a DCA experiment.
+    Args:
+        experiment_name: The name of the DCA experiment to download databases from.
+        pat: Personal Access Token for GitHub API authentication.
+        projects: List of projects to download databases for.
+    Returns:
+        List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
+    """
+    database_results = []
+    print("\n=== Finding projects ===")
+    response = github(f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", pat)
+    targets = response["targets"]
+    for target, data in targets.items():
+      downloads = data["downloads"]
+      analyzed_database = downloads["analyzed_database"]
+      artifact_name = analyzed_database["artifact_name"]
+      pretty_name = pretty_name_from_artifact_name(artifact_name)
+
+      if not pretty_name in [project["name"] for project in projects]:
+        print(f"Skipping {pretty_name} as it is not in the list of projects")
+        continue
+
+      repository = analyzed_database["repository"]
+      run_id = analyzed_database["run_id"]
+      print(f"=== Finding artifact: {artifact_name} ===")
+      response = github(f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", pat, { "Accept": "application/vnd.github+json" })
+      artifacts = response["artifacts"]
+      artifact_map = {artifact["name"]: artifact for artifact in artifacts}
+      print(f"=== Downloading artifact: {artifact_name} ===")
+      archive_download_url = artifact_map[artifact_name]["archive_download_url"]
+      artifact_zip_location = download_artifact(archive_download_url, artifact_name, pat)
+      print(f"=== Extracting artifact: {artifact_name} ===")
+      # The database is in a zip file, which contains a tar.gz file with the DB
+      # First we open the zip file
+      with zipfile.ZipFile(artifact_zip_location, 'r') as zip_ref:
+        artifact_unzipped_location = os.path.join(build_dir, artifact_name)
+        # And then we extract it to build_dir/artifact_name
+        zip_ref.extractall(artifact_unzipped_location)
+        # And then we iterate over the contents of the extracted directory
+        # and extract the tar.gz files inside it
+        for entry in os.listdir(artifact_unzipped_location):
+            artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
+            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
+                # And we just untar it to the same directory as the zip file
+                tar_ref.extractall(artifact_unzipped_location)
+                database_results.append((pretty_name, os.path.join(artifact_unzipped_location, remove_extension(entry))))
+    print(f"\n=== Extracted {len(database_results)} databases ===")
+
+    def compare(a, b):
+        a_index = next(i for i, project in enumerate(projects) if project["name"] == a[0])
+        b_index = next(i for i, project in enumerate(projects) if project["name"] == b[0])
+        return a_index - b_index
+
+    # Sort the database results based on the order in the projects file
+    return sorted(database_results, key=cmp_to_key(compare))
       
 def get_destination_for_project(config, name: str) -> str:
     return os.path.join(config["destination"], name)
@@ -266,6 +372,16 @@ def main(config, args) -> None:
         case "repo":
             extractor_options = config.get("extractor_options", [])
             database_results = build_databases_from_projects(language, extractor_options, projects)
+        case "dca":
+            experiment_name = args.dca
+            if experiment_name is None:
+                print("ERROR: --dca argument is required for DCA strategy")
+                sys.exit(1)
+            pat = args.pat
+            if pat is None:
+                print("ERROR: --pat argument is required for DCA strategy")
+                sys.exit(1)
+            database_results = download_dca_databases(experiment_name, pat, projects)
 
     # Phase 3: Generate models for all projects
     print("\n=== Phase 3: Generating models ===")
@@ -293,6 +409,8 @@ def main(config, args) -> None:
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True)
+    parser.add_argument("--dca", type=str, help="Name of a DCA run that built all the projects", required=False)
+    parser.add_argument("--pat", type=str, help="PAT token to grab DCA databases (the same as the one you use for DCA)", required=False)
     parser.add_argument("--lang", type=str, help="The language to generate models for", required=True)
     parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False)
     parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False)

From cb938701a14d693733c5759604c37246d391153e Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Thu, 29 May 2025 17:48:19 +0100
Subject: [PATCH 07/22] Bulk generator: Rename file since it is no longer Rust
 specific.

---
 .../{rust_bulk_generate_mad.py => bulk_generate_mad.py}           | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename misc/scripts/models-as-data/{rust_bulk_generate_mad.py => bulk_generate_mad.py} (100%)

diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
similarity index 100%
rename from misc/scripts/models-as-data/rust_bulk_generate_mad.py
rename to misc/scripts/models-as-data/bulk_generate_mad.py

From 7ecf8c8ea2d11f0a9bfcca86a55b12827e38314c Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 11:38:20 +0100
Subject: [PATCH 08/22] Bulk generator: Format file and add a note at the top
 of the file specifying the formatting requirements.

---
 .../models-as-data/bulk_generate_mad.py       | 171 ++++++++++++------
 1 file changed, 117 insertions(+), 54 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 922deaa7627f..9490ef44c45c 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -1,5 +1,7 @@
 """
 Experimental script for bulk generation of MaD models based on a list of projects.
+
+Note: This file must be formatted using the Black Python formatter.
 """
 
 import os.path
@@ -24,6 +26,7 @@
 )
 build_dir = os.path.join(gitroot, "mad-generation-build")
 
+
 # A project to generate models for
 class Project(TypedDict):
     """
@@ -132,7 +135,9 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
     return project_dirs
 
 
-def build_database(language: str, extractor_options, project: Project, project_dir: str) -> str | None:
+def build_database(
+    language: str, extractor_options, project: Project, project_dir: str
+) -> str | None:
     """
     Build a CodeQL database for a project.
 
@@ -179,6 +184,7 @@ def build_database(language: str, extractor_options, project: Project, project_d
 
     return database_dir
 
+
 def generate_models(args, name: str, database_dir: str) -> None:
     """
     Generate models for a project.
@@ -196,7 +202,10 @@ def generate_models(args, name: str, database_dir: str) -> None:
     generator.setenvironment(database=database_dir, folder=name)
     generator.run()
 
-def build_databases_from_projects(language: str, extractor_options, projects: List[Project]) -> List[tuple[str, str | None]]:
+
+def build_databases_from_projects(
+    language: str, extractor_options, projects: List[Project]
+) -> List[tuple[str, str | None]]:
     """
     Build databases for all projects in parallel.
 
@@ -215,11 +224,15 @@ def build_databases_from_projects(language: str, extractor_options, projects: Li
     # Phase 2: Build databases for all projects
     print("\n=== Phase 2: Building databases ===")
     database_results = [
-        (project["name"], build_database(language, extractor_options, project, project_dir))
+        (
+            project["name"],
+            build_database(language, extractor_options, project, project_dir),
+        )
         for project, project_dir in project_dirs
     ]
     return database_results
 
+
 def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
     """
     Download a JSON file from GitHub using a personal access token (PAT).
@@ -230,7 +243,7 @@ def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
     Returns:
         The JSON response as a dictionary.
     """
-    headers = { "Authorization": f"token {pat}" } | extra_headers
+    headers = {"Authorization": f"token {pat}"} | extra_headers
     response = requests.get(url, headers=headers)
     if response.status_code != 200:
         print(f"Failed to download JSON: {response.status_code} {response.text}")
@@ -238,6 +251,7 @@ def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
     else:
         return response.json()
 
+
 def download_artifact(url: str, artifact_name: str, pat: str) -> str:
     """
     Download a GitHub Actions artifact from a given URL.
@@ -248,7 +262,7 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
     Returns:
         The path to the downloaded artifact file.
     """
-    headers = { "Authorization": f"token {pat}", "Accept": "application/vnd.github+json" }
+    headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"}
     response = requests.get(url, stream=True, headers=headers)
     zipName = artifact_name + ".zip"
     if response.status_code == 200:
@@ -262,15 +276,20 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
         print(f"Failed to download file. Status code: {response.status_code}")
         sys.exit(1)
 
+
 def remove_extension(filename: str) -> str:
     while "." in filename:
         filename, _ = os.path.splitext(filename)
     return filename
 
+
 def pretty_name_from_artifact_name(artifact_name: str) -> str:
     return artifact_name.split("___")[1]
 
-def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tuple[str, str | None]]:
+
+def download_dca_databases(
+    experiment_name: str, pat: str, projects
+) -> List[tuple[str, str | None]]:
     """
     Download databases from a DCA experiment.
     Args:
@@ -282,58 +301,81 @@ def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tup
     """
     database_results = []
     print("\n=== Finding projects ===")
-    response = github(f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", pat)
+    response = github(
+        f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
+        pat,
+    )
     targets = response["targets"]
     for target, data in targets.items():
-      downloads = data["downloads"]
-      analyzed_database = downloads["analyzed_database"]
-      artifact_name = analyzed_database["artifact_name"]
-      pretty_name = pretty_name_from_artifact_name(artifact_name)
-
-      if not pretty_name in [project["name"] for project in projects]:
-        print(f"Skipping {pretty_name} as it is not in the list of projects")
-        continue
-
-      repository = analyzed_database["repository"]
-      run_id = analyzed_database["run_id"]
-      print(f"=== Finding artifact: {artifact_name} ===")
-      response = github(f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", pat, { "Accept": "application/vnd.github+json" })
-      artifacts = response["artifacts"]
-      artifact_map = {artifact["name"]: artifact for artifact in artifacts}
-      print(f"=== Downloading artifact: {artifact_name} ===")
-      archive_download_url = artifact_map[artifact_name]["archive_download_url"]
-      artifact_zip_location = download_artifact(archive_download_url, artifact_name, pat)
-      print(f"=== Extracting artifact: {artifact_name} ===")
-      # The database is in a zip file, which contains a tar.gz file with the DB
-      # First we open the zip file
-      with zipfile.ZipFile(artifact_zip_location, 'r') as zip_ref:
-        artifact_unzipped_location = os.path.join(build_dir, artifact_name)
-        # And then we extract it to build_dir/artifact_name
-        zip_ref.extractall(artifact_unzipped_location)
-        # And then we iterate over the contents of the extracted directory
-        # and extract the tar.gz files inside it
-        for entry in os.listdir(artifact_unzipped_location):
-            artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
-            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
-                # And we just untar it to the same directory as the zip file
-                tar_ref.extractall(artifact_unzipped_location)
-                database_results.append((pretty_name, os.path.join(artifact_unzipped_location, remove_extension(entry))))
+        downloads = data["downloads"]
+        analyzed_database = downloads["analyzed_database"]
+        artifact_name = analyzed_database["artifact_name"]
+        pretty_name = pretty_name_from_artifact_name(artifact_name)
+
+        if not pretty_name in [project["name"] for project in projects]:
+            print(f"Skipping {pretty_name} as it is not in the list of projects")
+            continue
+
+        repository = analyzed_database["repository"]
+        run_id = analyzed_database["run_id"]
+        print(f"=== Finding artifact: {artifact_name} ===")
+        response = github(
+            f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts",
+            pat,
+            {"Accept": "application/vnd.github+json"},
+        )
+        artifacts = response["artifacts"]
+        artifact_map = {artifact["name"]: artifact for artifact in artifacts}
+        print(f"=== Downloading artifact: {artifact_name} ===")
+        archive_download_url = artifact_map[artifact_name]["archive_download_url"]
+        artifact_zip_location = download_artifact(
+            archive_download_url, artifact_name, pat
+        )
+        print(f"=== Extracting artifact: {artifact_name} ===")
+        # The database is in a zip file, which contains a tar.gz file with the DB
+        # First we open the zip file
+        with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
+            artifact_unzipped_location = os.path.join(build_dir, artifact_name)
+            # And then we extract it to build_dir/artifact_name
+            zip_ref.extractall(artifact_unzipped_location)
+            # And then we iterate over the contents of the extracted directory
+            # and extract the tar.gz files inside it
+            for entry in os.listdir(artifact_unzipped_location):
+                artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
+                with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
+                    # And we just untar it to the same directory as the zip file
+                    tar_ref.extractall(artifact_unzipped_location)
+                    database_results.append(
+                        (
+                            pretty_name,
+                            os.path.join(
+                                artifact_unzipped_location, remove_extension(entry)
+                            ),
+                        )
+                    )
     print(f"\n=== Extracted {len(database_results)} databases ===")
 
     def compare(a, b):
-        a_index = next(i for i, project in enumerate(projects) if project["name"] == a[0])
-        b_index = next(i for i, project in enumerate(projects) if project["name"] == b[0])
+        a_index = next(
+            i for i, project in enumerate(projects) if project["name"] == a[0]
+        )
+        b_index = next(
+            i for i, project in enumerate(projects) if project["name"] == b[0]
+        )
         return a_index - b_index
 
     # Sort the database results based on the order in the projects file
     return sorted(database_results, key=cmp_to_key(compare))
-      
+
+
 def get_destination_for_project(config, name: str) -> str:
     return os.path.join(config["destination"], name)
 
+
 def get_strategy(config) -> str:
     return config["strategy"].lower()
 
+
 def main(config, args) -> None:
     """
     Main function to handle the bulk generation of MaD models.
@@ -371,7 +413,9 @@ def main(config, args) -> None:
     match get_strategy(config):
         case "repo":
             extractor_options = config.get("extractor_options", [])
-            database_results = build_databases_from_projects(language, extractor_options, projects)
+            database_results = build_databases_from_projects(
+                language, extractor_options, projects
+            )
         case "dca":
             experiment_name = args.dca
             if experiment_name is None:
@@ -386,9 +430,7 @@ def main(config, args) -> None:
     # Phase 3: Generate models for all projects
     print("\n=== Phase 3: Generating models ===")
 
-    failed_builds = [
-        project for project, db_dir in database_results if db_dir is None
-    ]
+    failed_builds = [project for project, db_dir in database_results if db_dir is None]
     if failed_builds:
         print(
             f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
@@ -406,15 +448,36 @@ def main(config, args) -> None:
         if database_dir is not None:
             generate_models(args, project, database_dir)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True)
-    parser.add_argument("--dca", type=str, help="Name of a DCA run that built all the projects", required=False)
-    parser.add_argument("--pat", type=str, help="PAT token to grab DCA databases (the same as the one you use for DCA)", required=False)
-    parser.add_argument("--lang", type=str, help="The language to generate models for", required=True)
-    parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False)
-    parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False)
-    parser.add_argument("--with-summaries", action="store_true", help="Generate sinks", required=False)
+    parser.add_argument(
+        "--config", type=str, help="Path to the configuration file.", required=True
+    )
+    parser.add_argument(
+        "--dca",
+        type=str,
+        help="Name of a DCA run that built all the projects",
+        required=False,
+    )
+    parser.add_argument(
+        "--pat",
+        type=str,
+        help="PAT token to grab DCA databases (the same as the one you use for DCA)",
+        required=False,
+    )
+    parser.add_argument(
+        "--lang", type=str, help="The language to generate models for", required=True
+    )
+    parser.add_argument(
+        "--with-sources", action="store_true", help="Generate sources", required=False
+    )
+    parser.add_argument(
+        "--with-sinks", action="store_true", help="Generate sinks", required=False
+    )
+    parser.add_argument(
+        "--with-summaries", action="store_true", help="Generate sinks", required=False
+    )
     args = parser.parse_args()
 
     # Load config file

From 566bf431d7cd6a193a63eca714c6d577c97207bb Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 11:42:15 +0100
Subject: [PATCH 09/22] Bulk generator: Rename 'github' to
 'get_json_from_github'.

---
 misc/scripts/models-as-data/bulk_generate_mad.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 9490ef44c45c..edb1dd3bfa89 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -233,7 +233,9 @@ def build_databases_from_projects(
     return database_results
 
 
-def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
+def get_json_from_github(
+    url: str, pat: str, extra_headers: dict[str, str] = {}
+) -> dict:
     """
     Download a JSON file from GitHub using a personal access token (PAT).
     Args:
@@ -301,7 +303,7 @@ def download_dca_databases(
     """
     database_results = []
     print("\n=== Finding projects ===")
-    response = github(
+    response = get_json_from_github(
         f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
         pat,
     )
@@ -319,7 +321,7 @@ def download_dca_databases(
         repository = analyzed_database["repository"]
         run_id = analyzed_database["run_id"]
         print(f"=== Finding artifact: {artifact_name} ===")
-        response = github(
+        response = get_json_from_github(
             f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts",
             pat,
             {"Accept": "application/vnd.github+json"},

From b640474a61f75d1bbe6796acbfe7cf55d5b5b84c Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 11:43:30 +0100
Subject: [PATCH 10/22] Bulk generator: Remove 'Phase' part of log message.

---
 misc/scripts/models-as-data/bulk_generate_mad.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index edb1dd3bfa89..cf493d48064a 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -217,12 +217,12 @@ def build_databases_from_projects(
     Returns:
         List of (project_name, database_dir) pairs, where database_dir is None if the build failed.
     """
-    # Phase 1: Clone projects in parallel
-    print("=== Phase 1: Cloning projects ===")
+    # Clone projects in parallel
+    print("=== Cloning projects ===")
     project_dirs = clone_projects(projects)
 
-    # Phase 2: Build databases for all projects
-    print("\n=== Phase 2: Building databases ===")
+    # Build databases for all projects
+    print("\n=== Building databases ===")
     database_results = [
         (
             project["name"],
@@ -429,8 +429,8 @@ def main(config, args) -> None:
                 sys.exit(1)
             database_results = download_dca_databases(experiment_name, pat, projects)
 
-    # Phase 3: Generate models for all projects
-    print("\n=== Phase 3: Generating models ===")
+    # Generate models for all projects
+    print("\n=== Generating models ===")
 
     failed_builds = [project for project, db_dir in database_results if db_dir is None]
     if failed_builds:

From 5d79a8de89fd76aa6537b20230d024156467b566 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 11:48:30 +0100
Subject: [PATCH 11/22] Update misc/scripts/models-as-data/bulk_generate_mad.py

Co-authored-by: Simon Friis Vindum <paldepind@github.com>
---
 misc/scripts/models-as-data/bulk_generate_mad.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index cf493d48064a..6d9b52e2266d 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -387,7 +387,6 @@ def main(config, args) -> None:
     """
 
     projects = config["targets"]
-    destination = config["destination"]
     language = args.lang
 
     # Create build directory if it doesn't exist

From 7c89d6d6dde1ac804f2fb26821623cc90ffe8bf2 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 11:49:48 +0100
Subject: [PATCH 12/22] Bulk generator: Rename 'get_destination_for_project' to
 'get_mad_destination_for_project'.

---
 misc/scripts/models-as-data/bulk_generate_mad.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 6d9b52e2266d..6f75536ade54 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -370,7 +370,7 @@ def compare(a, b):
     return sorted(database_results, key=cmp_to_key(compare))
 
 
-def get_destination_for_project(config, name: str) -> str:
+def get_mad_destination_for_project(config, name: str) -> str:
     return os.path.join(config["destination"], name)
 
 
@@ -395,7 +395,7 @@ def main(config, args) -> None:
 
     # Check if any of the MaD directories contain working directory changes in git
     for project in projects:
-        mad_dir = get_destination_for_project(config, project["name"])
+        mad_dir = get_mad_destination_for_project(config, project["name"])
         if os.path.exists(mad_dir):
             git_status_output = subprocess.check_output(
                 ["git", "status", "-s", mad_dir], text=True
@@ -440,7 +440,7 @@ def main(config, args) -> None:
 
     # Delete the MaD directory for each project
     for project, database_dir in database_results:
-        mad_dir = get_destination_for_project(config, project)
+        mad_dir = get_mad_destination_for_project(config, project)
         if os.path.exists(mad_dir):
             print(f"Deleting existing MaD directory at {mad_dir}")
             subprocess.check_call(["rm", "-rf", mad_dir])

From 7121f5c57edbbd70520ace39aa42a502f354190e Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 12:08:42 +0100
Subject: [PATCH 13/22] Bulk generator: Use the 'Project' type throughout the
 file.

---
 .../models-as-data/bulk_generate_mad.py       | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 6f75536ade54..8b5482628748 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -39,7 +39,7 @@ class Project(TypedDict):
     """
 
     name: str
-    git_repo: str
+    git_repo: NotRequired[str]
     git_tag: NotRequired[str]
 
 
@@ -185,7 +185,7 @@ def build_database(
     return database_dir
 
 
-def generate_models(args, name: str, database_dir: str) -> None:
+def generate_models(args, project: Project, database_dir: str) -> None:
     """
     Generate models for a project.
 
@@ -194,6 +194,7 @@ def generate_models(args, name: str, database_dir: str) -> None:
         name: The name of the project.
         database_dir: Path to the CodeQL database.
     """
+    name = project["name"]
 
     generator = mad.Generator(args.lang)
     generator.generateSinks = args.with_sinks
@@ -205,7 +206,7 @@ def generate_models(args, name: str, database_dir: str) -> None:
 
 def build_databases_from_projects(
     language: str, extractor_options, projects: List[Project]
-) -> List[tuple[str, str | None]]:
+) -> List[tuple[Project, str | None]]:
     """
     Build databases for all projects in parallel.
 
@@ -225,7 +226,7 @@ def build_databases_from_projects(
     print("\n=== Building databases ===")
     database_results = [
         (
-            project["name"],
+            project,
             build_database(language, extractor_options, project, project_dir),
         )
         for project, project_dir in project_dirs
@@ -290,8 +291,8 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:
 
 
 def download_dca_databases(
-    experiment_name: str, pat: str, projects
-) -> List[tuple[str, str | None]]:
+    experiment_name: str, pat: str, projects: List[Project]
+) -> List[tuple[Project, str | None]]:
     """
     Download databases from a DCA experiment.
     Args:
@@ -308,7 +309,7 @@ def download_dca_databases(
         pat,
     )
     targets = response["targets"]
-    for target, data in targets.items():
+    for data in targets.values():
         downloads = data["downloads"]
         analyzed_database = downloads["analyzed_database"]
         artifact_name = analyzed_database["artifact_name"]
@@ -349,20 +350,21 @@ def download_dca_databases(
                     tar_ref.extractall(artifact_unzipped_location)
                     database_results.append(
                         (
-                            pretty_name,
+                            {"name": pretty_name},
                             os.path.join(
                                 artifact_unzipped_location, remove_extension(entry)
                             ),
                         )
                     )
+
     print(f"\n=== Extracted {len(database_results)} databases ===")
 
     def compare(a, b):
         a_index = next(
-            i for i, project in enumerate(projects) if project["name"] == a[0]
+            i for i, project in enumerate(projects) if project["name"] == a[0]["name"]
         )
         b_index = next(
-            i for i, project in enumerate(projects) if project["name"] == b[0]
+            i for i, project in enumerate(projects) if project["name"] == b[0]["name"]
         )
         return a_index - b_index
 
@@ -431,7 +433,9 @@ def main(config, args) -> None:
     # Generate models for all projects
     print("\n=== Generating models ===")
 
-    failed_builds = [project for project, db_dir in database_results if db_dir is None]
+    failed_builds = [
+        project["name"] for project, db_dir in database_results if db_dir is None
+    ]
     if failed_builds:
         print(
             f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
@@ -440,7 +444,7 @@ def main(config, args) -> None:
 
     # Delete the MaD directory for each project
     for project, database_dir in database_results:
-        mad_dir = get_mad_destination_for_project(config, project)
+        mad_dir = get_mad_destination_for_project(config, project["name"])
         if os.path.exists(mad_dir):
             print(f"Deleting existing MaD directory at {mad_dir}")
             subprocess.check_call(["rm", "-rf", mad_dir])

From fc165db8acb30401569073f8e0564e880749d092 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 12:17:03 +0100
Subject: [PATCH 14/22] Bulk generator: Specify 'with-summaries',
 'with-sources', and 'with-sinks' in the config file.

---
 cpp/misc/bulk_generation_targets.json         |  4 +-
 .../models-as-data/bulk_generate_mad.py       | 38 ++++++-----
 rust/misc/bulk_generation_targets.json        | 65 +++++++++++++++----
 3 files changed, 75 insertions(+), 32 deletions(-)

diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json
index 5f74b094d35a..6cc2223b5e9d 100644
--- a/cpp/misc/bulk_generation_targets.json
+++ b/cpp/misc/bulk_generation_targets.json
@@ -1,8 +1,8 @@
 {
   "strategy": "dca",
   "targets": [
-      { "name": "openssl" },
-      { "name": "sqlite" }
+      { "name": "openssl", "with_summaries": true },
+      { "name": "sqlite", "with_summaries": true }
   ],
   "destination": "cpp/ql/lib/ext/generated"
 }
\ No newline at end of file
diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 8b5482628748..bed3442f7904 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -41,7 +41,18 @@ class Project(TypedDict):
     name: str
     git_repo: NotRequired[str]
     git_tag: NotRequired[str]
+    with_sinks: NotRequired[bool]
+    with_sources: NotRequired[bool]
+    with_summaries: NotRequired[bool]
 
+def shouldGenerateSinks(project: Project) -> bool:
+    return project.get("with_sinks", False)
+
+def shouldGenerateSources(project: Project) -> bool:
+    return project.get("with_sources", False)
+
+def shouldGenerateSummaries(project: Project) -> bool:
+    return project.get("with_summaries", False)
 
 def clone_project(project: Project) -> str:
     """
@@ -185,7 +196,7 @@ def build_database(
     return database_dir
 
 
-def generate_models(args, project: Project, database_dir: str) -> None:
+def generate_models(language: str, config, project: Project, database_dir: str) -> None:
     """
     Generate models for a project.
 
@@ -196,10 +207,11 @@ def generate_models(args, project: Project, database_dir: str) -> None:
     """
     name = project["name"]
 
-    generator = mad.Generator(args.lang)
-    generator.generateSinks = args.with_sinks
-    generator.generateSources = args.with_sources
-    generator.generateSummaries = args.with_summaries
+    generator = mad.Generator(language)
+    # Note: The argument parser converts with-sinks to with_sinks, etc.
+    generator.generateSinks = shouldGenerateSinks(project)
+    generator.generateSources = shouldGenerateSources(project)
+    generator.generateSummaries = shouldGenerateSummaries(project)
     generator.setenvironment(database=database_dir, folder=name)
     generator.run()
 
@@ -309,13 +321,14 @@ def download_dca_databases(
         pat,
     )
     targets = response["targets"]
+    project_map = {project["name"]: project for project in projects}
     for data in targets.values():
         downloads = data["downloads"]
         analyzed_database = downloads["analyzed_database"]
         artifact_name = analyzed_database["artifact_name"]
         pretty_name = pretty_name_from_artifact_name(artifact_name)
 
-        if not pretty_name in [project["name"] for project in projects]:
+        if not pretty_name in project_map:
             print(f"Skipping {pretty_name} as it is not in the list of projects")
             continue
 
@@ -350,7 +363,7 @@ def download_dca_databases(
                     tar_ref.extractall(artifact_unzipped_location)
                     database_results.append(
                         (
-                            {"name": pretty_name},
+                            project_map[pretty_name],
                             os.path.join(
                                 artifact_unzipped_location, remove_extension(entry)
                             ),
@@ -451,7 +464,7 @@ def main(config, args) -> None:
 
     for project, database_dir in database_results:
         if database_dir is not None:
-            generate_models(args, project, database_dir)
+            generate_models(language, config, project, database_dir)
 
 
 if __name__ == "__main__":
@@ -474,15 +487,6 @@ def main(config, args) -> None:
     parser.add_argument(
         "--lang", type=str, help="The language to generate models for", required=True
     )
-    parser.add_argument(
-        "--with-sources", action="store_true", help="Generate sources", required=False
-    )
-    parser.add_argument(
-        "--with-sinks", action="store_true", help="Generate sinks", required=False
-    )
-    parser.add_argument(
-        "--with-summaries", action="store_true", help="Generate sinks", required=False
-    )
     args = parser.parse_args()
 
     # Load config file
diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json
index ca30b76eb12e..85c28c7f8170 100644
--- a/rust/misc/bulk_generation_targets.json
+++ b/rust/misc/bulk_generation_targets.json
@@ -4,67 +4,106 @@
         {
             "name": "libc",
             "git_repo": "https://github.com/rust-lang/libc",
-            "git_tag": "0.2.172"
+            "git_tag": "0.2.172",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "log",
             "git_repo": "https://github.com/rust-lang/log",
-            "git_tag": "0.4.27"
+            "git_tag": "0.4.27",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "memchr",
             "git_repo": "https://github.com/BurntSushi/memchr",
-            "git_tag": "2.7.4"
+            "git_tag": "2.7.4",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "once_cell",
             "git_repo": "https://github.com/matklad/once_cell",
-            "git_tag": "v1.21.3"
+            "git_tag": "v1.21.3",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "rand",
             "git_repo": "https://github.com/rust-random/rand",
-            "git_tag": "0.9.1"
+            "git_tag": "0.9.1",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "smallvec",
             "git_repo": "https://github.com/servo/rust-smallvec",
-            "git_tag": "v1.15.0"
+            "git_tag": "v1.15.0",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "serde",
             "git_repo": "https://github.com/serde-rs/serde",
-            "git_tag": "v1.0.219"
+            "git_tag": "v1.0.219",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "tokio",
             "git_repo": "https://github.com/tokio-rs/tokio",
-            "git_tag": "tokio-1.45.0"
+            "git_tag": "tokio-1.45.0",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "reqwest",
             "git_repo": "https://github.com/seanmonstar/reqwest",
-            "git_tag": "v0.12.15"
+            "git_tag": "v0.12.15",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "rocket",
             "git_repo": "https://github.com/SergioBenitez/Rocket",
-            "git_tag": "v0.5.1"
+            "git_tag": "v0.5.1",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "actix-web",
             "git_repo": "https://github.com/actix/actix-web",
-            "git_tag": "web-v4.11.0"
+            "git_tag": "web-v4.11.0",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "hyper",
             "git_repo": "https://github.com/hyperium/hyper",
-            "git_tag": "v1.6.0"
+            "git_tag": "v1.6.0",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         },
         {
             "name": "clap",
             "git_repo": "https://github.com/clap-rs/clap",
-            "git_tag": "v4.5.38"
+            "git_tag": "v4.5.38",
+            "with_sources": true,
+            "with_sinks": true,
+            "with_summaries": true
         }
     ],
     "destination": "rust/ql/lib/ext/generated",

From 122808091486223251514db6112e121126605d17 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 12:23:17 +0100
Subject: [PATCH 15/22] Bulk generator: Specify 'language' in the config file.

---
 cpp/misc/bulk_generation_targets.json            |  1 +
 misc/scripts/models-as-data/bulk_generate_mad.py | 13 +++++++------
 rust/misc/bulk_generation_targets.json           |  1 +
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json
index 6cc2223b5e9d..11935335d81a 100644
--- a/cpp/misc/bulk_generation_targets.json
+++ b/cpp/misc/bulk_generation_targets.json
@@ -1,5 +1,6 @@
 {
   "strategy": "dca",
+  "language": "cpp",
   "targets": [
       { "name": "openssl", "with_summaries": true },
       { "name": "sqlite", "with_summaries": true }
diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index bed3442f7904..1ceffe993ce7 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -196,7 +196,7 @@ def build_database(
     return database_dir
 
 
-def generate_models(language: str, config, project: Project, database_dir: str) -> None:
+def generate_models(config, project: Project, database_dir: str) -> None:
     """
     Generate models for a project.
 
@@ -206,6 +206,7 @@ def generate_models(language: str, config, project: Project, database_dir: str)
         database_dir: Path to the CodeQL database.
     """
     name = project["name"]
+    language = config["language"]
 
     generator = mad.Generator(language)
     # Note: The argument parser converts with-sinks to with_sinks, etc.
@@ -402,7 +403,10 @@ def main(config, args) -> None:
     """
 
     projects = config["targets"]
-    language = args.lang
+    if not "language" in config:
+        print("ERROR: 'language' key is missing in the configuration file.")
+        sys.exit(1)
+    language = config["language"]
 
     # Create build directory if it doesn't exist
     if not os.path.exists(build_dir):
@@ -464,7 +468,7 @@ def main(config, args) -> None:
 
     for project, database_dir in database_results:
         if database_dir is not None:
-            generate_models(language, config, project, database_dir)
+            generate_models(config, project, database_dir)
 
 
 if __name__ == "__main__":
@@ -484,9 +488,6 @@ def main(config, args) -> None:
         help="PAT token to grab DCA databases (the same as the one you use for DCA)",
         required=False,
     )
-    parser.add_argument(
-        "--lang", type=str, help="The language to generate models for", required=True
-    )
     args = parser.parse_args()
 
     # Load config file
diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json
index 85c28c7f8170..4591042b1401 100644
--- a/rust/misc/bulk_generation_targets.json
+++ b/rust/misc/bulk_generation_targets.json
@@ -1,5 +1,6 @@
 {
     "strategy": "repo",
+    "language": "rust",
     "targets": [
         {
             "name": "libc",

From 7c2612a6a10ef208abfdcf78e43f171e034303ae Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 12:47:07 +0100
Subject: [PATCH 16/22] Bulk generator: Specify a path to the PAT instead of
 the PAT itself.

---
 misc/scripts/models-as-data/bulk_generate_mad.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 1ceffe993ce7..fa679594e9d2 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -441,11 +441,18 @@ def main(config, args) -> None:
             if experiment_name is None:
                 print("ERROR: --dca argument is required for DCA strategy")
                 sys.exit(1)
-            pat = args.pat
-            if pat is None:
+
+            if args.pat is None:
                 print("ERROR: --pat argument is required for DCA strategy")
                 sys.exit(1)
-            database_results = download_dca_databases(experiment_name, pat, projects)
+            if not os.path.exists(args.pat):
+                print(f"ERROR: Personal Access Token file '{args.pat}' does not exist.")
+                sys.exit(1)
+            with open(args.pat, "r") as f:
+                pat = f.read().strip()
+                database_results = download_dca_databases(
+                    experiment_name, pat, projects
+                )
 
     # Generate models for all projects
     print("\n=== Generating models ===")
@@ -485,7 +492,7 @@ def main(config, args) -> None:
     parser.add_argument(
         "--pat",
         type=str,
-        help="PAT token to grab DCA databases (the same as the one you use for DCA)",
+        help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)",
         required=False,
     )
     args = parser.parse_args()

From 3ddca327056bc21e7db95f9ef8b40cfa00c57c7a Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 12:48:50 +0100
Subject: [PATCH 17/22] Update misc/scripts/models-as-data/bulk_generate_mad.py

Co-authored-by: Simon Friis Vindum <paldepind@github.com>
---
 misc/scripts/models-as-data/bulk_generate_mad.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index fa679594e9d2..eea09c6a10c5 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -281,16 +281,15 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
     headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"}
     response = requests.get(url, stream=True, headers=headers)
     zipName = artifact_name + ".zip"
-    if response.status_code == 200:
-        target_zip = os.path.join(build_dir, zipName)
-        with open(target_zip, "wb") as file:
-            for chunk in response.iter_content(chunk_size=8192):
-                file.write(chunk)
-        print(f"Download complete: {target_zip}")
-        return target_zip
-    else:
+    if response.status_code != 200:
         print(f"Failed to download file. Status code: {response.status_code}")
         sys.exit(1)
+    target_zip = os.path.join(build_dir, zipName)
+    with open(target_zip, "wb") as file:
+        for chunk in response.iter_content(chunk_size=8192):
+            file.write(chunk)
+    print(f"Download complete: {target_zip}")
+    return target_zip
 
 
 def remove_extension(filename: str) -> str:

From cdd869a970a1348efd3e2992dc4e6d14b5d5f64b Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 12:49:12 +0100
Subject: [PATCH 18/22] Bulk generator: Autoformat.

---
 misc/scripts/models-as-data/bulk_generate_mad.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index eea09c6a10c5..3a104861580c 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -45,15 +45,19 @@ class Project(TypedDict):
     with_sinks: NotRequired[bool]
     with_summaries: NotRequired[bool]
 
+
 def shouldGenerateSinks(project: Project) -> bool:
     return project.get("with_sinks", False)
 
+
 def shouldGenerateSources(project: Project) -> bool:
     return project.get("with_sources", False)
 
+
 def shouldGenerateSummaries(project: Project) -> bool:
     return project.get("with_summaries", False)
 
+
 def clone_project(project: Project) -> str:
     """
     Shallow clone a project into the build directory.

From bdf411afbc0431d746dab4f756bcafe5f28a3694 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 13:09:55 +0100
Subject: [PATCH 19/22] Bulk generator: Make 'database_results' a map to
 simplify away the explicit sorting.

---
 .../models-as-data/bulk_generate_mad.py       | 23 ++++---------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 3a104861580c..dc15dab26a11 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -318,7 +318,7 @@ def download_dca_databases(
     Returns:
         List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
     """
-    database_results = []
+    database_results = {}
     print("\n=== Finding projects ===")
     response = get_json_from_github(
         f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -365,28 +365,13 @@ def download_dca_databases(
                 with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                     # And we just untar it to the same directory as the zip file
                     tar_ref.extractall(artifact_unzipped_location)
-                    database_results.append(
-                        (
-                            project_map[pretty_name],
-                            os.path.join(
-                                artifact_unzipped_location, remove_extension(entry)
-                            ),
-                        )
+                    database_results[pretty_name] = os.path.join(
+                        artifact_unzipped_location, remove_extension(entry)
                     )
 
     print(f"\n=== Extracted {len(database_results)} databases ===")
 
-    def compare(a, b):
-        a_index = next(
-            i for i, project in enumerate(projects) if project["name"] == a[0]["name"]
-        )
-        b_index = next(
-            i for i, project in enumerate(projects) if project["name"] == b[0]["name"]
-        )
-        return a_index - b_index
-
-    # Sort the database results based on the order in the projects file
-    return sorted(database_results, key=cmp_to_key(compare))
+    return [(project, database_results.get(project["name"])) for project in projects]
 
 
 def get_mad_destination_for_project(config, name: str) -> str:

From 3444c986ec6f98880f529aa2e438d91920c51bf7 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 13:25:12 +0100
Subject: [PATCH 20/22] Bulk generator: Fix field name.

---
 cpp/misc/bulk_generation_targets.json            | 4 ++--
 misc/scripts/models-as-data/bulk_generate_mad.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json
index 11935335d81a..4a11b1f8c6b1 100644
--- a/cpp/misc/bulk_generation_targets.json
+++ b/cpp/misc/bulk_generation_targets.json
@@ -2,8 +2,8 @@
   "strategy": "dca",
   "language": "cpp",
   "targets": [
-      { "name": "openssl", "with_summaries": true },
-      { "name": "sqlite", "with_summaries": true }
+      { "name": "openssl", "with-summaries": true },
+      { "name": "sqlite", "with-summaries": true }
   ],
   "destination": "cpp/ql/lib/ext/generated"
 }
\ No newline at end of file
diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index dc15dab26a11..84c2e51c7f49 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -47,15 +47,15 @@ class Project(TypedDict):
 
 
 def shouldGenerateSinks(project: Project) -> bool:
-    return project.get("with_sinks", False)
+    return project.get("with-sinks", False)
 
 
 def shouldGenerateSources(project: Project) -> bool:
-    return project.get("with_sources", False)
+    return project.get("with-sources", False)
 
 
 def shouldGenerateSummaries(project: Project) -> bool:
-    return project.get("with_summaries", False)
+    return project.get("with-summaries", False)
 
 
 def clone_project(project: Project) -> str:

From 0f30644afd77c6c5ceef1b38e35d52760a7333a4 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 13:26:53 +0100
Subject: [PATCH 21/22] Bulk generator: Snake case things.

---
 misc/scripts/models-as-data/bulk_generate_mad.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 84c2e51c7f49..61e66ffef12d 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -46,15 +46,15 @@ class Project(TypedDict):
     with_summaries: NotRequired[bool]
 
 
-def shouldGenerateSinks(project: Project) -> bool:
+def should_generate_sinks(project: Project) -> bool:
     return project.get("with-sinks", False)
 
 
-def shouldGenerateSources(project: Project) -> bool:
+def should_generate_sources(project: Project) -> bool:
     return project.get("with-sources", False)
 
 
-def shouldGenerateSummaries(project: Project) -> bool:
+def should_generate_summaries(project: Project) -> bool:
     return project.get("with-summaries", False)
 
 
@@ -214,9 +214,9 @@ def generate_models(config, project: Project, database_dir: str) -> None:
 
     generator = mad.Generator(language)
     # Note: The argument parser converts with-sinks to with_sinks, etc.
-    generator.generateSinks = shouldGenerateSinks(project)
-    generator.generateSources = shouldGenerateSources(project)
-    generator.generateSummaries = shouldGenerateSummaries(project)
+    generator.generateSinks = should_generate_sinks(project)
+    generator.generateSources = should_generate_sources(project)
+    generator.generateSummaries = should_generate_summaries(project)
     generator.setenvironment(database=database_dir, folder=name)
     generator.run()
 

From 7cb9024cc620a2ecd9b7f41e8a3224e56c0cb8f3 Mon Sep 17 00:00:00 2001
From: Mathias Vorreiter Pedersen <mathiasvp@github.com>
Date: Fri, 30 May 2025 13:33:24 +0100
Subject: [PATCH 22/22] Bulk generator: Flip default values for summaries,
 sources, and sinks.

---
 cpp/misc/bulk_generation_targets.json         |  4 +-
 .../models-as-data/bulk_generate_mad.py       |  6 +-
 rust/misc/bulk_generation_targets.json        | 65 ++++---------------
 3 files changed, 18 insertions(+), 57 deletions(-)

diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json
index 4a11b1f8c6b1..4cddef005b2f 100644
--- a/cpp/misc/bulk_generation_targets.json
+++ b/cpp/misc/bulk_generation_targets.json
@@ -2,8 +2,8 @@
   "strategy": "dca",
   "language": "cpp",
   "targets": [
-      { "name": "openssl", "with-summaries": true },
-      { "name": "sqlite", "with-summaries": true }
+      { "name": "openssl", "with-sources": false, "with-sinks": false },
+      { "name": "sqlite", "with-sources": false, "with-sinks": false }
   ],
   "destination": "cpp/ql/lib/ext/generated"
 }
\ No newline at end of file
diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py
index 61e66ffef12d..22a872dc2bf2 100644
--- a/misc/scripts/models-as-data/bulk_generate_mad.py
+++ b/misc/scripts/models-as-data/bulk_generate_mad.py
@@ -47,15 +47,15 @@ class Project(TypedDict):
 
 
 def should_generate_sinks(project: Project) -> bool:
-    return project.get("with-sinks", False)
+    return project.get("with-sinks", True)
 
 
 def should_generate_sources(project: Project) -> bool:
-    return project.get("with-sources", False)
+    return project.get("with-sources", True)
 
 
 def should_generate_summaries(project: Project) -> bool:
-    return project.get("with-summaries", False)
+    return project.get("with-summaries", True)
 
 
 def clone_project(project: Project) -> str:
diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json
index 4591042b1401..274d5dc5b361 100644
--- a/rust/misc/bulk_generation_targets.json
+++ b/rust/misc/bulk_generation_targets.json
@@ -5,106 +5,67 @@
         {
             "name": "libc",
             "git_repo": "https://github.com/rust-lang/libc",
-            "git_tag": "0.2.172",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "0.2.172"
         },
         {
             "name": "log",
             "git_repo": "https://github.com/rust-lang/log",
-            "git_tag": "0.4.27",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "0.4.27"
         },
         {
             "name": "memchr",
             "git_repo": "https://github.com/BurntSushi/memchr",
-            "git_tag": "2.7.4",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "2.7.4"
         },
         {
             "name": "once_cell",
             "git_repo": "https://github.com/matklad/once_cell",
-            "git_tag": "v1.21.3",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "v1.21.3"
         },
         {
             "name": "rand",
             "git_repo": "https://github.com/rust-random/rand",
-            "git_tag": "0.9.1",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "0.9.1"
         },
         {
             "name": "smallvec",
             "git_repo": "https://github.com/servo/rust-smallvec",
-            "git_tag": "v1.15.0",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "v1.15.0"
         },
         {
             "name": "serde",
             "git_repo": "https://github.com/serde-rs/serde",
-            "git_tag": "v1.0.219",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "v1.0.219"
         },
         {
             "name": "tokio",
             "git_repo": "https://github.com/tokio-rs/tokio",
-            "git_tag": "tokio-1.45.0",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "tokio-1.45.0"
         },
         {
             "name": "reqwest",
             "git_repo": "https://github.com/seanmonstar/reqwest",
-            "git_tag": "v0.12.15",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "v0.12.15"
         },
         {
             "name": "rocket",
             "git_repo": "https://github.com/SergioBenitez/Rocket",
-            "git_tag": "v0.5.1",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "v0.5.1"
         },
         {
             "name": "actix-web",
             "git_repo": "https://github.com/actix/actix-web",
-            "git_tag": "web-v4.11.0",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "web-v4.11.0"
         },
         {
             "name": "hyper",
             "git_repo": "https://github.com/hyperium/hyper",
-            "git_tag": "v1.6.0",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "v1.6.0"
         },
         {
             "name": "clap",
             "git_repo": "https://github.com/clap-rs/clap",
-            "git_tag": "v4.5.38",
-            "with-sources": true,
-            "with-sinks": true,
-            "with-summaries": true
+            "git_tag": "v4.5.38"
         }
     ],
     "destination": "rust/ql/lib/ext/generated",