From cb0b566588d6967740429c5fe37fb47d4d3b5901 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Thu, 29 May 2025 16:29:19 +0100 Subject: [PATCH 01/22] C++: Put autogenerated models in the same folder structure as Rust. --- cpp/ql/lib/ext/generated/{ => openssl}/openssl.model.yml | 0 cpp/ql/lib/ext/generated/{ => sqlite}/sqlite.model.yml | 0 cpp/ql/lib/qlpack.yml | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) rename cpp/ql/lib/ext/generated/{ => openssl}/openssl.model.yml (100%) rename cpp/ql/lib/ext/generated/{ => sqlite}/sqlite.model.yml (100%) diff --git a/cpp/ql/lib/ext/generated/openssl.model.yml b/cpp/ql/lib/ext/generated/openssl/openssl.model.yml similarity index 100% rename from cpp/ql/lib/ext/generated/openssl.model.yml rename to cpp/ql/lib/ext/generated/openssl/openssl.model.yml diff --git a/cpp/ql/lib/ext/generated/sqlite.model.yml b/cpp/ql/lib/ext/generated/sqlite/sqlite.model.yml similarity index 100% rename from cpp/ql/lib/ext/generated/sqlite.model.yml rename to cpp/ql/lib/ext/generated/sqlite/sqlite.model.yml diff --git a/cpp/ql/lib/qlpack.yml b/cpp/ql/lib/qlpack.yml index e15623e2ddb9..ef2d81c4f84c 100644 --- a/cpp/ql/lib/qlpack.yml +++ b/cpp/ql/lib/qlpack.yml @@ -17,7 +17,7 @@ dependencies: codeql/xml: ${workspace} dataExtensions: - ext/*.model.yml - - ext/generated/*.model.yml + - ext/generated/**/*.model.yml - ext/deallocation/*.model.yml - ext/allocation/*.model.yml warnOnImplicitThis: true From 40d937a2eb7a3fce4f47c4f736af948f2c19d717 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Thu, 29 May 2025 17:07:25 +0100 Subject: [PATCH 02/22] Bulk generator: Some imports we will need. 
--- misc/scripts/models-as-data/rust_bulk_generate_mad.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py index 76d67b1fba15..c9ed1e2540e3 100644 --- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py +++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py @@ -10,6 +10,12 @@ from typing import NotRequired, TypedDict, List from concurrent.futures import ThreadPoolExecutor, as_completed import time +import argparse +import json +import requests +import zipfile +import tarfile +from functools import cmp_to_key import generate_mad as mad From b87ba31c434f7fd59dd094053d291153c7baddf7 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Thu, 29 May 2025 17:08:57 +0100 Subject: [PATCH 03/22] Bulk generator: Get rid of the hardcoded project list and move it into a configuration file. --- .../models-as-data/rust_bulk_generate_mad.py | 70 ------------------- rust/misc/bulk_generation_targets.json | 69 ++++++++++++++++++ 2 files changed, 69 insertions(+), 70 deletions(-) create mode 100644 rust/misc/bulk_generation_targets.json diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py index c9ed1e2540e3..48c5c362fec5 100644 --- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py +++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py @@ -47,76 +47,6 @@ class Project(TypedDict): git_tag: NotRequired[str] -# List of Rust projects to generate models for. 
-projects: List[Project] = [ - { - "name": "libc", - "git_repo": "https://github.com/rust-lang/libc", - "git_tag": "0.2.172", - }, - { - "name": "log", - "git_repo": "https://github.com/rust-lang/log", - "git_tag": "0.4.27", - }, - { - "name": "memchr", - "git_repo": "https://github.com/BurntSushi/memchr", - "git_tag": "2.7.4", - }, - { - "name": "once_cell", - "git_repo": "https://github.com/matklad/once_cell", - "git_tag": "v1.21.3", - }, - { - "name": "rand", - "git_repo": "https://github.com/rust-random/rand", - "git_tag": "0.9.1", - }, - { - "name": "smallvec", - "git_repo": "https://github.com/servo/rust-smallvec", - "git_tag": "v1.15.0", - }, - { - "name": "serde", - "git_repo": "https://github.com/serde-rs/serde", - "git_tag": "v1.0.219", - }, - { - "name": "tokio", - "git_repo": "https://github.com/tokio-rs/tokio", - "git_tag": "tokio-1.45.0", - }, - { - "name": "reqwest", - "git_repo": "https://github.com/seanmonstar/reqwest", - "git_tag": "v0.12.15", - }, - { - "name": "rocket", - "git_repo": "https://github.com/SergioBenitez/Rocket", - "git_tag": "v0.5.1", - }, - { - "name": "actix-web", - "git_repo": "https://github.com/actix/actix-web", - "git_tag": "web-v4.11.0", - }, - { - "name": "hyper", - "git_repo": "https://github.com/hyperium/hyper", - "git_tag": "v1.6.0", - }, - { - "name": "clap", - "git_repo": "https://github.com/clap-rs/clap", - "git_tag": "v4.5.38", - }, -] - - def clone_project(project: Project) -> str: """ Shallow clone a project into the build directory. 
diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json new file mode 100644 index 000000000000..e7efddfe5b85 --- /dev/null +++ b/rust/misc/bulk_generation_targets.json @@ -0,0 +1,69 @@ +{ + "targets": [ + { + "name": "libc", + "git_repo": "https://github.com/rust-lang/libc", + "git_tag": "0.2.172" + }, + { + "name": "log", + "git_repo": "https://github.com/rust-lang/log", + "git_tag": "0.4.27" + }, + { + "name": "memchr", + "git_repo": "https://github.com/BurntSushi/memchr", + "git_tag": "2.7.4" + }, + { + "name": "once_cell", + "git_repo": "https://github.com/matklad/once_cell", + "git_tag": "v1.21.3" + }, + { + "name": "rand", + "git_repo": "https://github.com/rust-random/rand", + "git_tag": "0.9.1" + }, + { + "name": "smallvec", + "git_repo": "https://github.com/servo/rust-smallvec", + "git_tag": "v1.15.0" + }, + { + "name": "serde", + "git_repo": "https://github.com/serde-rs/serde", + "git_tag": "v1.0.219" + }, + { + "name": "tokio", + "git_repo": "https://github.com/tokio-rs/tokio", + "git_tag": "tokio-1.45.0" + }, + { + "name": "reqwest", + "git_repo": "https://github.com/seanmonstar/reqwest", + "git_tag": "v0.12.15" + }, + { + "name": "rocket", + "git_repo": "https://github.com/SergioBenitez/Rocket", + "git_tag": "v0.5.1" + }, + { + "name": "actix-web", + "git_repo": "https://github.com/actix/actix-web", + "git_tag": "web-v4.11.0" + }, + { + "name": "hyper", + "git_repo": "https://github.com/hyperium/hyper", + "git_tag": "v1.6.0" + }, + { + "name": "clap", + "git_repo": "https://github.com/clap-rs/clap", + "git_tag": "v4.5.38" + } + ] +} \ No newline at end of file From 6ff2bebbc2e939d82917a2b00a9d10ffef4f14f2 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Thu, 29 May 2025 17:11:20 +0100 Subject: [PATCH 04/22] Bulk generator: Add command-line arguments. 
--- .../models-as-data/rust_bulk_generate_mad.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py index 48c5c362fec5..48d7bf68d418 100644 --- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py +++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py @@ -268,4 +268,24 @@ def main() -> None: if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True) + parser.add_argument("--lang", type=str, help="The language to generate models for", required=True) + parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False) + parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False) + parser.add_argument("--with-summaries", action="store_true", help="Generate sinks", required=False) + args = parser.parse_args() + + # Load config file + config = {} + if not os.path.exists(args.config): + print(f"ERROR: Config file '{args.config}' does not exist.") + sys.exit(1) + try: + with open(args.config, "r") as f: + config = json.load(f) + except json.JSONDecodeError as e: + print(f"ERROR: Failed to parse JSON file {args.config}: {e}") + sys.exit(1) + + main(config, args) From e721fc07aaef1ef91bff825dc95aa5c18176e3b4 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Thu, 29 May 2025 17:17:15 +0100 Subject: [PATCH 05/22] Bulk generator: Prepare for adding DCA support. This commit just generalizes the existing functionality to be independent of Rust and instead depend on the configuration file and the command-line arguments. 
--- .../models-as-data/rust_bulk_generate_mad.py | 103 +++++++++++------- rust/misc/bulk_generation_targets.json | 5 + 2 files changed, 68 insertions(+), 40 deletions(-) diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py index 48d7bf68d418..0524a7179758 100644 --- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py +++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py @@ -26,15 +26,10 @@ ) build_dir = os.path.join(gitroot, "mad-generation-build") - -def path_to_mad_directory(language: str, name: str) -> str: - return os.path.join(gitroot, f"{language}/ql/lib/ext/generated/{name}") - - # A project to generate models for class Project(TypedDict): """ - Type definition for Rust projects to model. + Type definition for projects (acquired via a GitHub repo) to model. Attributes: name: The name of the project @@ -139,13 +134,15 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]: return project_dirs -def build_database(project: Project, project_dir: str) -> str | None: +def build_database(language: str, extractor_options, project: Project, project_dir: str) -> str | None: """ Build a CodeQL database for a project. Args: + language: The language for which to build the database (e.g., "rust"). + extractor_options: Additional options for the extractor. project: A dictionary containing project information with 'name' and 'git_repo' keys. - project_dir: The directory containing the project source code. + project_dir: Path to the CodeQL database. Returns: The path to the created database directory. 
@@ -158,17 +155,17 @@ def build_database(project: Project, project_dir: str) -> str | None: # Only build the database if it doesn't already exist if not os.path.exists(database_dir): print(f"Building CodeQL database for {name}...") + extractor_options = [option for x in extractor_options for option in ("-O", x)] try: subprocess.check_call( [ "codeql", "database", "create", - "--language=rust", + f"--language={language}", "--source-root=" + project_dir, "--overwrite", - "-O", - "cargo_features='*'", + *extractor_options, "--", database_dir, ] @@ -184,40 +181,72 @@ def build_database(project: Project, project_dir: str) -> str | None: return database_dir - -def generate_models(project: Project, database_dir: str) -> None: +def generate_models(args, name: str, database_dir: str) -> None: """ Generate models for a project. Args: - project: A dictionary containing project information with 'name' and 'git_repo' keys. - project_dir: The directory containing the project source code. + args: Command line arguments passed to this script. + name: The name of the project. + database_dir: Path to the CodeQL database. """ - name = project["name"] - generator = mad.Generator("rust") - generator.generateSinks = True - generator.generateSources = True - generator.generateSummaries = True + generator = mad.Generator(args.lang) + generator.generateSinks = args.with_sinks + generator.generateSources = args.with_sources + generator.generateSummaries = args.with_summaries generator.setenvironment(database=database_dir, folder=name) generator.run() +def build_databases_from_projects(language: str, extractor_options, projects: List[Project]) -> List[tuple[str, str | None]]: + """ + Build databases for all projects in parallel. + + Args: + language: The language for which to build the databases (e.g., "rust"). + extractor_options: Additional options for the extractor. + projects: List of projects to build databases for. 
+ + Returns: + List of (project_name, database_dir) pairs, where database_dir is None if the build failed. + """ + # Phase 1: Clone projects in parallel + print("=== Phase 1: Cloning projects ===") + project_dirs = clone_projects(projects) + + # Phase 2: Build databases for all projects + print("\n=== Phase 2: Building databases ===") + database_results = [ + (project["name"], build_database(language, extractor_options, project, project_dir)) + for project, project_dir in project_dirs + ] + return database_results + +def get_destination_for_project(config, name: str) -> str: + return os.path.join(config["destination"], name) + +def get_strategy(config) -> str: + return config["strategy"].lower() -def main() -> None: +def main(config, args) -> None: """ - Process all projects in three distinct phases: - 1. Clone projects (in parallel) - 2. Build databases for projects - 3. Generate models for successful database builds + Main function to handle the bulk generation of MaD models. + Args: + config: Configuration dictionary containing project details and other settings. + args: Command line arguments passed to this script. 
""" + projects = config["targets"] + destination = config["destination"] + language = args.lang + # Create build directory if it doesn't exist if not os.path.exists(build_dir): os.makedirs(build_dir) # Check if any of the MaD directories contain working directory changes in git for project in projects: - mad_dir = path_to_mad_directory("rust", project["name"]) + mad_dir = get_destination_for_project(config, project["name"]) if os.path.exists(mad_dir): git_status_output = subprocess.check_output( ["git", "status", "-s", mad_dir], text=True @@ -232,22 +261,17 @@ def main() -> None: ) sys.exit(1) - # Phase 1: Clone projects in parallel - print("=== Phase 1: Cloning projects ===") - project_dirs = clone_projects(projects) - - # Phase 2: Build databases for all projects - print("\n=== Phase 2: Building databases ===") - database_results = [ - (project, build_database(project, project_dir)) - for project, project_dir in project_dirs - ] + database_results = [] + match get_strategy(config): + case "repo": + extractor_options = config.get("extractor_options", []) + database_results = build_databases_from_projects(language, extractor_options, projects) # Phase 3: Generate models for all projects print("\n=== Phase 3: Generating models ===") failed_builds = [ - project["name"] for project, db_dir in database_results if db_dir is None + project for project, db_dir in database_results if db_dir is None ] if failed_builds: print( @@ -257,15 +281,14 @@ def main() -> None: # Delete the MaD directory for each project for project, database_dir in database_results: - mad_dir = path_to_mad_directory("rust", project["name"]) + mad_dir = get_destination_for_project(config, project) if os.path.exists(mad_dir): print(f"Deleting existing MaD directory at {mad_dir}") subprocess.check_call(["rm", "-rf", mad_dir]) for project, database_dir in database_results: if database_dir is not None: - generate_models(project, database_dir) - + generate_models(args, project, database_dir) if __name__ == 
"__main__": parser = argparse.ArgumentParser() diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json index e7efddfe5b85..ca30b76eb12e 100644 --- a/rust/misc/bulk_generation_targets.json +++ b/rust/misc/bulk_generation_targets.json @@ -1,4 +1,5 @@ { + "strategy": "repo", "targets": [ { "name": "libc", @@ -65,5 +66,9 @@ "git_repo": "https://github.com/clap-rs/clap", "git_tag": "v4.5.38" } + ], + "destination": "rust/ql/lib/ext/generated", + "extractor_options": [ + "cargo_features='*'" ] } \ No newline at end of file From 5051790e24d2fceb7902fbf4dac06d16f4afd4aa Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Thu, 29 May 2025 17:20:18 +0100 Subject: [PATCH 06/22] Bulk generator: Add DCA support. --- cpp/misc/bulk_generation_targets.json | 8 ++ .../models-as-data/rust_bulk_generate_mad.py | 122 +++++++++++++++++- 2 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 cpp/misc/bulk_generation_targets.json diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json new file mode 100644 index 000000000000..5f74b094d35a --- /dev/null +++ b/cpp/misc/bulk_generation_targets.json @@ -0,0 +1,8 @@ +{ + "strategy": "dca", + "targets": [ + { "name": "openssl" }, + { "name": "sqlite" } + ], + "destination": "cpp/ql/lib/ext/generated" +} \ No newline at end of file diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/rust_bulk_generate_mad.py index 0524a7179758..922deaa7627f 100644 --- a/misc/scripts/models-as-data/rust_bulk_generate_mad.py +++ b/misc/scripts/models-as-data/rust_bulk_generate_mad.py @@ -1,7 +1,5 @@ """ Experimental script for bulk generation of MaD models based on a list of projects. - -Currently the script only targets Rust. 
""" import os.path @@ -221,6 +219,114 @@ def build_databases_from_projects(language: str, extractor_options, projects: Li for project, project_dir in project_dirs ] return database_results + +def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict: + """ + Download a JSON file from GitHub using a personal access token (PAT). + Args: + url: The URL to download the JSON file from. + pat: Personal Access Token for GitHub API authentication. + extra_headers: Additional headers to include in the request. + Returns: + The JSON response as a dictionary. + """ + headers = { "Authorization": f"token {pat}" } | extra_headers + response = requests.get(url, headers=headers) + if response.status_code != 200: + print(f"Failed to download JSON: {response.status_code} {response.text}") + sys.exit(1) + else: + return response.json() + +def download_artifact(url: str, artifact_name: str, pat: str) -> str: + """ + Download a GitHub Actions artifact from a given URL. + Args: + url: The URL to download the artifact from. + artifact_name: The name of the artifact (used for naming the downloaded file). + pat: Personal Access Token for GitHub API authentication. + Returns: + The path to the downloaded artifact file. + """ + headers = { "Authorization": f"token {pat}", "Accept": "application/vnd.github+json" } + response = requests.get(url, stream=True, headers=headers) + zipName = artifact_name + ".zip" + if response.status_code == 200: + target_zip = os.path.join(build_dir, zipName) + with open(target_zip, "wb") as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + print(f"Download complete: {target_zip}") + return target_zip + else: + print(f"Failed to download file. Status code: {response.status_code}") + sys.exit(1) + +def remove_extension(filename: str) -> str: + while "." 
in filename: + filename, _ = os.path.splitext(filename) + return filename + +def pretty_name_from_artifact_name(artifact_name: str) -> str: + return artifact_name.split("___")[1] + +def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tuple[str, str | None]]: + """ + Download databases from a DCA experiment. + Args: + experiment_name: The name of the DCA experiment to download databases from. + pat: Personal Access Token for GitHub API authentication. + projects: List of projects to download databases for. + Returns: + List of (project_name, database_dir) pairs, where database_dir is None if the download failed. + """ + database_results = [] + print("\n=== Finding projects ===") + response = github(f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", pat) + targets = response["targets"] + for target, data in targets.items(): + downloads = data["downloads"] + analyzed_database = downloads["analyzed_database"] + artifact_name = analyzed_database["artifact_name"] + pretty_name = pretty_name_from_artifact_name(artifact_name) + + if not pretty_name in [project["name"] for project in projects]: + print(f"Skipping {pretty_name} as it is not in the list of projects") + continue + + repository = analyzed_database["repository"] + run_id = analyzed_database["run_id"] + print(f"=== Finding artifact: {artifact_name} ===") + response = github(f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", pat, { "Accept": "application/vnd.github+json" }) + artifacts = response["artifacts"] + artifact_map = {artifact["name"]: artifact for artifact in artifacts} + print(f"=== Downloading artifact: {artifact_name} ===") + archive_download_url = artifact_map[artifact_name]["archive_download_url"] + artifact_zip_location = download_artifact(archive_download_url, artifact_name, pat) + print(f"=== Extracting artifact: {artifact_name} ===") + # The database is in a zip file, which contains a 
tar.gz file with the DB + # First we open the zip file + with zipfile.ZipFile(artifact_zip_location, 'r') as zip_ref: + artifact_unzipped_location = os.path.join(build_dir, artifact_name) + # And then we extract it to build_dir/artifact_name + zip_ref.extractall(artifact_unzipped_location) + # And then we iterate over the contents of the extracted directory + # and extract the tar.gz files inside it + for entry in os.listdir(artifact_unzipped_location): + artifact_tar_location = os.path.join(artifact_unzipped_location, entry) + with tarfile.open(artifact_tar_location, "r:gz") as tar_ref: + # And we just untar it to the same directory as the zip file + tar_ref.extractall(artifact_unzipped_location) + database_results.append((pretty_name, os.path.join(artifact_unzipped_location, remove_extension(entry)))) + print(f"\n=== Extracted {len(database_results)} databases ===") + + def compare(a, b): + a_index = next(i for i, project in enumerate(projects) if project["name"] == a[0]) + b_index = next(i for i, project in enumerate(projects) if project["name"] == b[0]) + return a_index - b_index + + # Sort the database results based on the order in the projects file + return sorted(database_results, key=cmp_to_key(compare)) def get_destination_for_project(config, name: str) -> str: return os.path.join(config["destination"], name) @@ -266,6 +372,16 @@ def main(config, args) -> None: case "repo": extractor_options = config.get("extractor_options", []) database_results = build_databases_from_projects(language, extractor_options, projects) + case "dca": + experiment_name = args.dca + if experiment_name is None: + print("ERROR: --dca argument is required for DCA strategy") + sys.exit(1) + pat = args.pat + if pat is None: + print("ERROR: --pat argument is required for DCA strategy") + sys.exit(1) + database_results = download_dca_databases(experiment_name, pat, projects) # Phase 3: Generate models for all projects print("\n=== Phase 3: Generating models ===") @@ -293,6 +409,8 @@ def 
main(config, args) -> None: if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True) + parser.add_argument("--dca", type=str, help="Name of a DCA run that built all the projects", required=False) + parser.add_argument("--pat", type=str, help="PAT token to grab DCA databases (the same as the one you use for DCA)", required=False) parser.add_argument("--lang", type=str, help="The language to generate models for", required=True) parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False) parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False) From cb938701a14d693733c5759604c37246d391153e Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Thu, 29 May 2025 17:48:19 +0100 Subject: [PATCH 07/22] Bulk generator: Rename file since it is no longer Rust specific. --- .../{rust_bulk_generate_mad.py => bulk_generate_mad.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename misc/scripts/models-as-data/{rust_bulk_generate_mad.py => bulk_generate_mad.py} (100%) diff --git a/misc/scripts/models-as-data/rust_bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py similarity index 100% rename from misc/scripts/models-as-data/rust_bulk_generate_mad.py rename to misc/scripts/models-as-data/bulk_generate_mad.py From 7ecf8c8ea2d11f0a9bfcca86a55b12827e38314c Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 11:38:20 +0100 Subject: [PATCH 08/22] Bulk generator: Format file and add a note at the top of the file specifying the formatting requirements. 
--- .../models-as-data/bulk_generate_mad.py | 171 ++++++++++++------ 1 file changed, 117 insertions(+), 54 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 922deaa7627f..9490ef44c45c 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -1,5 +1,7 @@ """ Experimental script for bulk generation of MaD models based on a list of projects. + +Note: This file must be formatted using the Black Python formatter. """ import os.path @@ -24,6 +26,7 @@ ) build_dir = os.path.join(gitroot, "mad-generation-build") + # A project to generate models for class Project(TypedDict): """ @@ -132,7 +135,9 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]: return project_dirs -def build_database(language: str, extractor_options, project: Project, project_dir: str) -> str | None: +def build_database( + language: str, extractor_options, project: Project, project_dir: str +) -> str | None: """ Build a CodeQL database for a project. @@ -179,6 +184,7 @@ def build_database(language: str, extractor_options, project: Project, project_d return database_dir + def generate_models(args, name: str, database_dir: str) -> None: """ Generate models for a project. @@ -196,7 +202,10 @@ def generate_models(args, name: str, database_dir: str) -> None: generator.setenvironment(database=database_dir, folder=name) generator.run() -def build_databases_from_projects(language: str, extractor_options, projects: List[Project]) -> List[tuple[str, str | None]]: + +def build_databases_from_projects( + language: str, extractor_options, projects: List[Project] +) -> List[tuple[str, str | None]]: """ Build databases for all projects in parallel. 
@@ -215,11 +224,15 @@ def build_databases_from_projects(language: str, extractor_options, projects: Li # Phase 2: Build databases for all projects print("\n=== Phase 2: Building databases ===") database_results = [ - (project["name"], build_database(language, extractor_options, project, project_dir)) + ( + project["name"], + build_database(language, extractor_options, project, project_dir), + ) for project, project_dir in project_dirs ] return database_results + def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict: """ Download a JSON file from GitHub using a personal access token (PAT). @@ -230,7 +243,7 @@ def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict: Returns: The JSON response as a dictionary. """ - headers = { "Authorization": f"token {pat}" } | extra_headers + headers = {"Authorization": f"token {pat}"} | extra_headers response = requests.get(url, headers=headers) if response.status_code != 200: print(f"Failed to download JSON: {response.status_code} {response.text}") @@ -238,6 +251,7 @@ def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict: else: return response.json() + def download_artifact(url: str, artifact_name: str, pat: str) -> str: """ Download a GitHub Actions artifact from a given URL. @@ -248,7 +262,7 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str: Returns: The path to the downloaded artifact file. """ - headers = { "Authorization": f"token {pat}", "Accept": "application/vnd.github+json" } + headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"} response = requests.get(url, stream=True, headers=headers) zipName = artifact_name + ".zip" if response.status_code == 200: @@ -262,15 +276,20 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str: print(f"Failed to download file. Status code: {response.status_code}") sys.exit(1) + def remove_extension(filename: str) -> str: while "." 
in filename: filename, _ = os.path.splitext(filename) return filename + def pretty_name_from_artifact_name(artifact_name: str) -> str: return artifact_name.split("___")[1] -def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tuple[str, str | None]]: + +def download_dca_databases( + experiment_name: str, pat: str, projects +) -> List[tuple[str, str | None]]: """ Download databases from a DCA experiment. Args: @@ -282,58 +301,81 @@ def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tup """ database_results = [] print("\n=== Finding projects ===") - response = github(f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", pat) + response = github( + f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", + pat, + ) targets = response["targets"] for target, data in targets.items(): - downloads = data["downloads"] - analyzed_database = downloads["analyzed_database"] - artifact_name = analyzed_database["artifact_name"] - pretty_name = pretty_name_from_artifact_name(artifact_name) - - if not pretty_name in [project["name"] for project in projects]: - print(f"Skipping {pretty_name} as it is not in the list of projects") - continue - - repository = analyzed_database["repository"] - run_id = analyzed_database["run_id"] - print(f"=== Finding artifact: {artifact_name} ===") - response = github(f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", pat, { "Accept": "application/vnd.github+json" }) - artifacts = response["artifacts"] - artifact_map = {artifact["name"]: artifact for artifact in artifacts} - print(f"=== Downloading artifact: {artifact_name} ===") - archive_download_url = artifact_map[artifact_name]["archive_download_url"] - artifact_zip_location = download_artifact(archive_download_url, artifact_name, pat) - print(f"=== Extracting artifact: {artifact_name} ===") - # The database is in a zip 
file, which contains a tar.gz file with the DB - # First we open the zip file - with zipfile.ZipFile(artifact_zip_location, 'r') as zip_ref: - artifact_unzipped_location = os.path.join(build_dir, artifact_name) - # And then we extract it to build_dir/artifact_name - zip_ref.extractall(artifact_unzipped_location) - # And then we iterate over the contents of the extracted directory - # and extract the tar.gz files inside it - for entry in os.listdir(artifact_unzipped_location): - artifact_tar_location = os.path.join(artifact_unzipped_location, entry) - with tarfile.open(artifact_tar_location, "r:gz") as tar_ref: - # And we just untar it to the same directory as the zip file - tar_ref.extractall(artifact_unzipped_location) - database_results.append((pretty_name, os.path.join(artifact_unzipped_location, remove_extension(entry)))) + downloads = data["downloads"] + analyzed_database = downloads["analyzed_database"] + artifact_name = analyzed_database["artifact_name"] + pretty_name = pretty_name_from_artifact_name(artifact_name) + + if not pretty_name in [project["name"] for project in projects]: + print(f"Skipping {pretty_name} as it is not in the list of projects") + continue + + repository = analyzed_database["repository"] + run_id = analyzed_database["run_id"] + print(f"=== Finding artifact: {artifact_name} ===") + response = github( + f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", + pat, + {"Accept": "application/vnd.github+json"}, + ) + artifacts = response["artifacts"] + artifact_map = {artifact["name"]: artifact for artifact in artifacts} + print(f"=== Downloading artifact: {artifact_name} ===") + archive_download_url = artifact_map[artifact_name]["archive_download_url"] + artifact_zip_location = download_artifact( + archive_download_url, artifact_name, pat + ) + print(f"=== Extracting artifact: {artifact_name} ===") + # The database is in a zip file, which contains a tar.gz file with the DB + # First we open the zip file + with 
zipfile.ZipFile(artifact_zip_location, "r") as zip_ref: + artifact_unzipped_location = os.path.join(build_dir, artifact_name) + # And then we extract it to build_dir/artifact_name + zip_ref.extractall(artifact_unzipped_location) + # And then we iterate over the contents of the extracted directory + # and extract the tar.gz files inside it + for entry in os.listdir(artifact_unzipped_location): + artifact_tar_location = os.path.join(artifact_unzipped_location, entry) + with tarfile.open(artifact_tar_location, "r:gz") as tar_ref: + # And we just untar it to the same directory as the zip file + tar_ref.extractall(artifact_unzipped_location) + database_results.append( + ( + pretty_name, + os.path.join( + artifact_unzipped_location, remove_extension(entry) + ), + ) + ) print(f"\n=== Extracted {len(database_results)} databases ===") def compare(a, b): - a_index = next(i for i, project in enumerate(projects) if project["name"] == a[0]) - b_index = next(i for i, project in enumerate(projects) if project["name"] == b[0]) + a_index = next( + i for i, project in enumerate(projects) if project["name"] == a[0] + ) + b_index = next( + i for i, project in enumerate(projects) if project["name"] == b[0] + ) return a_index - b_index # Sort the database results based on the order in the projects file return sorted(database_results, key=cmp_to_key(compare)) - + + def get_destination_for_project(config, name: str) -> str: return os.path.join(config["destination"], name) + def get_strategy(config) -> str: return config["strategy"].lower() + def main(config, args) -> None: """ Main function to handle the bulk generation of MaD models. 
@@ -371,7 +413,9 @@ def main(config, args) -> None: match get_strategy(config): case "repo": extractor_options = config.get("extractor_options", []) - database_results = build_databases_from_projects(language, extractor_options, projects) + database_results = build_databases_from_projects( + language, extractor_options, projects + ) case "dca": experiment_name = args.dca if experiment_name is None: @@ -386,9 +430,7 @@ def main(config, args) -> None: # Phase 3: Generate models for all projects print("\n=== Phase 3: Generating models ===") - failed_builds = [ - project for project, db_dir in database_results if db_dir is None - ] + failed_builds = [project for project, db_dir in database_results if db_dir is None] if failed_builds: print( f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}" @@ -406,15 +448,36 @@ def main(config, args) -> None: if database_dir is not None: generate_models(args, project, database_dir) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True) - parser.add_argument("--dca", type=str, help="Name of a DCA run that built all the projects", required=False) - parser.add_argument("--pat", type=str, help="PAT token to grab DCA databases (the same as the one you use for DCA)", required=False) - parser.add_argument("--lang", type=str, help="The language to generate models for", required=True) - parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False) - parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False) - parser.add_argument("--with-summaries", action="store_true", help="Generate sinks", required=False) + parser.add_argument( + "--config", type=str, help="Path to the configuration file.", required=True + ) + parser.add_argument( + "--dca", + type=str, + help="Name of a DCA run that built all the projects", + required=False, + ) + 
parser.add_argument( + "--pat", + type=str, + help="PAT token to grab DCA databases (the same as the one you use for DCA)", + required=False, + ) + parser.add_argument( + "--lang", type=str, help="The language to generate models for", required=True + ) + parser.add_argument( + "--with-sources", action="store_true", help="Generate sources", required=False + ) + parser.add_argument( + "--with-sinks", action="store_true", help="Generate sinks", required=False + ) + parser.add_argument( + "--with-summaries", action="store_true", help="Generate sinks", required=False + ) args = parser.parse_args() # Load config file From 566bf431d7cd6a193a63eca714c6d577c97207bb Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 11:42:15 +0100 Subject: [PATCH 09/22] Bulk generator: Rename 'github' to 'get_json_from_github'. --- misc/scripts/models-as-data/bulk_generate_mad.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 9490ef44c45c..edb1dd3bfa89 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -233,7 +233,9 @@ def build_databases_from_projects( return database_results -def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict: +def get_json_from_github( + url: str, pat: str, extra_headers: dict[str, str] = {} +) -> dict: """ Download a JSON file from GitHub using a personal access token (PAT). 
Args: @@ -301,7 +303,7 @@ def download_dca_databases( """ database_results = [] print("\n=== Finding projects ===") - response = github( + response = get_json_from_github( f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", pat, ) @@ -319,7 +321,7 @@ def download_dca_databases( repository = analyzed_database["repository"] run_id = analyzed_database["run_id"] print(f"=== Finding artifact: {artifact_name} ===") - response = github( + response = get_json_from_github( f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", pat, {"Accept": "application/vnd.github+json"}, From b640474a61f75d1bbe6796acbfe7cf55d5b5b84c Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 11:43:30 +0100 Subject: [PATCH 10/22] Bulk generator: Remove 'Phase' part of log message. --- misc/scripts/models-as-data/bulk_generate_mad.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index edb1dd3bfa89..cf493d48064a 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -217,12 +217,12 @@ def build_databases_from_projects( Returns: List of (project_name, database_dir) pairs, where database_dir is None if the build failed. 
""" - # Phase 1: Clone projects in parallel - print("=== Phase 1: Cloning projects ===") + # Clone projects in parallel + print("=== Cloning projects ===") project_dirs = clone_projects(projects) - # Phase 2: Build databases for all projects - print("\n=== Phase 2: Building databases ===") + # Build databases for all projects + print("\n=== Building databases ===") database_results = [ ( project["name"], @@ -429,8 +429,8 @@ def main(config, args) -> None: sys.exit(1) database_results = download_dca_databases(experiment_name, pat, projects) - # Phase 3: Generate models for all projects - print("\n=== Phase 3: Generating models ===") + # Generate models for all projects + print("\n=== Generating models ===") failed_builds = [project for project, db_dir in database_results if db_dir is None] if failed_builds: From 5d79a8de89fd76aa6537b20230d024156467b566 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 11:48:30 +0100 Subject: [PATCH 11/22] Update misc/scripts/models-as-data/bulk_generate_mad.py Co-authored-by: Simon Friis Vindum --- misc/scripts/models-as-data/bulk_generate_mad.py | 1 - 1 file changed, 1 deletion(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index cf493d48064a..6d9b52e2266d 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -387,7 +387,6 @@ def main(config, args) -> None: """ projects = config["targets"] - destination = config["destination"] language = args.lang # Create build directory if it doesn't exist From 7c89d6d6dde1ac804f2fb26821623cc90ffe8bf2 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 11:49:48 +0100 Subject: [PATCH 12/22] Bulk generator: Rename 'get_destination_for_project' to 'get_mad_destination_for_project'. 
--- misc/scripts/models-as-data/bulk_generate_mad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 6d9b52e2266d..6f75536ade54 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -370,7 +370,7 @@ def compare(a, b): return sorted(database_results, key=cmp_to_key(compare)) -def get_destination_for_project(config, name: str) -> str: +def get_mad_destination_for_project(config, name: str) -> str: return os.path.join(config["destination"], name) @@ -395,7 +395,7 @@ def main(config, args) -> None: # Check if any of the MaD directories contain working directory changes in git for project in projects: - mad_dir = get_destination_for_project(config, project["name"]) + mad_dir = get_mad_destination_for_project(config, project["name"]) if os.path.exists(mad_dir): git_status_output = subprocess.check_output( ["git", "status", "-s", mad_dir], text=True @@ -440,7 +440,7 @@ def main(config, args) -> None: # Delete the MaD directory for each project for project, database_dir in database_results: - mad_dir = get_destination_for_project(config, project) + mad_dir = get_mad_destination_for_project(config, project) if os.path.exists(mad_dir): print(f"Deleting existing MaD directory at {mad_dir}") subprocess.check_call(["rm", "-rf", mad_dir]) From 7121f5c57edbbd70520ace39aa42a502f354190e Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 12:08:42 +0100 Subject: [PATCH 13/22] Bulk generator: Use the 'Project' type throughout the file. 
--- .../models-as-data/bulk_generate_mad.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 6f75536ade54..8b5482628748 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -39,7 +39,7 @@ class Project(TypedDict): """ name: str - git_repo: str + git_repo: NotRequired[str] git_tag: NotRequired[str] @@ -185,7 +185,7 @@ def build_database( return database_dir -def generate_models(args, name: str, database_dir: str) -> None: +def generate_models(args, project: Project, database_dir: str) -> None: """ Generate models for a project. @@ -194,6 +194,7 @@ def generate_models(args, name: str, database_dir: str) -> None: name: The name of the project. database_dir: Path to the CodeQL database. """ + name = project["name"] generator = mad.Generator(args.lang) generator.generateSinks = args.with_sinks @@ -205,7 +206,7 @@ def generate_models(args, name: str, database_dir: str) -> None: def build_databases_from_projects( language: str, extractor_options, projects: List[Project] -) -> List[tuple[str, str | None]]: +) -> List[tuple[Project, str | None]]: """ Build databases for all projects in parallel. @@ -225,7 +226,7 @@ def build_databases_from_projects( print("\n=== Building databases ===") database_results = [ ( - project["name"], + project, build_database(language, extractor_options, project, project_dir), ) for project, project_dir in project_dirs @@ -290,8 +291,8 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str: def download_dca_databases( - experiment_name: str, pat: str, projects -) -> List[tuple[str, str | None]]: + experiment_name: str, pat: str, projects: List[Project] +) -> List[tuple[Project, str | None]]: """ Download databases from a DCA experiment. 
Args: @@ -308,7 +309,7 @@ def download_dca_databases( pat, ) targets = response["targets"] - for target, data in targets.items(): + for data in targets.values(): downloads = data["downloads"] analyzed_database = downloads["analyzed_database"] artifact_name = analyzed_database["artifact_name"] @@ -349,20 +350,21 @@ def download_dca_databases( tar_ref.extractall(artifact_unzipped_location) database_results.append( ( - pretty_name, + {"name": pretty_name}, os.path.join( artifact_unzipped_location, remove_extension(entry) ), ) ) + print(f"\n=== Extracted {len(database_results)} databases ===") def compare(a, b): a_index = next( - i for i, project in enumerate(projects) if project["name"] == a[0] + i for i, project in enumerate(projects) if project["name"] == a[0]["name"] ) b_index = next( - i for i, project in enumerate(projects) if project["name"] == b[0] + i for i, project in enumerate(projects) if project["name"] == b[0]["name"] ) return a_index - b_index @@ -431,7 +433,9 @@ def main(config, args) -> None: # Generate models for all projects print("\n=== Generating models ===") - failed_builds = [project for project, db_dir in database_results if db_dir is None] + failed_builds = [ + project["name"] for project, db_dir in database_results if db_dir is None + ] if failed_builds: print( f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}" @@ -440,7 +444,7 @@ def main(config, args) -> None: # Delete the MaD directory for each project for project, database_dir in database_results: - mad_dir = get_mad_destination_for_project(config, project) + mad_dir = get_mad_destination_for_project(config, project["name"]) if os.path.exists(mad_dir): print(f"Deleting existing MaD directory at {mad_dir}") subprocess.check_call(["rm", "-rf", mad_dir]) From fc165db8acb30401569073f8e0564e880749d092 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 12:17:03 +0100 Subject: [PATCH 14/22] Bulk generator: Specify 'with-summaries', 
'with-sources', and 'with-sinks' in the config file. --- cpp/misc/bulk_generation_targets.json | 4 +- .../models-as-data/bulk_generate_mad.py | 38 ++++++----- rust/misc/bulk_generation_targets.json | 65 +++++++++++++++---- 3 files changed, 75 insertions(+), 32 deletions(-) diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json index 5f74b094d35a..6cc2223b5e9d 100644 --- a/cpp/misc/bulk_generation_targets.json +++ b/cpp/misc/bulk_generation_targets.json @@ -1,8 +1,8 @@ { "strategy": "dca", "targets": [ - { "name": "openssl" }, - { "name": "sqlite" } + { "name": "openssl", "with_summaries": true }, + { "name": "sqlite", "with_summaries": true } ], "destination": "cpp/ql/lib/ext/generated" } \ No newline at end of file diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 8b5482628748..bed3442f7904 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -41,7 +41,18 @@ class Project(TypedDict): name: str git_repo: NotRequired[str] git_tag: NotRequired[str] + with_sinks: NotRequired[bool] + with_sinks: NotRequired[bool] + with_summaries: NotRequired[bool] +def shouldGenerateSinks(project: Project) -> bool: + return project.get("with_sinks", False) + +def shouldGenerateSources(project: Project) -> bool: + return project.get("with_sources", False) + +def shouldGenerateSummaries(project: Project) -> bool: + return project.get("with_summaries", False) def clone_project(project: Project) -> str: """ @@ -185,7 +196,7 @@ def build_database( return database_dir -def generate_models(args, project: Project, database_dir: str) -> None: +def generate_models(language: str, config, project: Project, database_dir: str) -> None: """ Generate models for a project. 
@@ -196,10 +207,11 @@ def generate_models(args, project: Project, database_dir: str) -> None: """ name = project["name"] - generator = mad.Generator(args.lang) - generator.generateSinks = args.with_sinks - generator.generateSources = args.with_sources - generator.generateSummaries = args.with_summaries + generator = mad.Generator(language) + # Note: The argument parser converts with-sinks to with_sinks, etc. + generator.generateSinks = shouldGenerateSinks(project) + generator.generateSources = shouldGenerateSources(project) + generator.generateSummaries = shouldGenerateSummaries(project) generator.setenvironment(database=database_dir, folder=name) generator.run() @@ -309,13 +321,14 @@ def download_dca_databases( pat, ) targets = response["targets"] + project_map = {project["name"]: project for project in projects} for data in targets.values(): downloads = data["downloads"] analyzed_database = downloads["analyzed_database"] artifact_name = analyzed_database["artifact_name"] pretty_name = pretty_name_from_artifact_name(artifact_name) - if not pretty_name in [project["name"] for project in projects]: + if not pretty_name in project_map: print(f"Skipping {pretty_name} as it is not in the list of projects") continue @@ -350,7 +363,7 @@ def download_dca_databases( tar_ref.extractall(artifact_unzipped_location) database_results.append( ( - {"name": pretty_name}, + project_map[pretty_name], os.path.join( artifact_unzipped_location, remove_extension(entry) ), @@ -451,7 +464,7 @@ def main(config, args) -> None: for project, database_dir in database_results: if database_dir is not None: - generate_models(args, project, database_dir) + generate_models(language, config, project, database_dir) if __name__ == "__main__": @@ -474,15 +487,6 @@ def main(config, args) -> None: parser.add_argument( "--lang", type=str, help="The language to generate models for", required=True ) - parser.add_argument( - "--with-sources", action="store_true", help="Generate sources", required=False - ) - 
parser.add_argument( - "--with-sinks", action="store_true", help="Generate sinks", required=False - ) - parser.add_argument( - "--with-summaries", action="store_true", help="Generate sinks", required=False - ) args = parser.parse_args() # Load config file diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json index ca30b76eb12e..85c28c7f8170 100644 --- a/rust/misc/bulk_generation_targets.json +++ b/rust/misc/bulk_generation_targets.json @@ -4,67 +4,106 @@ { "name": "libc", "git_repo": "https://github.com/rust-lang/libc", - "git_tag": "0.2.172" + "git_tag": "0.2.172", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "log", "git_repo": "https://github.com/rust-lang/log", - "git_tag": "0.4.27" + "git_tag": "0.4.27", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "memchr", "git_repo": "https://github.com/BurntSushi/memchr", - "git_tag": "2.7.4" + "git_tag": "2.7.4", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "once_cell", "git_repo": "https://github.com/matklad/once_cell", - "git_tag": "v1.21.3" + "git_tag": "v1.21.3", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "rand", "git_repo": "https://github.com/rust-random/rand", - "git_tag": "0.9.1" + "git_tag": "0.9.1", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "smallvec", "git_repo": "https://github.com/servo/rust-smallvec", - "git_tag": "v1.15.0" + "git_tag": "v1.15.0", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "serde", "git_repo": "https://github.com/serde-rs/serde", - "git_tag": "v1.0.219" + "git_tag": "v1.0.219", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "tokio", "git_repo": "https://github.com/tokio-rs/tokio", - "git_tag": "tokio-1.45.0" + "git_tag": "tokio-1.45.0", + "with-sources": true, + "with-sinks": true, 
+ "with-summaries": true }, { "name": "reqwest", "git_repo": "https://github.com/seanmonstar/reqwest", - "git_tag": "v0.12.15" + "git_tag": "v0.12.15", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "rocket", "git_repo": "https://github.com/SergioBenitez/Rocket", - "git_tag": "v0.5.1" + "git_tag": "v0.5.1", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "actix-web", "git_repo": "https://github.com/actix/actix-web", - "git_tag": "web-v4.11.0" + "git_tag": "web-v4.11.0", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "hyper", "git_repo": "https://github.com/hyperium/hyper", - "git_tag": "v1.6.0" + "git_tag": "v1.6.0", + "with-sources": true, + "with-sinks": true, + "with-summaries": true }, { "name": "clap", "git_repo": "https://github.com/clap-rs/clap", - "git_tag": "v4.5.38" + "git_tag": "v4.5.38", + "with-sources": true, + "with-sinks": true, + "with-summaries": true } ], "destination": "rust/ql/lib/ext/generated", From 122808091486223251514db6112e121126605d17 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 12:23:17 +0100 Subject: [PATCH 15/22] Bulk generator: Specify 'language' in the config file. 
--- cpp/misc/bulk_generation_targets.json | 1 + misc/scripts/models-as-data/bulk_generate_mad.py | 13 +++++++------ rust/misc/bulk_generation_targets.json | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json index 6cc2223b5e9d..11935335d81a 100644 --- a/cpp/misc/bulk_generation_targets.json +++ b/cpp/misc/bulk_generation_targets.json @@ -1,5 +1,6 @@ { "strategy": "dca", + "language": "cpp", "targets": [ { "name": "openssl", "with_summaries": true }, { "name": "sqlite", "with_summaries": true } diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index bed3442f7904..1ceffe993ce7 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -196,7 +196,7 @@ def build_database( return database_dir -def generate_models(language: str, config, project: Project, database_dir: str) -> None: +def generate_models(config, project: Project, database_dir: str) -> None: """ Generate models for a project. @@ -206,6 +206,7 @@ def generate_models(language: str, config, project: Project, database_dir: str) database_dir: Path to the CodeQL database. """ name = project["name"] + language = config["language"] generator = mad.Generator(language) # Note: The argument parser converts with-sinks to with_sinks, etc. 
@@ -402,7 +403,10 @@ def main(config, args) -> None: """ projects = config["targets"] - language = args.lang + if not "language" in config: + print("ERROR: 'language' key is missing in the configuration file.") + sys.exit(1) + language = config["language"] # Create build directory if it doesn't exist if not os.path.exists(build_dir): @@ -464,7 +468,7 @@ def main(config, args) -> None: for project, database_dir in database_results: if database_dir is not None: - generate_models(language, config, project, database_dir) + generate_models(config, project, database_dir) if __name__ == "__main__": @@ -484,9 +488,6 @@ def main(config, args) -> None: help="PAT token to grab DCA databases (the same as the one you use for DCA)", required=False, ) - parser.add_argument( - "--lang", type=str, help="The language to generate models for", required=True - ) args = parser.parse_args() # Load config file diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json index 85c28c7f8170..4591042b1401 100644 --- a/rust/misc/bulk_generation_targets.json +++ b/rust/misc/bulk_generation_targets.json @@ -1,5 +1,6 @@ { "strategy": "repo", + "language": "rust", "targets": [ { "name": "libc", From 7c2612a6a10ef208abfdcf78e43f171e034303ae Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 12:47:07 +0100 Subject: [PATCH 16/22] Bulk generator: Specify a path to the PAT instead of the PAT itself. 
--- misc/scripts/models-as-data/bulk_generate_mad.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 1ceffe993ce7..fa679594e9d2 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -441,11 +441,18 @@ def main(config, args) -> None: if experiment_name is None: print("ERROR: --dca argument is required for DCA strategy") sys.exit(1) - pat = args.pat - if pat is None: + + if args.pat is None: print("ERROR: --pat argument is required for DCA strategy") sys.exit(1) - database_results = download_dca_databases(experiment_name, pat, projects) + if not os.path.exists(args.pat): + print(f"ERROR: Personal Access Token file '{pat}' does not exist.") + sys.exit(1) + with open(args.pat, "r") as f: + pat = f.read().strip() + database_results = download_dca_databases( + experiment_name, pat, projects + ) # Generate models for all projects print("\n=== Generating models ===") @@ -485,7 +492,7 @@ def main(config, args) -> None: parser.add_argument( "--pat", type=str, - help="PAT token to grab DCA databases (the same as the one you use for DCA)", + help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)", required=False, ) args = parser.parse_args() From 3ddca327056bc21e7db95f9ef8b40cfa00c57c7a Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 12:48:50 +0100 Subject: [PATCH 17/22] Update misc/scripts/models-as-data/bulk_generate_mad.py Co-authored-by: Simon Friis Vindum --- misc/scripts/models-as-data/bulk_generate_mad.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index fa679594e9d2..eea09c6a10c5 100644 --- 
a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -281,16 +281,15 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str: headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"} response = requests.get(url, stream=True, headers=headers) zipName = artifact_name + ".zip" - if response.status_code == 200: - target_zip = os.path.join(build_dir, zipName) - with open(target_zip, "wb") as file: - for chunk in response.iter_content(chunk_size=8192): - file.write(chunk) - print(f"Download complete: {target_zip}") - return target_zip - else: + if response.status_code != 200: print(f"Failed to download file. Status code: {response.status_code}") sys.exit(1) + target_zip = os.path.join(build_dir, zipName) + with open(target_zip, "wb") as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + print(f"Download complete: {target_zip}") + return target_zip def remove_extension(filename: str) -> str: From cdd869a970a1348efd3e2992dc4e6d14b5d5f64b Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 12:49:12 +0100 Subject: [PATCH 18/22] Bulk generator: Autoformat. 
--- misc/scripts/models-as-data/bulk_generate_mad.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index eea09c6a10c5..3a104861580c 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -45,15 +45,19 @@ class Project(TypedDict): with_sinks: NotRequired[bool] with_summaries: NotRequired[bool] + def shouldGenerateSinks(project: Project) -> bool: return project.get("with_sinks", False) + def shouldGenerateSources(project: Project) -> bool: return project.get("with_sources", False) + def shouldGenerateSummaries(project: Project) -> bool: return project.get("with_summaries", False) + def clone_project(project: Project) -> str: """ Shallow clone a project into the build directory. From bdf411afbc0431d746dab4f756bcafe5f28a3694 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 13:09:55 +0100 Subject: [PATCH 19/22] Bulk generator: Make 'database_results' a map to simplify away the explicit sorting. --- .../models-as-data/bulk_generate_mad.py | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 3a104861580c..dc15dab26a11 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -318,7 +318,7 @@ def download_dca_databases( Returns: List of (project_name, database_dir) pairs, where database_dir is None if the download failed. 
""" - database_results = [] + database_results = {} print("\n=== Finding projects ===") response = get_json_from_github( f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", @@ -365,28 +365,13 @@ def download_dca_databases( with tarfile.open(artifact_tar_location, "r:gz") as tar_ref: # And we just untar it to the same directory as the zip file tar_ref.extractall(artifact_unzipped_location) - database_results.append( - ( - project_map[pretty_name], - os.path.join( - artifact_unzipped_location, remove_extension(entry) - ), - ) + database_results[pretty_name] = os.path.join( + artifact_unzipped_location, remove_extension(entry) ) print(f"\n=== Extracted {len(database_results)} databases ===") - def compare(a, b): - a_index = next( - i for i, project in enumerate(projects) if project["name"] == a[0]["name"] - ) - b_index = next( - i for i, project in enumerate(projects) if project["name"] == b[0]["name"] - ) - return a_index - b_index - - # Sort the database results based on the order in the projects file - return sorted(database_results, key=cmp_to_key(compare)) + return [(project, database_results[project["name"]]) for project in projects] def get_mad_destination_for_project(config, name: str) -> str: From 3444c986ec6f98880f529aa2e438d91920c51bf7 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 13:25:12 +0100 Subject: [PATCH 20/22] Bulk generator: Fix field name. 
--- cpp/misc/bulk_generation_targets.json | 4 ++-- misc/scripts/models-as-data/bulk_generate_mad.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json index 11935335d81a..4a11b1f8c6b1 100644 --- a/cpp/misc/bulk_generation_targets.json +++ b/cpp/misc/bulk_generation_targets.json @@ -2,8 +2,8 @@ "strategy": "dca", "language": "cpp", "targets": [ - { "name": "openssl", "with_summaries": true }, - { "name": "sqlite", "with_summaries": true } + { "name": "openssl", "with-summaries": true }, + { "name": "sqlite", "with-summaries": true } ], "destination": "cpp/ql/lib/ext/generated" } \ No newline at end of file diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index dc15dab26a11..84c2e51c7f49 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -47,15 +47,15 @@ class Project(TypedDict): def shouldGenerateSinks(project: Project) -> bool: - return project.get("with_sinks", False) + return project.get("with-sinks", False) def shouldGenerateSources(project: Project) -> bool: - return project.get("with_sources", False) + return project.get("with-sources", False) def shouldGenerateSummaries(project: Project) -> bool: - return project.get("with_summaries", False) + return project.get("with-summaries", False) def clone_project(project: Project) -> str: From 0f30644afd77c6c5ceef1b38e35d52760a7333a4 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 13:26:53 +0100 Subject: [PATCH 21/22] Bulk generator: Snake case things. 
--- misc/scripts/models-as-data/bulk_generate_mad.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 84c2e51c7f49..61e66ffef12d 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -46,15 +46,15 @@ class Project(TypedDict): with_summaries: NotRequired[bool] -def shouldGenerateSinks(project: Project) -> bool: +def should_generate_sinks(project: Project) -> bool: return project.get("with-sinks", False) -def shouldGenerateSources(project: Project) -> bool: +def should_generate_sources(project: Project) -> bool: return project.get("with-sources", False) -def shouldGenerateSummaries(project: Project) -> bool: +def should_generate_summaries(project: Project) -> bool: return project.get("with-summaries", False) @@ -214,9 +214,9 @@ def generate_models(config, project: Project, database_dir: str) -> None: generator = mad.Generator(language) # Note: The argument parser converts with-sinks to with_sinks, etc. - generator.generateSinks = shouldGenerateSinks(project) - generator.generateSources = shouldGenerateSources(project) - generator.generateSummaries = shouldGenerateSummaries(project) + generator.generateSinks = should_generate_sinks(project) + generator.generateSources = should_generate_sources(project) + generator.generateSummaries = should_generate_summaries(project) generator.setenvironment(database=database_dir, folder=name) generator.run() From 7cb9024cc620a2ecd9b7f41e8a3224e56c0cb8f3 Mon Sep 17 00:00:00 2001 From: Mathias Vorreiter Pedersen Date: Fri, 30 May 2025 13:33:24 +0100 Subject: [PATCH 22/22] Bulk generator: Flip default values for summaries, sources, and sinks. 
--- cpp/misc/bulk_generation_targets.json | 4 +- .../models-as-data/bulk_generate_mad.py | 6 +- rust/misc/bulk_generation_targets.json | 65 ++++--------------- 3 files changed, 18 insertions(+), 57 deletions(-) diff --git a/cpp/misc/bulk_generation_targets.json b/cpp/misc/bulk_generation_targets.json index 4a11b1f8c6b1..4cddef005b2f 100644 --- a/cpp/misc/bulk_generation_targets.json +++ b/cpp/misc/bulk_generation_targets.json @@ -2,8 +2,8 @@ "strategy": "dca", "language": "cpp", "targets": [ - { "name": "openssl", "with-summaries": true }, - { "name": "sqlite", "with-summaries": true } + { "name": "openssl", "with-sources": false, "with-sinks": false }, + { "name": "sqlite", "with-sources": false, "with-sinks": false } ], "destination": "cpp/ql/lib/ext/generated" } \ No newline at end of file diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 61e66ffef12d..22a872dc2bf2 100644 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -47,15 +47,15 @@ class Project(TypedDict): def should_generate_sinks(project: Project) -> bool: - return project.get("with-sinks", False) + return project.get("with-sinks", True) def should_generate_sources(project: Project) -> bool: - return project.get("with-sources", False) + return project.get("with-sources", True) def should_generate_summaries(project: Project) -> bool: - return project.get("with-summaries", False) + return project.get("with-summaries", True) def clone_project(project: Project) -> str: diff --git a/rust/misc/bulk_generation_targets.json b/rust/misc/bulk_generation_targets.json index 4591042b1401..274d5dc5b361 100644 --- a/rust/misc/bulk_generation_targets.json +++ b/rust/misc/bulk_generation_targets.json @@ -5,106 +5,67 @@ { "name": "libc", "git_repo": "https://github.com/rust-lang/libc", - "git_tag": "0.2.172", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": 
"0.2.172" }, { "name": "log", "git_repo": "https://github.com/rust-lang/log", - "git_tag": "0.4.27", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "0.4.27" }, { "name": "memchr", "git_repo": "https://github.com/BurntSushi/memchr", - "git_tag": "2.7.4", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "2.7.4" }, { "name": "once_cell", "git_repo": "https://github.com/matklad/once_cell", - "git_tag": "v1.21.3", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "v1.21.3" }, { "name": "rand", "git_repo": "https://github.com/rust-random/rand", - "git_tag": "0.9.1", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "0.9.1" }, { "name": "smallvec", "git_repo": "https://github.com/servo/rust-smallvec", - "git_tag": "v1.15.0", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "v1.15.0" }, { "name": "serde", "git_repo": "https://github.com/serde-rs/serde", - "git_tag": "v1.0.219", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "v1.0.219" }, { "name": "tokio", "git_repo": "https://github.com/tokio-rs/tokio", - "git_tag": "tokio-1.45.0", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "tokio-1.45.0" }, { "name": "reqwest", "git_repo": "https://github.com/seanmonstar/reqwest", - "git_tag": "v0.12.15", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "v0.12.15" }, { "name": "rocket", "git_repo": "https://github.com/SergioBenitez/Rocket", - "git_tag": "v0.5.1", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "v0.5.1" }, { "name": "actix-web", "git_repo": "https://github.com/actix/actix-web", - "git_tag": "web-v4.11.0", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "web-v4.11.0" }, { "name": "hyper", "git_repo": 
"https://github.com/hyperium/hyper", - "git_tag": "v1.6.0", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "v1.6.0" }, { "name": "clap", "git_repo": "https://github.com/clap-rs/clap", - "git_tag": "v4.5.38", - "with-sources": true, - "with-sinks": true, - "with-summaries": true + "git_tag": "v4.5.38" } ], "destination": "rust/ql/lib/ext/generated",