diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 72e4ea4b3..3e4166b19 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -44,6 +44,7 @@ from vulnerabilities.pipelines.v2_importers import aosp_importer as aosp_importer_v2 from vulnerabilities.pipelines.v2_importers import apache_httpd_importer as apache_httpd_v2 from vulnerabilities.pipelines.v2_importers import archlinux_importer as archlinux_importer_v2 +from vulnerabilities.pipelines.v2_importers import collect_fix_commits as collect_fix_commits_v2 from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2 from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, @@ -135,5 +136,43 @@ ubuntu_usn.UbuntuUSNImporter, fireeye.FireyeImporter, oss_fuzz.OSSFuzzImporter, + collect_fix_commits_v2.CollectLinuxFixCommitsPipeline, + collect_fix_commits_v2.CollectBusyBoxFixCommitsPipeline, + collect_fix_commits_v2.CollectNginxFixCommitsPipeline, + collect_fix_commits_v2.CollectApacheTomcatFixCommitsPipeline, + collect_fix_commits_v2.CollectMysqlServerFixCommitsPipeline, + collect_fix_commits_v2.CollectPostgresqlFixCommitsPipeline, + collect_fix_commits_v2.CollectMongodbFixCommitsPipeline, + collect_fix_commits_v2.CollectRedisFixCommitsPipeline, + collect_fix_commits_v2.CollectSqliteFixCommitsPipeline, + collect_fix_commits_v2.CollectPhpFixCommitsPipeline, + collect_fix_commits_v2.CollectPythonCpythonFixCommitsPipeline, + collect_fix_commits_v2.CollectRubyFixCommitsPipeline, + collect_fix_commits_v2.CollectGoFixCommitsPipeline, + collect_fix_commits_v2.CollectNodeJsFixCommitsPipeline, + collect_fix_commits_v2.CollectRustFixCommitsPipeline, + collect_fix_commits_v2.CollectOpenjdkFixCommitsPipeline, + collect_fix_commits_v2.CollectSwiftFixCommitsPipeline, + collect_fix_commits_v2.CollectOpensslFixCommitsPipeline, + 
collect_fix_commits_v2.CollectDjangoFixCommitsPipeline, + collect_fix_commits_v2.CollectRailsFixCommitsPipeline, + collect_fix_commits_v2.CollectLaravelFixCommitsPipeline, + collect_fix_commits_v2.CollectSpringFrameworkFixCommitsPipeline, + collect_fix_commits_v2.CollectReactFixCommitsPipeline, + collect_fix_commits_v2.CollectAngularFixCommitsPipeline, + collect_fix_commits_v2.CollectWordpressFixCommitsPipeline, + collect_fix_commits_v2.CollectDockerMobyFixCommitsPipeline, + collect_fix_commits_v2.CollectKubernetesFixCommitsPipeline, + collect_fix_commits_v2.CollectQemuFixCommitsPipeline, + collect_fix_commits_v2.CollectXenProjectFixCommitsPipeline, + collect_fix_commits_v2.CollectVirtualboxFixCommitsPipeline, + collect_fix_commits_v2.CollectContainerdFixCommitsPipeline, + collect_fix_commits_v2.CollectAnsibleFixCommitsPipeline, + collect_fix_commits_v2.CollectTerraformFixCommitsPipeline, + collect_fix_commits_v2.CollectWiresharkFixCommitsPipeline, + collect_fix_commits_v2.CollectTcpdumpFixCommitsPipeline, + collect_fix_commits_v2.CollectGitFixCommitsPipeline, + collect_fix_commits_v2.CollectJenkinsFixCommitsPipeline, + collect_fix_commits_v2.CollectGitlabFixCommitsPipeline, ] ) diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py index 9efd58c05..d263e3b0b 100644 --- a/vulnerabilities/pipelines/__init__.py +++ b/vulnerabilities/pipelines/__init__.py @@ -8,7 +8,11 @@ # import logging +import re +import shutil +import tempfile import traceback +from collections import defaultdict from datetime import datetime from datetime import timezone from timeit import default_timer as timer @@ -19,8 +23,12 @@ from aboutcode.pipeline import LoopProgress from aboutcode.pipeline import PipelineDefinition from aboutcode.pipeline import humanize_time +from git import Repo +from packageurl.contrib.url2purl import url2purl from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import AffectedPackageV2 +from 
class CollectVCSFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.

    Subclasses provide ``repo_url`` (and a ``pipeline_id``). The repository is
    cloned, every commit returned by ``iter_commits("--all")`` is scanned for
    vulnerability identifiers matching ``patterns``, and one AdvisoryData is
    yielded per identifier, listing the commits that mention it as fix commits.
    """

    # URL of the git repository to clone and scan.
    repo_url: str

    # Regular expressions, applied case-insensitively to commit messages, that
    # recognize vulnerability identifiers. The word boundaries keep a longer
    # token from being truncated into a look-alike identifier.
    patterns: list[str] = [
        r"\bCVE-\d{4}-\d{4,19}\b",
        r"\bGHSA-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}-[2-9cfghjmpqrvwx]{4}\b",
    ]

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, collect and store, clean up."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Clone ``repo_url`` as a bare, blob-less clone into a temporary directory."""
        # Record the directory *before* cloning: if clone_from() raises,
        # ``self.repo`` is never assigned and clean_downloads() would otherwise
        # have no way to find and remove the directory.
        self.clone_dir = tempfile.mkdtemp()
        self.repo = Repo.clone_from(
            url=self.repo_url,
            to_path=self.clone_dir,
            bare=True,
            no_checkout=True,
            multi_options=["--filter=blob:none"],
        )

    def advisories_count(self) -> int:
        """Return 0: the count is only known once the commit history has been scanned."""
        return 0

    @staticmethod
    def _get_commit_message(commit) -> str:
        """Return the message of ``commit`` as text, decoding it if it is bytes."""
        message = commit.message
        # NOTE(review): GitPython types ``Commit.message`` as ``str | bytes``;
        # decode defensively so the regex search never mixes str and bytes.
        if isinstance(message, bytes):
            message = message.decode("utf-8", errors="replace")
        return message or ""

    @staticmethod
    def _normalize_vulnerability_id(vulnerability_id: str) -> str:
        """
        Return ``vulnerability_id`` in a single spelling for the schemes matched
        by the default patterns: upper-case for CVE, and an upper-case ``GHSA-``
        prefix with a lower-case remainder for GHSA. Other identifiers (from
        patterns added by a subclass) are returned unchanged.
        """
        upper_id = vulnerability_id.upper()
        if upper_id.startswith("CVE-"):
            return upper_id
        if upper_id.startswith("GHSA-"):
            return "GHSA-" + vulnerability_id[len("GHSA-") :].lower()
        return vulnerability_id

    def extract_vulnerability_id(self, commit) -> list[str]:
        """
        Extract vulnerability ids from a commit message.

        Return the list of matched vulnerability IDs, without duplicates, in
        pattern order and then in order of appearance in the message. Matching
        is case-insensitive; CVE and GHSA ids are returned in one normalized
        spelling so that differently-cased mentions group together.
        """
        message = self._get_commit_message(commit)
        # A dict is used as an insertion-ordered set.
        vulnerability_ids = {}
        for pattern in self.patterns:
            # finditer + group(0) always yields the full match, even when a
            # pattern supplied by a subclass contains capturing groups.
            for match in re.finditer(pattern, message, flags=re.IGNORECASE):
                vulnerability_ids[self._normalize_vulnerability_id(match.group(0))] = None
        return list(vulnerability_ids)

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability identifiers.

        Return a mapping of ``vuln_id`` to a list of ``(commit_id, commit_message)``
        tuples. Commits whose message contains no identifier are skipped.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.extract_vulnerability_id(commit)
            if not matched_ids:
                continue

            commit_id = commit.hexsha
            commit_message = self._get_commit_message(commit).strip()

            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append((commit_id, commit_message))

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Generate AdvisoryData objects for each vulnerability ID grouped with its related commits.

        Yield one AdvisoryData per vulnerability ID, whose single affected
        package is the purl inferred from ``repo_url`` and whose fix commits are
        the distinct commits that mention the ID.
        """
        self.log("Generating AdvisoryData objects from grouped commits.")

        purl = url2purl(self.repo_url)
        if not purl:
            # NOTE(review): url2purl is documented to return None when it cannot
            # infer a purl; without a package there is nothing to attach the
            # commits to, so stop before the (expensive) history walk.
            self.log(f"Cannot infer a package URL from {self.repo_url!r}; nothing to collect.")
            return

        grouped_commits = self.collect_fix_commits()

        for vuln_id, commits_data in grouped_commits.items():
            if not commits_data or not vuln_id:
                continue

            # De-duplicate while keeping traversal order: iterating a plain set
            # would emit the patches in a different order on every run.
            commit_hashes = dict.fromkeys(commit_hash for commit_hash, _ in commits_data)
            affected_packages = [
                AffectedPackageV2(
                    package=purl,
                    fixed_by_commit_patches=[
                        PackageCommitPatchData(vcs_url=self.repo_url, commit_hash=commit_hash)
                        for commit_hash in commit_hashes
                    ],
                )
            ]

            yield AdvisoryData(
                advisory_id=vuln_id,
                affected_packages=affected_packages,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """
        Cleanup any temporary repository data.

        Safe to call when cloning never happened or failed part-way, and safe to
        call more than once (it runs both as a step and from ``on_failure``).
        """
        self.log("Cleaning up local repository resources.")
        clone_dir = getattr(self, "clone_dir", None)
        repo = getattr(self, "repo", None)
        if repo is not None:
            # Fall back to the repository's own directory when ``repo`` was
            # assigned without going through clone().
            clone_dir = clone_dir or repo.working_dir
            # Release GitPython's handles before deleting the files under them.
            repo.close()
        if clone_dir:
            # Best-effort: the directory may already be gone on a repeated call.
            shutil.rmtree(path=clone_dir, ignore_errors=True)
        self.clone_dir = None

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
        self.clean_downloads()
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

# Each pipeline below only binds a ``pipeline_id`` and a ``repo_url``; all the
# behavior comes from CollectVCSFixCommitPipeline.

from vulnerabilities.pipelines import CollectVCSFixCommitPipeline


class CollectLinuxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Linux kernel (torvalds) git repository."""

    pipeline_id = "collect_linux_fix_commits"
    repo_url = "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git"


class CollectBusyBoxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the mirror/busybox GitHub repository."""

    pipeline_id = "collect_busybox_fix_commits"
    repo_url = "https://github.com/mirror/busybox.git"


class CollectNginxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the nginx git repository."""

    pipeline_id = "collect_nginx_fix_commits"
    repo_url = "https://github.com/nginx/nginx.git"


class CollectApacheTomcatFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Apache Tomcat git repository."""

    pipeline_id = "collect_apache_tomcat_fix_commits"
    repo_url = "https://github.com/apache/tomcat.git"


class CollectMysqlServerFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the MySQL Server git repository."""

    pipeline_id = "collect_mysql_server_fix_commits"
    repo_url = "https://github.com/mysql/mysql-server.git"


class CollectPostgresqlFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the PostgreSQL git repository."""

    pipeline_id = "collect_postgresql_fix_commits"
    repo_url = "https://github.com/postgres/postgres.git"


class CollectMongodbFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the MongoDB git repository."""

    pipeline_id = "collect_mongodb_fix_commits"
    repo_url = "https://github.com/mongodb/mongo.git"


class CollectRedisFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Redis git repository."""

    pipeline_id = "collect_redis_fix_commits"
    repo_url = "https://github.com/redis/redis.git"


class CollectSqliteFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the sqlite/sqlite GitHub repository."""

    pipeline_id = "collect_sqlite_fix_commits"
    repo_url = "https://github.com/sqlite/sqlite.git"


class CollectPhpFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the PHP (php-src) git repository."""

    pipeline_id = "collect_php_fix_commits"
    repo_url = "https://github.com/php/php-src.git"


class CollectPythonCpythonFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the CPython git repository."""

    pipeline_id = "collect_python_cpython_fix_commits"
    repo_url = "https://github.com/python/cpython.git"


class CollectRubyFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Ruby git repository."""

    pipeline_id = "collect_ruby_fix_commits"
    repo_url = "https://github.com/ruby/ruby.git"


class CollectGoFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Go git repository."""

    pipeline_id = "collect_go_fix_commits"
    repo_url = "https://github.com/golang/go.git"


class CollectNodeJsFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Node.js git repository."""

    pipeline_id = "collect_node_js_fix_commits"
    repo_url = "https://github.com/nodejs/node.git"


class CollectRustFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Rust git repository."""

    pipeline_id = "collect_rust_fix_commits"
    repo_url = "https://github.com/rust-lang/rust.git"


class CollectOpenjdkFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the OpenJDK (jdk) git repository."""

    pipeline_id = "collect_openjdk_fix_commits"
    repo_url = "https://github.com/openjdk/jdk.git"


class CollectSwiftFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Swift git repository."""

    pipeline_id = "collect_swift_fix_commits"
    repo_url = "https://github.com/swiftlang/swift.git"


class CollectOpensslFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the OpenSSL git repository."""

    pipeline_id = "collect_openssl_fix_commits"
    repo_url = "https://github.com/openssl/openssl.git"


class CollectDjangoFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Django git repository."""

    pipeline_id = "collect_django_fix_commits"
    repo_url = "https://github.com/django/django.git"


class CollectRailsFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Rails git repository."""

    pipeline_id = "collect_rails_fix_commits"
    repo_url = "https://github.com/rails/rails.git"


class CollectLaravelFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Laravel framework git repository."""

    pipeline_id = "collect_laravel_fix_commits"
    repo_url = "https://github.com/laravel/framework.git"


class CollectSpringFrameworkFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Spring Framework git repository."""

    pipeline_id = "collect_spring_framework_fix_commits"
    repo_url = "https://github.com/spring-projects/spring-framework.git"


class CollectReactFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the React git repository."""

    pipeline_id = "collect_react_fix_commits"
    repo_url = "https://github.com/facebook/react.git"


class CollectAngularFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Angular git repository."""

    pipeline_id = "collect_angular_fix_commits"
    repo_url = "https://github.com/angular/angular.git"


class CollectWordpressFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the WordPress/WordPress GitHub repository."""

    pipeline_id = "collect_wordpress_fix_commits"
    repo_url = "https://github.com/WordPress/WordPress.git"


class CollectDockerMobyFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Moby (Docker) git repository."""

    pipeline_id = "collect_docker_moby_fix_commits"
    repo_url = "https://github.com/moby/moby.git"


class CollectKubernetesFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Kubernetes git repository."""

    pipeline_id = "collect_kubernetes_fix_commits"
    repo_url = "https://github.com/kubernetes/kubernetes.git"


class CollectQemuFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the QEMU GitLab repository."""

    pipeline_id = "collect_qemu_fix_commits"
    repo_url = "https://gitlab.com/qemu-project/qemu.git"


class CollectXenProjectFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Xen Project git repository."""

    pipeline_id = "collect_xen_project_fix_commits"
    repo_url = "https://github.com/xen-project/xen.git"


class CollectVirtualboxFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the mirror/vbox GitHub repository."""

    pipeline_id = "collect_virtualbox_fix_commits"
    repo_url = "https://github.com/mirror/vbox.git"


class CollectContainerdFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the containerd git repository."""

    pipeline_id = "collect_containerd_fix_commits"
    repo_url = "https://github.com/containerd/containerd.git"


class CollectAnsibleFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Ansible git repository."""

    pipeline_id = "collect_ansible_fix_commits"
    repo_url = "https://github.com/ansible/ansible.git"


class CollectTerraformFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Terraform git repository."""

    pipeline_id = "collect_terraform_fix_commits"
    repo_url = "https://github.com/hashicorp/terraform.git"


class CollectWiresharkFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Wireshark GitLab repository."""

    pipeline_id = "collect_wireshark_fix_commits"
    repo_url = "https://gitlab.com/wireshark/wireshark.git"


class CollectTcpdumpFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the tcpdump git repository."""

    pipeline_id = "collect_tcpdump_fix_commits"
    repo_url = "https://github.com/the-tcpdump-group/tcpdump.git"


class CollectGitFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Git git repository."""

    pipeline_id = "collect_git_fix_commits"
    repo_url = "https://github.com/git/git.git"


class CollectJenkinsFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the Jenkins git repository."""

    pipeline_id = "collect_jenkins_fix_commits"
    repo_url = "https://github.com/jenkinsci/jenkins.git"


class CollectGitlabFixCommitsPipeline(CollectVCSFixCommitPipeline):
    """Collect fix commits from the GitLab FOSS git repository."""

    pipeline_id = "collect_gitlab_fix_commits"
    repo_url = "https://gitlab.com/gitlab-org/gitlab-foss.git"
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from pathlib import Path
from unittest import TestCase
from unittest.mock import MagicMock

import pytest

from vulnerabilities.pipelines import CollectVCSFixCommitPipeline
from vulnerabilities.tests import util_tests

TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "fix_commits"


class DummyCommit:
    """Minimal stand-in for a git commit: only ``message`` is read by the code under test."""

    def __init__(self, message):
        self.message = message


@pytest.fixture
def pipeline():
    """Return a pipeline bound to a fake repository URL, with logging silenced."""
    pipeline = CollectVCSFixCommitPipeline()
    pipeline.repo_url = "https://github.com/test/repo"
    pipeline.pipeline_id = "collect_repo_fix_commits"
    pipeline.log = MagicMock()
    return pipeline


def test_extract_vulnerability_id_extracts_ids(pipeline):
    """Both a CVE and a GHSA id are extracted from one message, CVE first."""
    commit = DummyCommit("Fix for CVE-2023-1234 and GHSA-2479-qvv7-47qq")

    result = pipeline.extract_vulnerability_id(commit)

    assert result == ["CVE-2023-1234", "GHSA-2479-qvv7-47qq"]


def test_collect_fix_commits_groups_by_vuln(pipeline):
    """Commits are grouped under the ids found in their messages; others are dropped."""
    commit1 = MagicMock(message="Fix CVE-2021-0001", hexsha="abc123")
    commit2 = MagicMock(message="Patch GHSA-f72r-2h5j-7639", hexsha="def456")
    commit3 = MagicMock(message="Unrelated change", hexsha="ghi789")

    # No clone happens here: the repository is replaced by a mock whose history
    # is the three commits above, and the real id extraction is exercised.
    pipeline.repo = MagicMock()
    pipeline.repo.iter_commits.return_value = [commit1, commit2, commit3]

    grouped = pipeline.collect_fix_commits()

    expected = {
        "CVE-2021-0001": [("abc123", "Fix CVE-2021-0001")],
        "GHSA-f72r-2h5j-7639": [("def456", "Patch GHSA-f72r-2h5j-7639")],
    }

    assert grouped == expected
    pipeline.repo.iter_commits.assert_called_once_with("--all")


class TestRepoFixCommitPipeline(TestCase):
    def test_collect_advisories_from_json(self):
        """Advisories built from pre-grouped commits match the expected JSON."""
        input_file = TEST_DATA / "grouped_commits_input.json"
        expected_file = TEST_DATA / "expected_linux_advisory_output.json"

        grouped_commits = json.loads(input_file.read_text(encoding="utf-8"))

        pipeline = CollectVCSFixCommitPipeline()
        pipeline.repo_url = "https://github.com/test/repo"
        pipeline.log = MagicMock()
        pipeline.collect_fix_commits = MagicMock(return_value=grouped_commits)

        result = [adv.to_dict() for adv in pipeline.collect_advisories()]

        util_tests.check_results_against_json(result, expected_file, True)


@pytest.mark.parametrize(
    "commit_message, expected_ids",
    [
        ("Fix CVE-2023-12345 buffer overflow", ["CVE-2023-12345"]),
        ("Address GHSA-4486-gxhx-5mg7 report", ["GHSA-4486-gxhx-5mg7"]),
        (
            "Fix CVE-2023-1111 and GHSA-gch2-phqh-fg9q in kernel",
            ["CVE-2023-1111", "GHSA-gch2-phqh-fg9q"],
        ),
        ("Refactor logging system with no security ID", []),
    ],
)
def test_extract_vulnerability_id_detects_vuln_ids(pipeline, commit_message, expected_ids):
    """Ensure extract_vulnerability_id correctly extracts vulnerability IDs."""
    result = pipeline.extract_vulnerability_id(DummyCommit(commit_message))

    assert result == expected_ids, f"Unexpected result for message: {commit_message}"


def test_extract_vulnerability_id_lowercase_context(pipeline):
    """Ids are found in a message whose surrounding text is lower-case."""
    commit = DummyCommit("fix CVE-2022-9999 and GHSA-gqgv-6jq5-jjj9")

    result = pipeline.extract_vulnerability_id(commit)

    assert result == ["CVE-2022-9999", "GHSA-gqgv-6jq5-jjj9"]
"subpath": "" + }, + "affected_version_range": null, + "fixed_version_range": null, + "introduced_by_commit_patches": [], + "fixed_by_commit_patches": [ + { + "vcs_url": "https://github.com/test/repo", + "commit_hash": "41b43c74bda19753c757036673ea9db74acf494a", + "patch_text": null, + "patch_checksum": null + } + ] + } + ], + "references_v2": [], + "patches": [], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "GHSA-dead-beef-baad", + "aliases": [], + "summary": "", + "affected_packages": [ + { + "package": { + "type": "github", + "namespace": "test", + "name": "repo", + "version": "", + "qualifiers": "", + "subpath": "" + }, + "affected_version_range": null, + "fixed_version_range": null, + "introduced_by_commit_patches": [], + "fixed_by_commit_patches": [ + { + "vcs_url": "https://github.com/test/repo", + "commit_hash": "49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba", + "patch_text": null, + "patch_checksum": null + } + ] + } + ], + "references_v2": [], + "patches": [], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + } +] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json new file mode 100644 index 000000000..f905c9710 --- /dev/null +++ b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json @@ -0,0 +1,8 @@ +{ + "CVE-2021-0001": [ + ["41b43c74bda19753c757036673ea9db74acf494a", "Fixed CVE-2025-59681 -- Protected QuerySet.annotate(), alias(), aggregate(), and extra() against SQL injection in column aliases on MySQL/MariaDB."] + ], + "GHSA-dead-beef-baad": [ + ["49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba", "Fixed CVE-2024-53907 -- Mitigated potential DoS in strip_tags()."] + ] +} \ No newline at end of file