|
| 1 | +# |
| 2 | +# Copyright (c) nexB Inc. and others. All rights reserved. |
| 3 | +# VulnerableCode is a trademark of nexB Inc. |
| 4 | +# SPDX-License-Identifier: Apache-2.0 |
| 5 | +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. |
| 6 | +# See https://github.com/aboutcode-org/vulnerablecode for support or download. |
| 7 | +# See https://aboutcode.org for more information about nexB OSS projects. |
| 8 | +# |
| 9 | + |
| 10 | +from unittest import TestCase |
| 11 | + |
| 12 | +import pytest |
| 13 | +from django.test import TestCase as DjangoTestCase |
| 14 | +from packageurl import PackageURL |
| 15 | +from univers.version_range import VersionRange |
| 16 | + |
| 17 | +from vulnerabilities.importer import AdvisoryDataV2 |
| 18 | +from vulnerabilities.importer import AffectedPackageV2 |
| 19 | +from vulnerabilities.importer import ReferenceV2 |
| 20 | +from vulnerabilities.models import AdvisoryV2 |
| 21 | +from vulnerabilities.pipes.advisory import insert_advisory_v2 |
| 22 | +from vulnerabilities.utils import SUMMARY_SIMILARITY_THRESHOLD |
| 23 | +from vulnerabilities.utils import _find |
| 24 | +from vulnerabilities.utils import _union |
| 25 | +from vulnerabilities.utils import compute_summary_similarity |
| 26 | +from vulnerabilities.utils import group_advisories_by_content |
| 27 | + |
| 28 | + |
class TestComputeSummarySimilarity(TestCase):
    """Checks for ``compute_summary_similarity`` on empty, equal and overlapping summaries."""

    def test_empty_texts_return_zero(self):
        """An empty or ``None`` summary on either side yields 0.0."""
        blank_pairs = (
            ("", "some text"),
            ("some text", ""),
            ("", ""),
            (None, "text"),
            ("text", None),
        )
        for left, right in blank_pairs:
            assert compute_summary_similarity(left, right) == 0.0

    def test_identical_texts_return_one(self):
        """Comparing a summary with itself yields exactly 1.0."""
        summary = "A critical vulnerability in the AccessControl module."
        assert compute_summary_similarity(summary, summary) == 1.0

    def test_identical_after_normalization(self):
        """Summaries differing only in letter case and outer whitespace score 1.0."""
        plain = "Security flaw in module"
        padded_titlecase = " Security Flaw In Module "
        assert compute_summary_similarity(plain, padded_titlecase) == 1.0

    def test_completely_different_texts(self):
        """Unrelated summaries stay below the grouping threshold."""
        score = compute_summary_similarity(
            "Buffer overflow in network stack",
            "Unrelated cooking recipe for chocolate cake",
        )
        assert score < SUMMARY_SIMILARITY_THRESHOLD

    def test_short_summary_contained_in_long_summary(self):
        """A short summary that is a prefix of a longer one reaches the threshold."""
        brief = (
            "The module AccessControl defines security policies for "
            "Python code used in restricted code within Zope applications."
        )
        detailed = (
            "The module AccessControl defines security policies for "
            "Python code used in restricted code within Zope applications. "
            "Restricted code is any code that resides in Zope's object database, "
            "such as the contents of Script (Python) objects. The policies "
            "defined in AccessControl severely restrict access to Python modules "
            "and only exempt a few that are deemed safe."
        )
        score = compute_summary_similarity(brief, detailed)
        assert score >= SUMMARY_SIMILARITY_THRESHOLD

    def test_similar_summaries_above_threshold(self):
        """Summaries that differ by a few filler words reach the threshold."""
        terse = "SQL injection vulnerability in login form of web application"
        wordy = "SQL injection vulnerability found in the login form of the web application"
        score = compute_summary_similarity(terse, wordy)
        assert score >= SUMMARY_SIMILARITY_THRESHOLD

    def test_partially_overlapping_summaries(self):
        """Summaries sharing most key words score above 0.4."""
        first = "Remote code execution via crafted XML payload"
        second = "Remote code execution through specially crafted XML input"
        score = compute_summary_similarity(first, second)
        assert score > 0.4

    def test_symmetry(self):
        """Swapping the two arguments does not change the score."""
        text1 = "Cross-site scripting in admin panel"
        text2 = "XSS vulnerability found in the administration panel"
        forward = compute_summary_similarity(text1, text2)
        backward = compute_summary_similarity(text2, text1)
        assert forward == backward
| 87 | + |
| 88 | + |
class TestUnionFind(TestCase):
    """Checks for the ``_find`` / ``_union`` helpers on small parent lists."""

    def test_find_on_singleton(self):
        """Every node that is its own parent is reported as its own root."""
        forest = [0, 1, 2]
        for node in (0, 2):
            assert _find(forest, node) == node

    def test_union_merges_two_sets(self):
        """After a union, both nodes resolve to the same root."""
        forest = [0, 1, 2, 3]
        _union(forest, 0, 1)
        assert _find(forest, 0) == _find(forest, 1)

    def test_transitive_union(self):
        """Chained unions join 0..3 under one root and leave node 4 alone."""
        forest = [0, 1, 2, 3, 4]
        for left, right in ((0, 1), (2, 3), (1, 3)):
            _union(forest, left, right)
        shared_root = _find(forest, 0)
        for member in (1, 2, 3):
            assert _find(forest, member) == shared_root
        assert _find(forest, 4) == 4

    def test_path_compression(self):
        """Resolving the tail of the chain 3 -> 2 -> 1 -> 0 returns root 0."""
        chain = [0, 0, 1, 2]
        assert _find(chain, 3) == 0
        # NOTE(review): the right-hand side repeats the lookup asserted just
        # above, so this presumably passes even when chain[3] was not rewritten
        # to 0 -- confirm against the _find implementation before tightening.
        assert chain[3] == 0 or _find(chain, 3) == 0
| 117 | + |
| 118 | + |
@pytest.mark.django_db
class TestGroupAdvisoriesByContent(DjangoTestCase):
    """Checks for ``group_advisories_by_content`` against advisories stored in the test DB."""

    @staticmethod
    def _members(group):
        """Return the primary and secondary advisories of *group* as one set."""
        return {group["primary"]} | group["secondary"]

    def _create_advisory(self, advisory_id, datasource_id, summary, aliases=None, url=None):
        """
        Insert one advisory via ``insert_advisory_v2`` and return the matching
        ``AdvisoryV2`` looked up by ``datasource_id`` and ``advisory_id``.

        Every advisory gets the same single affected package and one reference;
        ``aliases`` defaults to an empty list and ``url`` to an example.com URL.
        """
        purl = PackageURL(type="pypi", name="accesscontrol")
        affected_range = VersionRange.from_string("vers:pypi/>=4.0|<4.3")
        affected = AffectedPackageV2(package=purl, affected_version_range=affected_range)
        reference = ReferenceV2(url=url or "https://example.com")
        advisory_data = AdvisoryDataV2(
            advisory_id=advisory_id,
            aliases=aliases or [],
            summary=summary,
            affected_packages=[affected],
            references=[reference],
            url=url or f"https://example.com/{advisory_id}",
        )
        insert_advisory_v2(advisory=advisory_data, pipeline_id=datasource_id)
        return AdvisoryV2.objects.get(datasource_id=datasource_id, advisory_id=advisory_id)

    def test_group_by_exact_content_same_hash(self):
        """Same id and summary from two datasources end up in a single group."""
        adv1 = self._create_advisory(
            advisory_id="ADV-001",
            datasource_id="source_a",
            summary="Identical summary",
            url="https://source-a.example.com/ADV-001",
        )
        adv2 = self._create_advisory(
            advisory_id="ADV-001",
            datasource_id="source_b",
            summary="Identical summary",
            url="https://source-b.example.com/ADV-001",
        )
        grouped = group_advisories_by_content([adv1, adv2])
        assert len(grouped) == 1
        members = self._members(next(iter(grouped.values())))
        assert adv1 in members
        assert adv2 in members

    def test_different_content_no_alias_no_similarity(self):
        """Advisories with no alias and unrelated summaries stay in separate groups."""
        adv1 = self._create_advisory(
            advisory_id="ADV-100",
            datasource_id="source_a",
            summary="Buffer overflow in network stack",
            url="https://example.com/ADV-100",
        )
        adv2 = self._create_advisory(
            advisory_id="ADV-200",
            datasource_id="source_b",
            summary="Unrelated cooking instructions for pizza dough",
            url="https://example.com/ADV-200",
        )
        grouped = group_advisories_by_content([adv1, adv2])
        assert len(grouped) == 2

    def test_group_by_shared_alias(self):
        """Advisories with different summaries but the same aliases form one group."""
        shared_aliases = ["CVE-2021-32807", "GHSA-qcx9-j53g-ccgf"]
        adv1 = self._create_advisory(
            advisory_id="CVE-2021-32807",
            datasource_id="gitlab_importer_v2",
            summary="Improperly Controlled Modification of Dynamically-Determined Object Attributes",
            aliases=list(shared_aliases),
            url="https://gitlab.com/gitlab-org/advisories-community/-/blob/main/pypi/AccessControl/CVE-2021-32807.yml",
        )
        adv2 = self._create_advisory(
            advisory_id="PYSEC-2021-335",
            datasource_id="pypa_importer_v2",
            summary=(
                "The module AccessControl defines security policies for Python code "
                "used in restricted code within Zope applications. Restricted code is "
                "any code that resides in Zopes object database."
            ),
            aliases=list(shared_aliases),
            url="https://github.com/pypa/advisory-database/blob/main/vulns/accesscontrol/PYSEC-2021-335.yaml",
        )
        grouped = group_advisories_by_content([adv1, adv2])
        assert len(grouped) == 1
        members = self._members(next(iter(grouped.values())))
        assert adv1 in members
        assert adv2 in members

    def test_alias_chain_merges_three_advisories(self):
        """A and C share no alias directly, yet B links them into one group."""
        adv_a = self._create_advisory(
            advisory_id="ADV-A",
            datasource_id="source_1",
            summary="Summary A about access control",
            aliases=["CVE-2099-0001"],
            url="https://example.com/a",
        )
        adv_b = self._create_advisory(
            advisory_id="ADV-B",
            datasource_id="source_2",
            summary="Summary B about restricted code",
            aliases=["CVE-2099-0001", "GHSA-xxxx-yyyy-zzzz"],
            url="https://example.com/b",
        )
        adv_c = self._create_advisory(
            advisory_id="ADV-C",
            datasource_id="source_3",
            summary="Summary C about Zope security",
            aliases=["GHSA-xxxx-yyyy-zzzz"],
            url="https://example.com/c",
        )
        grouped = group_advisories_by_content([adv_a, adv_b, adv_c])
        assert len(grouped) == 1

    def test_group_by_summary_similarity(self):
        """Alias-free advisories whose summaries differ by a short suffix form one group."""
        base_summary = (
            "SQL injection vulnerability in the login form of the web application "
            "allows remote attackers to execute arbitrary SQL commands"
        )
        variant_summary = (
            "SQL injection vulnerability in the login form of the web application "
            "allows remote attackers to execute arbitrary SQL commands via crafted input"
        )
        adv1 = self._create_advisory(
            advisory_id="ADV-SQL-1",
            datasource_id="src_x",
            summary=base_summary,
            url="https://example.com/sql1",
        )
        adv2 = self._create_advisory(
            advisory_id="ADV-SQL-2",
            datasource_id="src_y",
            summary=variant_summary,
            url="https://example.com/sql2",
        )
        grouped = group_advisories_by_content([adv1, adv2])
        assert len(grouped) == 1

    def test_highest_precedence_becomes_primary(self):
        """Within one group, the advisory with the larger precedence is the primary."""
        adv_low = self._create_advisory(
            advisory_id="ADV-P1",
            datasource_id="low_src",
            summary="Same summary here",
            aliases=["CVE-2099-9999"],
            url="https://example.com/p1",
        )
        adv_high = self._create_advisory(
            advisory_id="ADV-P2",
            datasource_id="high_src",
            summary="Same summary here",
            aliases=["CVE-2099-9999"],
            url="https://example.com/p2",
        )
        # Persist distinct precedence values before grouping.
        for advisory, rank in ((adv_low, 1), (adv_high, 10)):
            advisory.precedence = rank
            advisory.save()

        grouped = group_advisories_by_content([adv_low, adv_high])
        assert len(grouped) == 1
        group = next(iter(grouped.values()))
        assert group["primary"] == adv_high
        assert adv_low in group["secondary"]

    def test_empty_input(self):
        """An empty list produces an empty mapping."""
        result = group_advisories_by_content([])
        assert result == {}

    def test_single_advisory(self):
        """A lone advisory is the primary of its own group with no secondaries."""
        adv = self._create_advisory(
            advisory_id="SOLO-1",
            datasource_id="solo_src",
            summary="Lonely advisory",
            url="https://example.com/solo",
        )
        grouped = group_advisories_by_content([adv])
        assert len(grouped) == 1
        group = next(iter(grouped.values()))
        assert group["primary"] == adv
        assert group["secondary"] == set()

    def test_none_input(self):
        """``None`` is accepted and produces an empty mapping."""
        result = group_advisories_by_content(None)
        assert result == {}
0 commit comments