Skip to content

Commit aa26e19

Browse files
Enhance advisory grouping with alias and summary similarity
Signed-off-by: Dhirenderchoudhary <dhirenderchoudhary0001@gmail.com>
1 parent 94a9c8f commit aa26e19

2 files changed

Lines changed: 401 additions & 16 deletions

File tree

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
from unittest import TestCase
11+
12+
import pytest
13+
from django.test import TestCase as DjangoTestCase
14+
from packageurl import PackageURL
15+
from univers.version_range import VersionRange
16+
17+
from vulnerabilities.importer import AdvisoryDataV2
18+
from vulnerabilities.importer import AffectedPackageV2
19+
from vulnerabilities.importer import ReferenceV2
20+
from vulnerabilities.models import AdvisoryV2
21+
from vulnerabilities.pipes.advisory import insert_advisory_v2
22+
from vulnerabilities.utils import SUMMARY_SIMILARITY_THRESHOLD
23+
from vulnerabilities.utils import _find
24+
from vulnerabilities.utils import _union
25+
from vulnerabilities.utils import compute_summary_similarity
26+
from vulnerabilities.utils import group_advisories_by_content
27+
28+
29+
class TestComputeSummarySimilarity(TestCase):
    """Unit tests for ``compute_summary_similarity``."""

    def test_empty_texts_return_zero(self):
        """An empty or ``None`` text on either side must score exactly 0.0."""
        blank_pairs = (
            ("", "some text"),
            ("some text", ""),
            ("", ""),
            (None, "text"),
            ("text", None),
        )
        for first, second in blank_pairs:
            assert compute_summary_similarity(first, second) == 0.0

    def test_identical_texts_return_one(self):
        """Comparing a text with itself must score exactly 1.0."""
        summary = "A critical vulnerability in the AccessControl module."
        assert compute_summary_similarity(summary, summary) == 1.0

    def test_identical_after_normalization(self):
        """Texts differing only in letter case and outer whitespace score 1.0."""
        plain = "Security flaw in module"
        noisy = " Security Flaw In Module "
        assert compute_summary_similarity(plain, noisy) == 1.0

    def test_completely_different_texts(self):
        """Unrelated texts must stay below the grouping threshold."""
        security_text = "Buffer overflow in network stack"
        unrelated_text = "Unrelated cooking recipe for chocolate cake"
        score = compute_summary_similarity(security_text, unrelated_text)
        assert score < SUMMARY_SIMILARITY_THRESHOLD

    def test_short_summary_contained_in_long_summary(self):
        """A summary that is a prefix of a longer one must reach the threshold."""
        brief = (
            "The module AccessControl defines security policies for "
            "Python code used in restricted code within Zope applications."
        )
        extended = (
            "The module AccessControl defines security policies for "
            "Python code used in restricted code within Zope applications. "
            "Restricted code is any code that resides in Zope's object database, "
            "such as the contents of Script (Python) objects. The policies "
            "defined in AccessControl severely restrict access to Python modules "
            "and only exempt a few that are deemed safe."
        )
        score = compute_summary_similarity(brief, extended)
        assert score >= SUMMARY_SIMILARITY_THRESHOLD

    def test_similar_summaries_above_threshold(self):
        """Summaries differing by a few filler words must reach the threshold."""
        terse = "SQL injection vulnerability in login form of web application"
        wordy = "SQL injection vulnerability found in the login form of the web application"
        score = compute_summary_similarity(terse, wordy)
        assert score >= SUMMARY_SIMILARITY_THRESHOLD

    def test_partially_overlapping_summaries(self):
        """Summaries sharing most of their wording must score above 0.4."""
        original = "Remote code execution via crafted XML payload"
        reworded = "Remote code execution through specially crafted XML input"
        score = compute_summary_similarity(original, reworded)
        assert score > 0.4

    def test_symmetry(self):
        """Swapping the two arguments must not change the score."""
        text1 = "Cross-site scripting in admin panel"
        text2 = "XSS vulnerability found in the administration panel"
        forward = compute_summary_similarity(text1, text2)
        backward = compute_summary_similarity(text2, text1)
        assert forward == backward
87+
88+
89+
class TestUnionFind(TestCase):
    """Unit tests for the ``_find`` / ``_union`` disjoint-set helpers.

    Each test models the forest as a ``parent`` list where ``parent[i]`` is
    the parent of node ``i`` and a root node is its own parent.
    """

    def test_find_on_singleton(self):
        """A node that is its own parent is returned unchanged by ``_find``."""
        parent = [0, 1, 2]
        assert _find(parent, 0) == 0
        assert _find(parent, 2) == 2

    def test_union_merges_two_sets(self):
        """After ``_union`` of two nodes, both resolve to the same root."""
        parent = [0, 1, 2, 3]
        _union(parent, 0, 1)
        assert _find(parent, 0) == _find(parent, 1)

    def test_transitive_union(self):
        """Unions chain transitively; an untouched node stays in its own set."""
        parent = [0, 1, 2, 3, 4]
        _union(parent, 0, 1)
        _union(parent, 2, 3)
        # Bridges the {0, 1} and {2, 3} sets into one.
        _union(parent, 1, 3)
        root = _find(parent, 0)
        assert _find(parent, 1) == root
        assert _find(parent, 2) == root
        assert _find(parent, 3) == root
        # Node 4 was never part of any union.
        assert _find(parent, 4) == 4

    def test_path_compression(self):
        """``_find`` must shorten the chain it walks, not only return the root."""
        # Chain 3 -> 2 -> 1 -> 0, with node 0 as the root.
        parent = [0, 0, 1, 2]
        assert _find(parent, 3) == 0
        # Checking only the returned root would pass even without any
        # compression, so inspect the parent list itself: full compression,
        # path halving and path splitting all re-point node 3 away from its
        # original parent (node 2).
        assert parent[3] != 2
        # The rewritten links must still resolve to the same root.
        assert _find(parent, 3) == 0
117+
118+
119+
@pytest.mark.django_db
class TestGroupAdvisoriesByContent(DjangoTestCase):
    """Database-backed tests for ``group_advisories_by_content``."""

    def _create_advisory(self, advisory_id, datasource_id, summary, aliases=None, url=None):
        """Insert one advisory via ``insert_advisory_v2`` and return the stored row.

        Every advisory built here affects the same pypi package
        (``accesscontrol``) and version range; only the identifiers, summary,
        aliases and URLs vary between calls.
        """
        affected = AffectedPackageV2(
            package=PackageURL(type="pypi", name="accesscontrol"),
            affected_version_range=VersionRange.from_string("vers:pypi/>=4.0|<4.3"),
        )
        reference = ReferenceV2(url=url or "https://example.com")
        insert_advisory_v2(
            advisory=AdvisoryDataV2(
                advisory_id=advisory_id,
                aliases=aliases or [],
                summary=summary,
                affected_packages=[affected],
                references=[reference],
                url=url or f"https://example.com/{advisory_id}",
            ),
            pipeline_id=datasource_id,
        )
        # Read the advisory back so the tests work with the persisted model.
        return AdvisoryV2.objects.get(
            datasource_id=datasource_id,
            advisory_id=advisory_id,
        )

    def test_group_by_exact_content_same_hash(self):
        """Two sources publishing the same advisory content end up in one group."""
        from_source_a = self._create_advisory(
            advisory_id="ADV-001",
            datasource_id="source_a",
            summary="Identical summary",
            url="https://source-a.example.com/ADV-001",
        )
        from_source_b = self._create_advisory(
            advisory_id="ADV-001",
            datasource_id="source_b",
            summary="Identical summary",
            url="https://source-b.example.com/ADV-001",
        )
        grouped = group_advisories_by_content([from_source_a, from_source_b])
        assert len(grouped) == 1
        only_group = next(iter(grouped.values()))
        members = {only_group["primary"]} | only_group["secondary"]
        assert from_source_a in members
        assert from_source_b in members

    def test_different_content_no_alias_no_similarity(self):
        """Advisories with no shared alias and unrelated summaries stay apart."""
        overflow_advisory = self._create_advisory(
            advisory_id="ADV-100",
            datasource_id="source_a",
            summary="Buffer overflow in network stack",
            url="https://example.com/ADV-100",
        )
        unrelated_advisory = self._create_advisory(
            advisory_id="ADV-200",
            datasource_id="source_b",
            summary="Unrelated cooking instructions for pizza dough",
            url="https://example.com/ADV-200",
        )
        grouped = group_advisories_by_content([overflow_advisory, unrelated_advisory])
        assert len(grouped) == 2

    def test_group_by_shared_alias(self):
        """Advisories with different summaries are grouped when aliases match."""
        shared_aliases = ["CVE-2021-32807", "GHSA-qcx9-j53g-ccgf"]
        gitlab_advisory = self._create_advisory(
            advisory_id="CVE-2021-32807",
            datasource_id="gitlab_importer_v2",
            summary="Improperly Controlled Modification of Dynamically-Determined Object Attributes",
            aliases=list(shared_aliases),
            url="https://gitlab.com/gitlab-org/advisories-community/-/blob/main/pypi/AccessControl/CVE-2021-32807.yml",
        )
        pypa_advisory = self._create_advisory(
            advisory_id="PYSEC-2021-335",
            datasource_id="pypa_importer_v2",
            summary=(
                "The module AccessControl defines security policies for Python code "
                "used in restricted code within Zope applications. Restricted code is "
                "any code that resides in Zopes object database."
            ),
            aliases=list(shared_aliases),
            url="https://github.com/pypa/advisory-database/blob/main/vulns/accesscontrol/PYSEC-2021-335.yaml",
        )
        grouped = group_advisories_by_content([gitlab_advisory, pypa_advisory])
        assert len(grouped) == 1
        only_group = next(iter(grouped.values()))
        members = {only_group["primary"]} | only_group["secondary"]
        assert gitlab_advisory in members
        assert pypa_advisory in members

    def test_alias_chain_merges_three_advisories(self):
        """A and C share no alias directly but are linked through B's aliases."""
        adv_a = self._create_advisory(
            advisory_id="ADV-A",
            datasource_id="source_1",
            summary="Summary A about access control",
            aliases=["CVE-2099-0001"],
            url="https://example.com/a",
        )
        # B carries both aliases and therefore bridges A and C.
        adv_b = self._create_advisory(
            advisory_id="ADV-B",
            datasource_id="source_2",
            summary="Summary B about restricted code",
            aliases=["CVE-2099-0001", "GHSA-xxxx-yyyy-zzzz"],
            url="https://example.com/b",
        )
        adv_c = self._create_advisory(
            advisory_id="ADV-C",
            datasource_id="source_3",
            summary="Summary C about Zope security",
            aliases=["GHSA-xxxx-yyyy-zzzz"],
            url="https://example.com/c",
        )
        grouped = group_advisories_by_content([adv_a, adv_b, adv_c])
        assert len(grouped) == 1

    def test_group_by_summary_similarity(self):
        """Alias-free advisories with near-identical summaries are grouped."""
        base_summary = (
            "SQL injection vulnerability in the login form of the web application "
            "allows remote attackers to execute arbitrary SQL commands"
        )
        variant_summary = (
            "SQL injection vulnerability in the login form of the web application "
            "allows remote attackers to execute arbitrary SQL commands via crafted input"
        )
        base_advisory = self._create_advisory(
            advisory_id="ADV-SQL-1",
            datasource_id="src_x",
            summary=base_summary,
            url="https://example.com/sql1",
        )
        variant_advisory = self._create_advisory(
            advisory_id="ADV-SQL-2",
            datasource_id="src_y",
            summary=variant_summary,
            url="https://example.com/sql2",
        )
        grouped = group_advisories_by_content([base_advisory, variant_advisory])
        assert len(grouped) == 1

    def test_highest_precedence_becomes_primary(self):
        """Within a group, the advisory with the larger precedence is primary."""
        adv_low = self._create_advisory(
            advisory_id="ADV-P1",
            datasource_id="low_src",
            summary="Same summary here",
            aliases=["CVE-2099-9999"],
            url="https://example.com/p1",
        )
        adv_high = self._create_advisory(
            advisory_id="ADV-P2",
            datasource_id="high_src",
            summary="Same summary here",
            aliases=["CVE-2099-9999"],
            url="https://example.com/p2",
        )
        # Assign and persist the precedences before grouping.
        for advisory, precedence in ((adv_low, 1), (adv_high, 10)):
            advisory.precedence = precedence
            advisory.save()

        grouped = group_advisories_by_content([adv_low, adv_high])
        assert len(grouped) == 1
        only_group = next(iter(grouped.values()))
        assert only_group["primary"] == adv_high
        assert adv_low in only_group["secondary"]

    def test_empty_input(self):
        """An empty list yields an empty mapping."""
        assert group_advisories_by_content([]) == {}

    def test_single_advisory(self):
        """A lone advisory forms its own group with no secondary members."""
        solo = self._create_advisory(
            advisory_id="SOLO-1",
            datasource_id="solo_src",
            summary="Lonely advisory",
            url="https://example.com/solo",
        )
        grouped = group_advisories_by_content([solo])
        assert len(grouped) == 1
        only_group = next(iter(grouped.values()))
        assert only_group["primary"] == solo
        assert only_group["secondary"] == set()

    def test_none_input(self):
        """``None`` is treated like an empty input and yields an empty mapping."""
        assert group_advisories_by_content(None) == {}

0 commit comments

Comments
 (0)