Skip to content

Commit ba574d8

Browse files
authored
Merge pull request #199 from sethmlarson/spdx-id-dedupe
Deduplicate SPDX IDs with hash suffixes
2 parents 58a8227 + 76278e9 commit ba574d8

4 files changed

Lines changed: 40 additions & 12 deletions

File tree

sbom.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
import tarfile
2525
import typing
2626
import zipfile
27+
from functools import cache
2728
from pathlib import Path
28-
from typing import Any, NotRequired, TypedDict, cast
29+
from typing import Any, LiteralString, NotRequired, TypedDict, cast
2930
from urllib.request import urlopen
3031

3132

@@ -90,9 +91,20 @@ class CreationInfo(TypedDict):
9091
licenseListVersion: str
9192

9293

93-
def spdx_id(value: str) -> str:
94+
# Cache of values that we've seen already. We use this
95+
# to de-duplicate values and their corresponding SPDX ID.
96+
_SPDX_IDS_TO_VALUES: dict[str, Any] = {}
97+
98+
99+
@cache
100+
def spdx_id(value: LiteralString) -> str:
94101
"""Encode a value into characters that are valid in an SPDX ID"""
95-
return re.sub(r"[^a-zA-Z0-9.\-]+", "-", value)
102+
value_as_spdx_id = re.sub(r"[^a-zA-Z0-9.\-]+", "-", value)
103+
# To avoid collisions we append a hash suffix.
104+
suffix = hashlib.sha256(value.encode()).hexdigest()[:8]
105+
value_as_spdx_id = f"{value_as_spdx_id}-{suffix}"
106+
assert _SPDX_IDS_TO_VALUES.setdefault(value_as_spdx_id, value) == value
107+
return value_as_spdx_id
96108

97109

98110
def calculate_package_verification_codes(sbom: SBOM) -> None:

tests/sbom/sbom-with-pip-removed.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
"packages": [],
1414
"relationships": [
1515
{
16-
"relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING",
16+
"relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING-497fb0c3",
1717
"relationshipType": "CONTAINS",
18-
"spdxElementId": "SPDXRef-PACKAGE-expat"
18+
"spdxElementId": "SPDXRef-PACKAGE-expat-83b93528"
1919
}
2020
]
2121
}

tests/sbom/sbom-with-pip.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"files": [],
1313
"packages": [
1414
{
15-
"SPDXID": "SPDXRef-PACKAGE-pip",
15+
"SPDXID": "SPDXRef-PACKAGE-pip-ced959c1",
1616
"name": "pip",
1717
"versionInfo": "24.0",
1818
"licenseConcluded": "MIT",
@@ -38,19 +38,19 @@
3838
],
3939
"relationships": [
4040
{
41-
"relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING",
41+
"relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING-497fb0c3",
4242
"relationshipType": "CONTAINS",
43-
"spdxElementId": "SPDXRef-PACKAGE-expat"
43+
"spdxElementId": "SPDXRef-PACKAGE-expat-83b93528"
4444
},
4545
{
46-
"relatedSpdxElement": "SPDXRef-PACKAGE-urllib3",
46+
"relatedSpdxElement": "SPDXRef-PACKAGE-urllib3-b7a198af",
4747
"relationshipType": "DEPENDS_ON",
48-
"spdxElementId": "SPDXRef-PACKAGE-pip"
48+
"spdxElementId": "SPDXRef-PACKAGE-pip-ced959c1"
4949
},
5050
{
51-
"relatedSpdxElement": "SPDXRef-PACKAGE-pip",
51+
"relatedSpdxElement": "SPDXRef-PACKAGE-pip-ced959c1",
5252
"relationshipType": "DEPENDS_ON",
53-
"spdxElementId": "SPDXRef-PACKAGE-cpython"
53+
"spdxElementId": "SPDXRef-PACKAGE-cpython-608f998c"
5454
}
5555
]
5656
}

tests/test_sbom.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,22 @@
1111
import sbom
1212

1313

14+
@pytest.mark.parametrize(
    ("value", "expected"),
    [
        ("abc", "abc-ba7816bf"),
        ("def", "def-cb8379ac"),
        ("SPDXRef-PACKAGE-pip", "SPDXRef-PACKAGE-pip-ced959c1"),
        ("SPDXRef-PACKAGE-cpython", "SPDXRef-PACKAGE-cpython-79ab18d2"),
        ("SPDXRef-PACKAGE-urllib3", "SPDXRef-PACKAGE-urllib3-b8ab4751"),
    ],
)
def test_spdx_id(value: str, expected: str) -> None:
    """Check that ``sbom.spdx_id`` yields the expected ID, and does so repeatedly."""
    # The second pass checks we get the same value next time.
    for _ in range(2):
        assert sbom.spdx_id(value) == expected
28+
29+
1430
@pytest.mark.parametrize(
1531
["package_sha1s", "package_verification_code"],
1632
[

0 commit comments

Comments
 (0)