Merge pull request #82 from sethmlarson/sbom-utility · python/release-tools@a048b9d · GitHub
[go: up one dir, main page]

Skip to content

Commit a048b9d

Browse files
authored
Merge pull request #82 from sethmlarson/sbom-utility
Create utility for generating SBOM from artifacts
2 parents 38ce4f4 + a1d4092 commit a048b9d

File tree

2 files changed

+329
-0
lines changed

2 files changed

+329
-0
lines changed

run_release.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import functools
1313
import getpass
1414
import itertools
15+
import json
1516
import os
1617
import pathlib
1718
import re
@@ -34,6 +35,7 @@
3435

3536
import release as release_mod
3637
from buildbotapi import BuildBotAPI
38+
import sbom
3739

3840
API_KEY_REGEXP = re.compile(r"(?P<major>\w+):(?P<minor>\w+)")
3941

@@ -507,6 +509,26 @@ def test_release_artifacts(db: DbfilenameShelf) -> None:
507509
raise ReleaseException("Test failed!")
508510

509511

512+
def build_sbom_artifacts(db):
    """Write an SPDX SBOM file next to each source tarball of the release.

    ``db`` is the release database mapping; the entries 'git_repo' (a
    path-like object supporting ``/`` and ``.exists()``, presumably a
    ``pathlib.Path`` -- confirm against the caller) and 'release' are read.

    Nothing is built when the checkout has no 'Misc/sbom.spdx.json'.
    Otherwise, for both the '.tgz' and '.tar.xz' tarballs under
    '<git_repo>/<release>/src', the SBOM is written to
    '<tarball path>.spdx.json'.
    """
    repo_root = db["git_repo"]

    # Without a source SBOM in the checkout there is nothing to build from.
    if not (repo_root / "Misc/sbom.spdx.json").exists():
        print("Skipping building an SBOM, missing 'Misc/sbom.spdx.json'")
        return

    release = db["release"]
    source_dir = repo_root / str(release) / "src"

    # One SBOM per source tarball flavour.
    for suffix in (".tgz", ".tar.xz"):
        artifact_name = f"Python-{release}{suffix}"
        artifact_path = str(source_dir / artifact_name)

        print(f"Building an SBOM for artifact '{artifact_name}'")
        document = sbom.create_sbom_for_source_tarball(artifact_path)

        with open(artifact_path + ".spdx.json", mode="w") as sbom_file:
            sbom_file.write(json.dumps(document, indent=2, sort_keys=True))
530+
531+
510532
class MySFTPClient(paramiko.SFTPClient):
511533
def put_dir(self, source, target, progress=None):
512534
for item in os.listdir(source):
@@ -1041,6 +1063,7 @@ def _api_key(api_key):
10411063
Task(create_tag, "Create tag"),
10421064
Task(build_release_artifacts, "Building release artifacts"),
10431065
Task(test_release_artifacts, "Test release artifacts"),
1066+
Task(build_sbom_artifacts, "Building SBOM artifacts"),
10441067
Task(upload_files_to_server, "Upload files to the PSF server"),
10451068
Task(place_files_in_download_folder, "Place files in the download folder"),
10461069
Task(upload_docs_to_the_docs_server, "Upload docs to the PSF docs server"),

sbom.py

Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
"""
2+
Utility which creates Software Bill-of-Materials (SBOM)
3+
for CPython release artifacts. Can also be run manually with:
4+
5+
$ python sbom.py <artifact>
6+
7+
For example:
8+
9+
$ python sbom.py ./Python-3.13.0a3.tar.xz
10+
11+
"""
12+
13+
import datetime
14+
import hashlib
15+
import json
16+
import os
17+
import re
18+
import subprocess
19+
import sys
20+
import tarfile
21+
22+
23+
def spdx_id(value: str) -> str:
    """Return *value* made safe for use as an SPDX ID.

    Every run of characters other than ASCII letters, digits, '.' and '-'
    is collapsed into a single '-'.
    """
    disallowed_run = re.compile(r"[^a-zA-Z0-9.\-]+")
    return disallowed_run.sub("-", value)
26+
27+
28+
def calculate_package_verification_codes(sbom) -> None:
    """
    Fill in the SPDX 'packageVerificationCode' of every package in ``sbom``
    whose 'filesAnalyzed' value is true. The structure is modified in place.

    A package's code is the SHA1 of its files' lowercase SHA1 checksums,
    sorted ascending and concatenated. Files are attributed to a package
    through '<package> CONTAINS <file>' relationships.

    Raises ValueError when a file that belongs to an analyzed package
    has no SHA1 checksum.
    """

    # Every analyzed package starts out with an empty list of file SHA1s.
    # Packages with 'filesAnalyzed' set to 'false' are left out entirely.
    sha1s_by_package: dict[str, list[bytes]] = {
        package["SPDXID"]: []
        for package in sbom["packages"]
        if package["filesAnalyzed"]
    }

    # Work out which package owns each file by scanning the relationships
    # for '<analyzed package> CONTAINS <file>' entries.
    owner_by_file = {}
    for relationship in sbom["relationships"]:
        kind = relationship["relationshipType"]
        owner_id = relationship["spdxElementId"]
        related_id = relationship["relatedSpdxElement"]

        if (
            kind == "CONTAINS"
            and owner_id in sha1s_by_package
            and related_id.startswith("SPDXRef-FILE-")
        ):
            owner_by_file[related_id] = owner_id

    # Single pass over the files, collecting the SHA1 of each owned file.
    for file_entry in sbom["files"]:
        file_id = file_entry["SPDXID"]
        if file_id not in owner_by_file:
            continue

        sha1_entry = next(
            (
                checksum
                for checksum in file_entry["checksums"]
                if checksum["algorithm"] == "SHA1"
            ),
            None,
        )
        if sha1_entry is None:
            raise ValueError(f"Can't find SHA1 checksum for '{file_id}'")

        # The algorithm requires the lowercase form of the checksum.
        file_sha1 = sha1_entry["checksumValue"].lower().encode("ascii")
        sha1s_by_package[owner_by_file[file_id]].append(file_sha1)

    # Second pass over the packages: hash the ascending-sorted SHA1 values.
    for package in sbom["packages"]:
        package_id = package["SPDXID"]
        if package_id in sha1s_by_package:
            ordered_sha1s = sorted(sha1s_by_package[package_id])
            verification_code = hashlib.sha1(b"".join(ordered_sha1s)).hexdigest()
            package["packageVerificationCode"] = {
                "packageVerificationCodeValue": verification_code
            }
99+
100+
101+
def get_release_tools_commit_sha() -> str:
    """Gets the git commit SHA of the release-tools repository.

    Runs ``git rev-parse ... HEAD`` in the directory containing this file
    and returns the resulting commit SHA with surrounding whitespace removed.

    Raises subprocess.CalledProcessError if the git command fails.
    """
    git_prefix = os.path.abspath(os.path.dirname(__file__))
    stdout = (
        subprocess.check_output(
            ["git", "rev-parse", "--prefix", git_prefix, "HEAD"],
            cwd=git_prefix,
        )
        .decode("ascii")
        # git ends its output with a newline. It has to be removed here,
        # otherwise it ends up embedded in the value built from this SHA.
        .strip()
    )
    # fullmatch() instead of match() with '$': '$' also matches just before
    # a trailing newline, which is how the newline went unnoticed.
    assert re.fullmatch(r"[a-f0-9]{40,}", stdout)
    return stdout
110+
111+
112+
def create_sbom_for_source_tarball(tarball_path: str):
    """Stitches together an SBOM for a source tarball.

    The SBOM shipped inside the tarball at 'Misc/sbom.spdx.json' is the
    starting point. Document metadata and a top-level 'CPython' package are
    added, every file already listed in that SBOM is checked against its
    SHA256 checksum, and every other file in the tarball is recorded as a
    file contained in the CPython package.

    Returns the resulting SBOM structure (as loaded from JSON and extended).

    Raises ValueError if the tarball has an unknown extension, its name
    doesn't contain a CPython version, it doesn't contain an SBOM, a file
    listed in the SBOM lacks a SHA256 checksum or has a mismatching one,
    or a file listed in the SBOM is missing from the tarball.
    """
    tarball_name = os.path.basename(tarball_path)

    # Pick the known compression settings from the file extension.
    if tarball_name.endswith(".tgz"):
        tarball_mode = "r:gz"
    elif tarball_name.endswith(".tar.xz"):
        tarball_mode = "r:xz"
    else:
        raise ValueError(f"Unknown tarball format: '{tarball_name}'")

    # The context manager closes the tarball on every exit path,
    # including all of the errors raised below.
    with tarfile.open(tarball_path, mode=tarball_mode) as tarball:
        # Parse the CPython version from the tarball.
        # Calculate the download locations from the CPython version and tarball name.
        version_match = re.match(r"^Python-([0-9abrc.]+)\.t", tarball_name)
        if version_match is None:
            raise ValueError(
                f"Can't parse CPython version from tarball name: '{tarball_name}'"
            )
        cpython_version = version_match.group(1)
        version_without_suffix_match = re.match(r"^([0-9.]+)", cpython_version)
        if version_without_suffix_match is None:
            raise ValueError(
                f"Can't parse CPython version from tarball name: '{tarball_name}'"
            )
        cpython_version_without_suffix = version_without_suffix_match.group(1)
        tarball_download_location = (
            "https://www.python.org/ftp/python/"
            f"{cpython_version_without_suffix}/{tarball_name}"
        )

        # Take a hash of the tarball
        with open(tarball_path, mode="rb") as f:
            tarball_checksum_sha256 = hashlib.sha256(f.read()).hexdigest()

        # There should be an SBOM included in the tarball.
        # If there's not we can't create an SBOM.
        try:
            sbom_tarball_member = tarball.getmember(
                f"Python-{cpython_version}/Misc/sbom.spdx.json"
            )
        except KeyError:
            raise ValueError(
                "Tarball doesn't contain an SBOM at 'Misc/sbom.spdx.json'"
            ) from None
        sbom_bytes = tarball.extractfile(sbom_tarball_member).read()

        sbom = json.loads(sbom_bytes)
        sbom.update({
            "SPDXID": "SPDXRef-DOCUMENT",
            "spdxVersion": "SPDX-2.3",
            "name": "CPython SBOM",
            "dataLicense": "CC0-1.0",
            # Naming done according to OpenSSF SBOM WG recommendations.
            # See: https://github.com/ossf/sbom-everywhere/blob/main/reference/sbom_naming.md
            "documentNamespace": f"{tarball_download_location}.spdx.json",
            "creationInfo": {
                "created": (
                    datetime.datetime.now(tz=datetime.timezone.utc)
                    .strftime("%Y-%m-%dT%H:%M:%SZ")
                ),
                "creators": [
                    "Person: Python Release Managers",
                    f"Tool: ReleaseTools-{get_release_tools_commit_sha()}",
                ],
                # Version of the SPDX License ID list.
                # This shouldn't need to be updated often, if ever.
                "licenseListVersion": "3.22",
            },
        })

        # Create the SBOM entry for the CPython package. We use
        # the SPDXID later on for creating relationships to files.
        sbom_cpython_package = {
            "SPDXID": "SPDXRef-PACKAGE-cpython",
            "name": "CPython",
            "versionInfo": cpython_version,
            "licenseConcluded": "PSF-2.0",
            "originator": "Organization: Python Software Foundation",
            "supplier": "Organization: Python Software Foundation",
            "packageFileName": tarball_name,
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:python:python:{cpython_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                }
            ],
            "primaryPackagePurpose": "SOURCE",
            "downloadLocation": tarball_download_location,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": tarball_checksum_sha256}
            ],
        }

        # The top-level CPython package depends on every vendored sub-package.
        for sbom_package in sbom["packages"]:
            sbom["relationships"].append({
                "spdxElementId": sbom_cpython_package["SPDXID"],
                "relatedSpdxElement": sbom_package["SPDXID"],
                "relationshipType": "DEPENDS_ON",
            })

        sbom["packages"].append(sbom_cpython_package)

        # Extract all currently known files from the SBOM with their checksums.
        known_sbom_files = {}
        for sbom_file in sbom["files"]:
            sbom_filename = sbom_file["fileName"]

            # Look for the expected SHA256 checksum.
            for sbom_file_checksum in sbom_file["checksums"]:
                if sbom_file_checksum["algorithm"] == "SHA256":
                    known_sbom_files[sbom_filename] = (
                        sbom_file_checksum["checksumValue"]
                    )
                    break
            else:
                raise ValueError(
                    f"Couldn't find expected SHA256 checksum in SBOM for file '{sbom_filename}'"
                )

        # Now we walk the tarball and compare known files to our expected checksums in the SBOM.
        # All files that aren't already in the SBOM can be added as "CPython" files.
        for member in tarball.getmembers():
            if member.isdir():  # Skip directories!
                continue

            # Get the member from the tarball. CPython prefixes all of its
            # source code with 'Python-{version}/...'.
            assert member.isfile() and member.name.startswith(f"Python-{cpython_version}/")

            # Calculate the hashes, either for comparison with a known value
            # or to embed in the SBOM as a new file. SHA1 is only used because
            # SPDX requires it for all file entries.
            file_bytes = tarball.extractfile(member).read()
            actual_file_checksum_sha1 = hashlib.sha1(file_bytes).hexdigest()
            actual_file_checksum_sha256 = hashlib.sha256(file_bytes).hexdigest()

            # Remove the 'Python-{version}/...' prefix for the SPDXID and fileName.
            member_name_no_prefix = member.name.split('/', 1)[1]

            # We've already seen this file, so we check it hasn't been modified and continue on.
            if member_name_no_prefix in known_sbom_files:
                # If there's a hash mismatch we raise an error, something isn't right!
                expected_file_checksum_sha256 = known_sbom_files.pop(member_name_no_prefix)
                if expected_file_checksum_sha256 != actual_file_checksum_sha256:
                    raise ValueError(f"Mismatched checksum for file '{member_name_no_prefix}'")

            # If this is a new file, then it's a part of the 'CPython' SBOM package.
            else:
                sbom_file_spdx_id = spdx_id(f"SPDXRef-FILE-{member_name_no_prefix}")
                sbom["files"].append(
                    {
                        "SPDXID": sbom_file_spdx_id,
                        "fileName": member_name_no_prefix,
                        "checksums": [
                            {
                                "algorithm": "SHA1",
                                "checksumValue": actual_file_checksum_sha1,
                            },
                            {
                                "algorithm": "SHA256",
                                "checksumValue": actual_file_checksum_sha256,
                            },
                        ],
                    }
                )
                sbom["relationships"].append(
                    {
                        "spdxElementId": sbom_cpython_package["SPDXID"],
                        "relatedSpdxElement": sbom_file_spdx_id,
                        "relationshipType": "CONTAINS",
                    }
                )

    # If there are any known files that weren't found in the
    # source tarball we want to raise an error.
    if known_sbom_files:
        raise ValueError(
            f"Some files from source SBOM aren't accounted for "
            f"in source tarball: {sorted(known_sbom_files)!r}"
        )

    # Final relationship, this SBOM describes the CPython package.
    sbom["relationships"].append(
        {
            "spdxElementId": "SPDXRef-DOCUMENT",
            "relatedSpdxElement": sbom_cpython_package["SPDXID"],
            "relationshipType": "DESCRIBES",
        }
    )

    # Apply the 'supplier' tag to every package since we're shipping
    # the package in the tarball itself. Originator field is used for maintainers.
    for sbom_package in sbom["packages"]:
        sbom_package["supplier"] = "Organization: Python Software Foundation"
        sbom_package["filesAnalyzed"] = True

    # Calculate the 'packageVerificationCode' values for files in packages.
    calculate_package_verification_codes(sbom)

    return sbom
298+
299+
300+
def main() -> None:
    """Command-line entry point.

    Builds the SBOM for the tarball named by the first command-line
    argument and prints it to stdout as indented JSON with sorted keys.
    Exits with a usage message when no argument is given.
    """
    # Without this check a missing argument surfaces as a bare IndexError.
    if len(sys.argv) < 2:
        raise SystemExit("usage: python sbom.py <artifact>")

    tarball_path = sys.argv[1]
    sbom_data = create_sbom_for_source_tarball(tarball_path)
    print(json.dumps(sbom_data, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)
0