"""
Utility which creates Software Bill-of-Materials (SBOM)
for CPython release artifacts. Can also be run manually with:

    $ python sbom.py <artifact>

For example:

    $ python sbom.py ./Python-3.13.0a3.tar.xz

"""
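
# Note: the resulting SBOM JSON is printed to stdout (see main() below), so a typical
# invocation redirects it to a file. The output filename below is only an illustration,
# chosen to follow the OpenSSF SBOM naming recommendation referenced later in this file:
#
#   $ python sbom.py ./Python-3.13.0a3.tar.xz > Python-3.13.0a3.tar.xz.spdx.json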

import datetime
import hashlib
import json
import os
import re
import subprocess
import sys
import tarfile


def spdx_id(value: str) -> str:
    """Encode a value into characters that are valid in an SPDX ID"""
    return re.sub(r"[^a-zA-Z0-9.\-]+", "-", value)

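# Illustration with a hypothetical path: spdx_id("SPDXRef-FILE-Modules/_decimal/libmpdec/io.h")
# returns "SPDXRef-FILE-Modules-decimal-libmpdec-io.h", because each run of characters
# outside [a-zA-Z0-9.-] is collapsed into a single "-".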

def calculate_package_verification_codes(sbom) -> None:
    """
    Calculate SPDX 'packageVerificationCode' values for
    each package with 'filesAnalyzed' set to 'true'.
    Mutates the values within the passed structure.

    The code is SHA1 of a concatenated and sorted list of file SHA1s.
    """
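
    # A rough sketch of the calculation done below, using hypothetical file SHA1s:
    #
    #   file_sha1s = [b"ee53bb...", b"0a91fc..."]
    #   code = hashlib.sha1(b"".join(sorted(file_sha1s))).hexdigest()
    #
    # which follows the SPDX 2.3 definition of 'packageVerificationCodeValue'.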

    # Find all packages which we need to calculate package verification codes for.
    sbom_file_id_to_package_id = {}
    sbom_package_id_to_file_sha1s: dict[str, list[bytes]] = {}
    for sbom_package in sbom["packages"]:
        # If this value is 'false' we skip calculating.
        if sbom_package["filesAnalyzed"]:
            sbom_package_id = sbom_package["SPDXID"]
            sbom_package_id_to_file_sha1s[sbom_package_id] = []

    # The next pass is over relationships:
    # we need to find all files that belong to each package.
    for sbom_relationship in sbom["relationships"]:
        sbom_relationship_type = sbom_relationship["relationshipType"]
        sbom_element_id = sbom_relationship["spdxElementId"]
        sbom_related_element_id = sbom_relationship["relatedSpdxElement"]

        # We're looking for '<package> CONTAINS <file>' relationships.
        if (
            sbom_relationship_type != "CONTAINS"
            or sbom_element_id not in sbom_package_id_to_file_sha1s
            or not sbom_related_element_id.startswith("SPDXRef-FILE-")
        ):
            continue

        # Found one! Add it to our mapping.
        sbom_file_id_to_package_id[sbom_related_element_id] = sbom_element_id

    # Now we do a single pass over files, appending all SHA1 values along the way.
    for sbom_file in sbom["files"]:
        # Attempt to match this file to a package.
        sbom_file_id = sbom_file["SPDXID"]
        if sbom_file_id not in sbom_file_id_to_package_id:
            continue
        sbom_package_id = sbom_file_id_to_package_id[sbom_file_id]

        # Find the SHA1 checksum for the file.
        for sbom_file_checksum in sbom_file["checksums"]:
            if sbom_file_checksum["algorithm"] == "SHA1":
                # We lowercase the value as that's what's required by the algorithm.
                sbom_file_checksum_sha1 = (
                    sbom_file_checksum["checksumValue"].lower().encode("ascii")
                )
                break
        else:
            raise ValueError(f"Can't find SHA1 checksum for '{sbom_file_id}'")

        sbom_package_id_to_file_sha1s[sbom_package_id].append(sbom_file_checksum_sha1)

    # Finally we iterate over the packages again and calculate
    # the final package verification code values.
    for sbom_package in sbom["packages"]:
        sbom_package_id = sbom_package["SPDXID"]
        if sbom_package_id not in sbom_package_id_to_file_sha1s:
            continue

        # The package verification code is the SHA1 of the ASCII file SHA1 values
        # sorted in ascending order.
        sbom_package_verification_code = hashlib.sha1(
            b"".join(sorted(sbom_package_id_to_file_sha1s[sbom_package_id]))
        ).hexdigest()

        sbom_package["packageVerificationCode"] = {
            "packageVerificationCodeValue": sbom_package_verification_code
        }


def get_release_tools_commit_sha() -> str:
    """Gets the git commit SHA of the release-tools repository"""
    git_prefix = os.path.abspath(os.path.dirname(__file__))
    # Strip the trailing newline so the SHA can be embedded cleanly in strings.
    stdout = subprocess.check_output(
        ["git", "rev-parse", "--prefix", git_prefix, "HEAD"],
        cwd=git_prefix
    ).decode("ascii").strip()
    assert re.match(r"^[a-f0-9]{40,}$", stdout)
    return stdout

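# The returned SHA is embedded below in the SBOM's creationInfo as
# f"Tool: ReleaseTools-{get_release_tools_commit_sha()}", recording which
# release-tools revision generated the SBOM.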

def create_sbom_for_source_tarball(tarball_path: str):
    """Stitches together an SBOM for a source tarball"""
    tarball_name = os.path.basename(tarball_path)

    # Open the tarball with known compression settings.
    if tarball_name.endswith(".tgz"):
        tarball = tarfile.open(tarball_path, mode="r:gz")
    elif tarball_name.endswith(".tar.xz"):
        tarball = tarfile.open(tarball_path, mode="r:xz")
    else:
        raise ValueError(f"Unknown tarball format: '{tarball_name}'")

    # Parse the CPython version from the tarball name.
    # Calculate the download location from the CPython version and tarball name.
    cpython_version = re.match(r"^Python-([0-9abrc.]+)\.t", tarball_name).group(1)
    cpython_version_without_suffix = re.match(r"^([0-9.]+)", cpython_version).group(1)
    tarball_download_location = f"https://www.python.org/ftp/python/{cpython_version_without_suffix}/{tarball_name}"
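
    # For example, "Python-3.13.0a3.tar.xz" gives a cpython_version of "3.13.0a3",
    # a cpython_version_without_suffix of "3.13.0", and a download location of
    # "https://www.python.org/ftp/python/3.13.0/Python-3.13.0a3.tar.xz".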

    # Take a hash of the tarball.
    with open(tarball_path, mode="rb") as f:
        tarball_checksum_sha256 = hashlib.sha256(f.read()).hexdigest()

    # There should be an SBOM included in the tarball.
    # If there's not, we can't create an SBOM.
    try:
        sbom_tarball_member = tarball.getmember(f"Python-{cpython_version}/Misc/sbom.spdx.json")
    except KeyError:
        raise ValueError(
            "Tarball doesn't contain an SBOM at 'Misc/sbom.spdx.json'"
        ) from None
    sbom_bytes = tarball.extractfile(sbom_tarball_member).read()

    sbom = json.loads(sbom_bytes)
    sbom.update({
        "SPDXID": "SPDXRef-DOCUMENT",
        "spdxVersion": "SPDX-2.3",
        "name": "CPython SBOM",
        "dataLicense": "CC0-1.0",
        # Naming done according to OpenSSF SBOM WG recommendations.
        # See: https://github.com/ossf/sbom-everywhere/blob/main/reference/sbom_naming.md
        "documentNamespace": f"{tarball_download_location}.spdx.json",
        "creationInfo": {
            "created": (
                datetime.datetime.now(tz=datetime.timezone.utc)
                .strftime("%Y-%m-%dT%H:%M:%SZ")
            ),
            "creators": [
                "Person: Python Release Managers",
                f"Tool: ReleaseTools-{get_release_tools_commit_sha()}",
            ],
            # Version of the SPDX License ID list.
            # This shouldn't need to be updated often, if ever.
            "licenseListVersion": "3.22",
        },
    })

    # Create the SBOM entry for the CPython package. We use
    # the SPDXID later on for creating relationships to files.
    sbom_cpython_package = {
        "SPDXID": "SPDXRef-PACKAGE-cpython",
        "name": "CPython",
        "versionInfo": cpython_version,
        "licenseConcluded": "PSF-2.0",
        "originator": "Organization: Python Software Foundation",
        "supplier": "Organization: Python Software Foundation",
        "packageFileName": tarball_name,
        "externalRefs": [
            {
                "referenceCategory": "SECURITY",
                "referenceLocator": f"cpe:2.3:a:python:python:{cpython_version}:*:*:*:*:*:*:*",
                "referenceType": "cpe23Type",
            }
        ],
        "primaryPackagePurpose": "SOURCE",
        "downloadLocation": tarball_download_location,
        "checksums": [{"algorithm": "SHA256", "checksumValue": tarball_checksum_sha256}],
    }
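
    # For the example tarball above, the security reference resolves to the CPE
    # "cpe:2.3:a:python:python:3.13.0a3:*:*:*:*:*:*:*", which vulnerability scanners
    # can use to match this release against CVE records.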

    # The top-level CPython package depends on every vendored sub-package.
    for sbom_package in sbom["packages"]:
        sbom["relationships"].append({
            "spdxElementId": sbom_cpython_package["SPDXID"],
            "relatedSpdxElement": sbom_package["SPDXID"],
            "relationshipType": "DEPENDS_ON",
        })

    sbom["packages"].append(sbom_cpython_package)

    # Extract all currently known files from the SBOM with their checksums.
    known_sbom_files = {}
    for sbom_file in sbom["files"]:
        sbom_filename = sbom_file["fileName"]

        # Look for the expected SHA256 checksum.
        for sbom_file_checksum in sbom_file["checksums"]:
            if sbom_file_checksum["algorithm"] == "SHA256":
                known_sbom_files[sbom_filename] = (
                    sbom_file_checksum["checksumValue"]
                )
                break
        else:
            raise ValueError(
                f"Couldn't find expected SHA256 checksum in SBOM for file '{sbom_filename}'"
            )

    # Now we walk the tarball and compare known files to our expected checksums in the SBOM.
    # All files that aren't already in the SBOM can be added as "CPython" files.
    for member in tarball.getmembers():
        if member.isdir():  # Skip directories!
            continue

        # Get the member from the tarball. CPython prefixes all of its
        # source code with 'Python-{version}/...'.
        assert member.isfile() and member.name.startswith(f"Python-{cpython_version}/")

        # Calculate the hashes, either for comparison with a known value
        # or to embed in the SBOM as a new file. SHA1 is only used because
        # SPDX requires it for all file entries.
        file_bytes = tarball.extractfile(member).read()
        actual_file_checksum_sha1 = hashlib.sha1(file_bytes).hexdigest()
        actual_file_checksum_sha256 = hashlib.sha256(file_bytes).hexdigest()

        # Remove the 'Python-{version}/...' prefix for the SPDXID and fileName.
        member_name_no_prefix = member.name.split('/', 1)[1]

        # We've already seen this file, so we check it hasn't been modified and continue on.
        if member_name_no_prefix in known_sbom_files:
            # If there's a hash mismatch we raise an error: something isn't right!
            expected_file_checksum_sha256 = known_sbom_files.pop(member_name_no_prefix)
            if expected_file_checksum_sha256 != actual_file_checksum_sha256:
                raise ValueError(f"Mismatched checksum for file '{member_name_no_prefix}'")

        # If this is a new file, then it's a part of the 'CPython' SBOM package.
        else:
            sbom_file_spdx_id = spdx_id(f"SPDXRef-FILE-{member_name_no_prefix}")
            sbom["files"].append(
                {
                    "SPDXID": sbom_file_spdx_id,
                    "fileName": member_name_no_prefix,
                    "checksums": [
                        {
                            "algorithm": "SHA1",
                            "checksumValue": actual_file_checksum_sha1,
                        },
                        {
                            "algorithm": "SHA256",
                            "checksumValue": actual_file_checksum_sha256,
                        },
                    ],
                }
            )
            sbom["relationships"].append(
                {
                    "spdxElementId": sbom_cpython_package["SPDXID"],
                    "relatedSpdxElement": sbom_file_spdx_id,
                    "relationshipType": "CONTAINS",
                }
            )

    # If there are any known files that weren't found in the
    # source tarball we want to raise an error.
    if known_sbom_files:
        raise ValueError(
            f"Some files from source SBOM aren't accounted for "
            f"in source tarball: {sorted(known_sbom_files)!r}"
        )

    # Final relationship: this SBOM describes the CPython package.
    sbom["relationships"].append(
        {
            "spdxElementId": "SPDXRef-DOCUMENT",
            "relatedSpdxElement": sbom_cpython_package["SPDXID"],
            "relationshipType": "DESCRIBES",
        }
    )

    # Apply the 'supplier' tag to every package since we're shipping
    # the package in the tarball itself. The 'originator' field is used for maintainers.
    for sbom_package in sbom["packages"]:
        sbom_package["supplier"] = "Organization: Python Software Foundation"
        sbom_package["filesAnalyzed"] = True

    # Calculate the 'packageVerificationCode' values for files in packages.
    calculate_package_verification_codes(sbom)

    return sbom


def main() -> None:
    tarball_path = sys.argv[1]
    sbom_data = create_sbom_for_source_tarball(tarball_path)
    print(json.dumps(sbom_data, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()