Bulk MAD generator: Support databases from DCA runs #19627
Changes from 1 commit
…cifying the formatting requirements.
@@ -1,5 +1,7 @@
 """
 Experimental script for bulk generation of MaD models based on a list of projects.
+
+Note: This file must be formatted using the Black Python formatter.
 """

 import os.path
@@ -24,6 +26,7 @@
 )
 build_dir = os.path.join(gitroot, "mad-generation-build")

+
 # A project to generate models for
 class Project(TypedDict):
     """
@@ -132,7 +135,9 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
     return project_dirs


-def build_database(language: str, extractor_options, project: Project, project_dir: str) -> str | None:
+def build_database(
+    language: str, extractor_options, project: Project, project_dir: str
+) -> str | None:
     """
     Build a CodeQL database for a project.
@@ -179,6 +184,7 @@ def build_database(language: str, extractor_options, project: Project, project_d

     return database_dir

+
 def generate_models(args, name: str, database_dir: str) -> None:
     """
     Generate models for a project.
@@ -196,7 +202,10 @@ def generate_models(args, name: str, database_dir: str) -> None:
     generator.setenvironment(database=database_dir, folder=name)
     generator.run()

-def build_databases_from_projects(language: str, extractor_options, projects: List[Project]) -> List[tuple[str, str | None]]:
+
+def build_databases_from_projects(
+    language: str, extractor_options, projects: List[Project]
+) -> List[tuple[str, str | None]]:
     """
     Build databases for all projects in parallel.
@@ -215,11 +224,15 @@ def build_databases_from_projects(language: str, extractor_options, projects: Li
     # Phase 2: Build databases for all projects
     print("\n=== Phase 2: Building databases ===")
     database_results = [
-        (project["name"], build_database(language, extractor_options, project, project_dir))
+        (
+            project["name"],
+            build_database(language, extractor_options, project, project_dir),
+        )
         for project, project_dir in project_dirs
     ]
     return database_results

+
 def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
     """
     Download a JSON file from GitHub using a personal access token (PAT).
@@ -230,14 +243,15 @@ def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
     Returns:
         The JSON response as a dictionary.
     """
-    headers = { "Authorization": f"token {pat}" } | extra_headers
+    headers = {"Authorization": f"token {pat}"} | extra_headers
     response = requests.get(url, headers=headers)
     if response.status_code != 200:
         print(f"Failed to download JSON: {response.status_code} {response.text}")
         sys.exit(1)
     else:
         return response.json()

+
 def download_artifact(url: str, artifact_name: str, pat: str) -> str:
     """
     Download a GitHub Actions artifact from a given URL.
@@ -248,7 +262,7 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
     Returns:
         The path to the downloaded artifact file.
     """
-    headers = { "Authorization": f"token {pat}", "Accept": "application/vnd.github+json" }
+    headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"}
     response = requests.get(url, stream=True, headers=headers)
     zipName = artifact_name + ".zip"
     if response.status_code == 200:
@@ -262,15 +276,20 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
         print(f"Failed to download file. Status code: {response.status_code}")
         sys.exit(1)

+
 def remove_extension(filename: str) -> str:
     while "." in filename:
         filename, _ = os.path.splitext(filename)
     return filename

+
 def pretty_name_from_artifact_name(artifact_name: str) -> str:
     return artifact_name.split("___")[1]

-def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tuple[str, str | None]]:
+
+def download_dca_databases(
+    experiment_name: str, pat: str, projects
+) -> List[tuple[str, str | None]]:
     """
     Download databases from a DCA experiment.
     Args:
@@ -282,58 +301,81 @@ def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tup
     """
     database_results = []
     print("\n=== Finding projects ===")
-    response = github(f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", pat)
+    response = github(
+        f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
+        pat,
+    )
     targets = response["targets"]
     for target, data in targets.items():
-        downloads = data["downloads"]
-        analyzed_database = downloads["analyzed_database"]
-        artifact_name = analyzed_database["artifact_name"]
-        pretty_name = pretty_name_from_artifact_name(artifact_name)
-
-        if not pretty_name in [project["name"] for project in projects]:
-            print(f"Skipping {pretty_name} as it is not in the list of projects")
-            continue
-
-        repository = analyzed_database["repository"]
-        run_id = analyzed_database["run_id"]
-        print(f"=== Finding artifact: {artifact_name} ===")
-        response = github(f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", pat, { "Accept": "application/vnd.github+json" })
-        artifacts = response["artifacts"]
-        artifact_map = {artifact["name"]: artifact for artifact in artifacts}
-        print(f"=== Downloading artifact: {artifact_name} ===")
-        archive_download_url = artifact_map[artifact_name]["archive_download_url"]
-        artifact_zip_location = download_artifact(archive_download_url, artifact_name, pat)
-        print(f"=== Extracting artifact: {artifact_name} ===")
-        # The database is in a zip file, which contains a tar.gz file with the DB
-        # First we open the zip file
-        with zipfile.ZipFile(artifact_zip_location, 'r') as zip_ref:
-            artifact_unzipped_location = os.path.join(build_dir, artifact_name)
-            # And then we extract it to build_dir/artifact_name
-            zip_ref.extractall(artifact_unzipped_location)
-            # And then we iterate over the contents of the extracted directory
-            # and extract the tar.gz files inside it
-            for entry in os.listdir(artifact_unzipped_location):
-                artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
-                with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
-                    # And we just untar it to the same directory as the zip file
-                    tar_ref.extractall(artifact_unzipped_location)
-        database_results.append((pretty_name, os.path.join(artifact_unzipped_location, remove_extension(entry))))
+        downloads = data["downloads"]
+        analyzed_database = downloads["analyzed_database"]
+        artifact_name = analyzed_database["artifact_name"]
+        pretty_name = pretty_name_from_artifact_name(artifact_name)
+
+        if not pretty_name in [project["name"] for project in projects]:
+            print(f"Skipping {pretty_name} as it is not in the list of projects")
+            continue
+
+        repository = analyzed_database["repository"]
+        run_id = analyzed_database["run_id"]
+        print(f"=== Finding artifact: {artifact_name} ===")
+        response = github(
+            f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts",
+            pat,
+            {"Accept": "application/vnd.github+json"},
+        )
+        artifacts = response["artifacts"]
+        artifact_map = {artifact["name"]: artifact for artifact in artifacts}
+        print(f"=== Downloading artifact: {artifact_name} ===")
+        archive_download_url = artifact_map[artifact_name]["archive_download_url"]
+        artifact_zip_location = download_artifact(
+            archive_download_url, artifact_name, pat
+        )
+        print(f"=== Extracting artifact: {artifact_name} ===")
+        # The database is in a zip file, which contains a tar.gz file with the DB
+        # First we open the zip file
+        with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
+            artifact_unzipped_location = os.path.join(build_dir, artifact_name)
+            # And then we extract it to build_dir/artifact_name
+            zip_ref.extractall(artifact_unzipped_location)
+            # And then we iterate over the contents of the extracted directory
+            # and extract the tar.gz files inside it
+            for entry in os.listdir(artifact_unzipped_location):
+                artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
+                with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
+                    # And we just untar it to the same directory as the zip file
+                    tar_ref.extractall(artifact_unzipped_location)
+        database_results.append(
+            (
+                pretty_name,
+                os.path.join(
+                    artifact_unzipped_location, remove_extension(entry)
+                ),
+            )
+        )
     print(f"\n=== Extracted {len(database_results)} databases ===")

     def compare(a, b):
-        a_index = next(i for i, project in enumerate(projects) if project["name"] == a[0])
-        b_index = next(i for i, project in enumerate(projects) if project["name"] == b[0])
+        a_index = next(
+            i for i, project in enumerate(projects) if project["name"] == a[0]
+        )
+        b_index = next(
+            i for i, project in enumerate(projects) if project["name"] == b[0]
+        )
         return a_index - b_index

     # Sort the database results based on the order in the projects file
     return sorted(database_results, key=cmp_to_key(compare))
Reviewer: What about making project_dirs = [project_dirs_map[project["name"]] for project in projects]? That would avoid the gnarly sort.
Author: Wonderful idea! Fixed in bdf411a
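As a rough sketch of the reviewer's idea (the project_dirs_map name and the surrounding shapes are assumptions, not the exact code in bdf411a): keep the cloned directories in a dict keyed by project name and materialize the list in the order of the projects file, so the results come out pre-sorted and a comparison-based sort is no longer needed.

# Hypothetical sketch of the suggested refactor; names are illustrative only.
from typing import Dict, List


def order_by_projects(
    projects: List[dict], project_dirs_map: Dict[str, str]
) -> List[tuple[str, str]]:
    # Walk the projects file in its original order, so the result is already
    # sorted and no cmp_to_key-style comparison is needed afterwards.
    return [
        (project["name"], project_dirs_map[project["name"]])
        for project in projects
        if project["name"] in project_dirs_map
    ]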
def get_destination_for_project(config, name: str) -> str:
    return os.path.join(config["destination"], name)

Reviewer: I think "destination" by itself is a bit ambiguous.
Author: Fixed in 7c89d6d
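Purely to illustrate the kind of disambiguation being asked for (the name below is hypothetical and not necessarily what 7c89d6d uses), the helper could spell out what the destination is used for:

# Hypothetical, more descriptive name; assumes config["destination"] is the
# root folder under which per-project output is placed.
import os


def get_project_output_path(config: dict, name: str) -> str:
    return os.path.join(config["destination"], name)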
def get_strategy(config) -> str:
    return config["strategy"].lower()

def main(config, args) -> None:
    """
    Main function to handle the bulk generation of MaD models.
@@ -371,7 +413,9 @@ def main(config, args) -> None:
     match get_strategy(config):
         case "repo":
             extractor_options = config.get("extractor_options", [])
-            database_results = build_databases_from_projects(language, extractor_options, projects)
+            database_results = build_databases_from_projects(
+                language, extractor_options, projects
+            )
         case "dca":
             experiment_name = args.dca
             if experiment_name is None:
@@ -386,9 +430,7 @@ def main(config, args) -> None:
     # Phase 3: Generate models for all projects
     print("\n=== Phase 3: Generating models ===")

Reviewer: Calling this "Phase 3" doesn't really make sense anymore, as it depends on the strategy. Same for the two other "Phase"s being printed above.
Author: Fixed in b640474
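One hedged way to address this (the actual wording chosen in b640474 may differ): drop the hard-coded phase numbers and print plain step banners, since the DCA strategy skips the cloning and building steps entirely.

# Hypothetical sketch; not necessarily the change made in b640474.
def print_step(message: str) -> None:
    # Strategy-agnostic progress banner instead of a hard-coded "Phase N".
    print(f"\n=== {message} ===")


print_step("Generating models")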
-    failed_builds = [
-        project for project, db_dir in database_results if db_dir is None
-    ]
+    failed_builds = [project for project, db_dir in database_results if db_dir is None]
     if failed_builds:
         print(
             f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
@@ -406,15 +448,36 @@ def main(config, args) -> None:
         if database_dir is not None:
             generate_models(args, project, database_dir)

+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True)
-    parser.add_argument("--dca", type=str, help="Name of a DCA run that built all the projects", required=False)
-    parser.add_argument("--pat", type=str, help="PAT token to grab DCA databases (the same as the one you use for DCA)", required=False)
-    parser.add_argument("--lang", type=str, help="The language to generate models for", required=True)
-    parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False)
-    parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False)
-    parser.add_argument("--with-summaries", action="store_true", help="Generate sinks", required=False)
+    parser.add_argument(
+        "--config", type=str, help="Path to the configuration file.", required=True
+    )
+    parser.add_argument(
+        "--dca",
+        type=str,
+        help="Name of a DCA run that built all the projects",
+        required=False,
+    )
+    parser.add_argument(
+        "--pat",
+        type=str,
+        help="PAT token to grab DCA databases (the same as the one you use for DCA)",
+        required=False,
+    )
+    parser.add_argument(
+        "--lang", type=str, help="The language to generate models for", required=True
+    )
+    parser.add_argument(
+        "--with-sources", action="store_true", help="Generate sources", required=False
+    )
+    parser.add_argument(
+        "--with-sinks", action="store_true", help="Generate sinks", required=False
+    )
+    parser.add_argument(
+        "--with-summaries", action="store_true", help="Generate sinks", required=False
+    )
     args = parser.parse_args()

     # Load config file
Reviewer: What about a slightly more descriptive name?
Author: Good idea! Fixed in 566bf43
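For orientation, a hedged sketch of how the new flags might be combined for a DCA-backed run; the script name, config file, experiment name, and token variable below are placeholders, not values taken from this PR.

# Hypothetical end-to-end invocation; every concrete value below is a placeholder.
import os
import subprocess

subprocess.run(
    [
        "python",
        "bulk_generate_mad.py",               # placeholder script name
        "--config", "projects.yml",           # placeholder config file
        "--lang", "java",                     # language to generate models for
        "--dca", "my-dca-experiment",         # placeholder DCA run name
        "--pat", os.environ["GITHUB_TOKEN"],  # placeholder source for the PAT
        "--with-summaries",
    ],
    check=True,
)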