Bulk MAD generator: Support databases from DCA runs by MathiasVP · Pull Request #19627 · github/codeql

Bulk MAD generator: Support databases from DCA runs #19627


Merged: 22 commits, merged May 30, 2025

Changes from 1 commit

Commits (22)
cb0b566  C++: Put autogenerated models in the same folder structure as Rust. (MathiasVP, May 29, 2025)
40d937a  Bulk generator: Some imports we will need. (MathiasVP, May 29, 2025)
b87ba31  Bulk generator: Get rid of the hardcoded project list and move it int… (MathiasVP, May 29, 2025)
6ff2beb  Bulk generator: Add command-line arguments. (MathiasVP, May 29, 2025)
e721fc0  Bulk generator: Prepare for adding DCA support. This commits just gen… (MathiasVP, May 29, 2025)
5051790  Bulk generator: Add DCA support. (MathiasVP, May 29, 2025)
cb93870  Bulk generator: Rename file since it is no longer Rust specific. (MathiasVP, May 29, 2025)
7ecf8c8  Bulk generator: Format file and add a note at the top of the file spe… (MathiasVP, May 30, 2025)
566bf43  Bulk generator: Rename 'github' to 'get_json_from_github'. (MathiasVP, May 30, 2025)
b640474  Bulk generator: Remove 'Phase' part of log message. (MathiasVP, May 30, 2025)
5d79a8d  Update misc/scripts/models-as-data/bulk_generate_mad.py (MathiasVP, May 30, 2025)
7c89d6d  Bulk generator: Rename 'get_destination_for_project' to 'get_mad_dest… (MathiasVP, May 30, 2025)
7121f5c  Bulk generator: Use the 'Project' type throughout the file. (MathiasVP, May 30, 2025)
fc165db  Bulk generator: Specify 'with-summaries', 'with-sources', and 'with-s… (MathiasVP, May 30, 2025)
1228080  Bulk generator: Specify 'language' in the config file. (MathiasVP, May 30, 2025)
7c2612a  Bulk generator: Specify a path to the PAT instead of the PAT itself. (MathiasVP, May 30, 2025)
3ddca32  Update misc/scripts/models-as-data/bulk_generate_mad.py (MathiasVP, May 30, 2025)
cdd869a  Bulk generator: Autoformat. (MathiasVP, May 30, 2025)
bdf411a  Bulk generator: Make 'database_results' a map to simplify away the ex… (MathiasVP, May 30, 2025)
3444c98  Bulk generator: Fix field name. (MathiasVP, May 30, 2025)
0f30644  Bulk generator: Snake case things. (MathiasVP, May 30, 2025)
7cb9024  Bulk generator: Flip default values for summaries, sources, and sinks. (MathiasVP, May 30, 2025)

Bulk generator: Format file and add a note at the top of the file specifying the formatting requirements.
MathiasVP committed May 30, 2025
commit 7ecf8c8ea2d11f0a9bfcca86a55b12827e38314c
171 changes: 117 additions & 54 deletions misc/scripts/models-as-data/bulk_generate_mad.py
@@ -1,5 +1,7 @@
"""
Experimental script for bulk generation of MaD models based on a list of projects.

Note: This file must be formatted using the Black Python formatter.
"""

import os.path
@@ -24,6 +26,7 @@
)
build_dir = os.path.join(gitroot, "mad-generation-build")


# A project to generate models for
class Project(TypedDict):
"""
@@ -132,7 +135,9 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
return project_dirs


def build_database(language: str, extractor_options, project: Project, project_dir: str) -> str | None:
def build_database(
language: str, extractor_options, project: Project, project_dir: str
) -> str | None:
"""
Build a CodeQL database for a project.

@@ -179,6 +184,7 @@ def build_database(language: str, extractor_options, project: Project, project_d

return database_dir


def generate_models(args, name: str, database_dir: str) -> None:
"""
Generate models for a project.
@@ -196,7 +202,10 @@ def generate_models(args, name: str, database_dir: str) -> None:
generator.setenvironment(database=database_dir, folder=name)
generator.run()

def build_databases_from_projects(language: str, extractor_options, projects: List[Project]) -> List[tuple[str, str | None]]:

def build_databases_from_projects(
language: str, extractor_options, projects: List[Project]
) -> List[tuple[str, str | None]]:
"""
Build databases for all projects in parallel.

@@ -215,11 +224,15 @@ def build_databases_from_projects(language: str, extractor_options, projects: Li
# Phase 2: Build databases for all projects
print("\n=== Phase 2: Building databases ===")
database_results = [
(project["name"], build_database(language, extractor_options, project, project_dir))
(
project["name"],
build_database(language, extractor_options, project, project_dir),
)
for project, project_dir in project_dirs
]
return database_results


def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
Contributor: What about a slightly more descriptive name?

Suggested change
def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
def get_json_from_github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:

MathiasVP (Author): Good idea! Fixed in 566bf43

"""
Download a JSON file from GitHub using a personal access token (PAT).
@@ -230,14 +243,15 @@ def github(url: str, pat: str, extra_headers: dict[str, str] = {}) -> dict:
Returns:
The JSON response as a dictionary.
"""
headers = { "Authorization": f"token {pat}" } | extra_headers
headers = {"Authorization": f"token {pat}"} | extra_headers
response = requests.get(url, headers=headers)
if response.status_code != 200:
print(f"Failed to download JSON: {response.status_code} {response.text}")
sys.exit(1)
else:
return response.json()


def download_artifact(url: str, artifact_name: str, pat: str) -> str:
"""
Download a GitHub Actions artifact from a given URL.
@@ -248,7 +262,7 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
Returns:
The path to the downloaded artifact file.
"""
headers = { "Authorization": f"token {pat}", "Accept": "application/vnd.github+json" }
headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github+json"}
response = requests.get(url, stream=True, headers=headers)
zipName = artifact_name + ".zip"
if response.status_code == 200:
@@ -262,15 +276,20 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
print(f"Failed to download file. Status code: {response.status_code}")
sys.exit(1)


def remove_extension(filename: str) -> str:
while "." in filename:
filename, _ = os.path.splitext(filename)
return filename


def pretty_name_from_artifact_name(artifact_name: str) -> str:
return artifact_name.split("___")[1]

def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tuple[str, str | None]]:

def download_dca_databases(
experiment_name: str, pat: str, projects
) -> List[tuple[str, str | None]]:
"""
Download databases from a DCA experiment.
Args:
@@ -282,58 +301,81 @@ def download_dca_databases(experiment_name: str, pat: str, projects) -> List[tup
"""
database_results = []
print("\n=== Finding projects ===")
response = github(f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", pat)
response = github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
pat,
)
targets = response["targets"]
for target, data in targets.items():
downloads = data["downloads"]
analyzed_database = downloads["analyzed_database"]
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in [project["name"] for project in projects]:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
response = github(f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts", pat, { "Accept": "application/vnd.github+json" })
artifacts = response["artifacts"]
artifact_map = {artifact["name"]: artifact for artifact in artifacts}
print(f"=== Downloading artifact: {artifact_name} ===")
archive_download_url = artifact_map[artifact_name]["archive_download_url"]
artifact_zip_location = download_artifact(archive_download_url, artifact_name, pat)
print(f"=== Extracting artifact: {artifact_name} ===")
# The database is in a zip file, which contains a tar.gz file with the DB
# First we open the zip file
with zipfile.ZipFile(artifact_zip_location, 'r') as zip_ref:
artifact_unzipped_location = os.path.join(build_dir, artifact_name)
# And then we extract it to build_dir/artifact_name
zip_ref.extractall(artifact_unzipped_location)
# And then we iterate over the contents of the extracted directory
# and extract the tar.gz files inside it
for entry in os.listdir(artifact_unzipped_location):
artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
database_results.append((pretty_name, os.path.join(artifact_unzipped_location, remove_extension(entry))))
downloads = data["downloads"]
analyzed_database = downloads["analyzed_database"]
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in [project["name"] for project in projects]:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
response = github(
f"https://api.github.com/repos/{repository}/actions/runs/{run_id}/artifacts",
pat,
{"Accept": "application/vnd.github+json"},
)
artifacts = response["artifacts"]
artifact_map = {artifact["name"]: artifact for artifact in artifacts}
print(f"=== Downloading artifact: {artifact_name} ===")
archive_download_url = artifact_map[artifact_name]["archive_download_url"]
artifact_zip_location = download_artifact(
archive_download_url, artifact_name, pat
)
print(f"=== Extracting artifact: {artifact_name} ===")
# The database is in a zip file, which contains a tar.gz file with the DB
# First we open the zip file
with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
artifact_unzipped_location = os.path.join(build_dir, artifact_name)
# And then we extract it to build_dir/artifact_name
zip_ref.extractall(artifact_unzipped_location)
# And then we iterate over the contents of the extracted directory
# and extract the tar.gz files inside it
for entry in os.listdir(artifact_unzipped_location):
artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
database_results.append(
(
pretty_name,
os.path.join(
artifact_unzipped_location, remove_extension(entry)
),
)
)
print(f"\n=== Extracted {len(database_results)} databases ===")

def compare(a, b):
a_index = next(i for i, project in enumerate(projects) if project["name"] == a[0])
b_index = next(i for i, project in enumerate(projects) if project["name"] == b[0])
a_index = next(
i for i, project in enumerate(projects) if project["name"] == a[0]
)
b_index = next(
i for i, project in enumerate(projects) if project["name"] == b[0]
)
return a_index - b_index

# Sort the database results based on the order in the projects file
return sorted(database_results, key=cmp_to_key(compare))
Contributor: What about making database_results a map and picking the results out of the map based on projects? Similar to what's done in clone_projects:

    project_dirs = [project_dirs_map[project["name"]] for project in projects]

That would avoid the gnarly compare function.

MathiasVP (Author): Wonderful idea! Fixed in bdf411a
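
A minimal sketch of the map-based approach described above (illustrative only, not the actual code from commit bdf411a; the names database_results_map and ordered_results are assumed here):

    # Key the per-project results by name instead of keeping a list, then
    # read them back out in the order of the `projects` list. This removes
    # the need for the custom `compare` function and `cmp_to_key`.
    database_results_map = {name: db_dir for name, db_dir in database_results}
    ordered_results = [
        (project["name"], database_results_map[project["name"]])
        for project in projects
        if project["name"] in database_results_map
    ]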




def get_destination_for_project(config, name: str) -> str:
Contributor: I think "destination" by itself is a bit ambiguous.

Suggested change
def get_destination_for_project(config, name: str) -> str:
def get_mad_destination_for_project(config, name: str) -> str:

MathiasVP (Author): Fixed in 7c89d6d

return os.path.join(config["destination"], name)


def get_strategy(config) -> str:
return config["strategy"].lower()


def main(config, args) -> None:
"""
Main function to handle the bulk generation of MaD models.
@@ -371,7 +413,9 @@ def main(config, args) -> None:
match get_strategy(config):
case "repo":
extractor_options = config.get("extractor_options", [])
database_results = build_databases_from_projects(language, extractor_options, projects)
database_results = build_databases_from_projects(
language, extractor_options, projects
)
case "dca":
experiment_name = args.dca
if experiment_name is None:
@@ -386,9 +430,7 @@ def main(config, args) -> None:
# Phase 3: Generate models for all projects
print("\n=== Phase 3: Generating models ===")
Contributor: Calling this "Phase 3" doesn't really make sense anymore as it depends on the strategy.

Suggested change
print("\n=== Phase 3: Generating models ===")
print("\n=== Generating models ===")

Same for the two other "Phase"s being printed above.

MathiasVP (Author): Fixed in b640474


failed_builds = [
project for project, db_dir in database_results if db_dir is None
]
failed_builds = [project for project, db_dir in database_results if db_dir is None]
if failed_builds:
print(
f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
@@ -406,15 +448,36 @@ def main(config, args) -> None:
if database_dir is not None:
generate_models(args, project, database_dir)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, help="Path to the configuration file.", required=True)
parser.add_argument("--dca", type=str, help="Name of a DCA run that built all the projects", required=False)
parser.add_argument("--pat", type=str, help="PAT token to grab DCA databases (the same as the one you use for DCA)", required=False)
parser.add_argument("--lang", type=str, help="The language to generate models for", required=True)
parser.add_argument("--with-sources", action="store_true", help="Generate sources", required=False)
parser.add_argument("--with-sinks", action="store_true", help="Generate sinks", required=False)
parser.add_argument("--with-summaries", action="store_true", help="Generate sinks", required=False)
parser.add_argument(
"--config", type=str, help="Path to the configuration file.", required=True
)
parser.add_argument(
"--dca",
type=str,
help="Name of a DCA run that built all the projects",
required=False,
)
parser.add_argument(
"--pat",
type=str,
help="PAT token to grab DCA databases (the same as the one you use for DCA)",
required=False,
)
parser.add_argument(
"--lang", type=str, help="The language to generate models for", required=True
)
parser.add_argument(
"--with-sources", action="store_true", help="Generate sources", required=False
)
parser.add_argument(
"--with-sinks", action="store_true", help="Generate sinks", required=False
)
parser.add_argument(
"--with-summaries", action="store_true", help="Generate sinks", required=False
)
args = parser.parse_args()

# Load config file
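
For context, a hypothetical configuration covering the keys this script reads in this version of the file ("strategy", "destination", "extractor_options", and a "projects" list whose entries carry at least a "name"); the concrete file format and any additional per-project fields are not shown in this diff, so the values below are purely illustrative:

    # Hypothetical config shape; only keys referenced in this diff are shown.
    config = {
        "strategy": "repo",  # or "dca" to download databases from a DCA run
        "destination": "path/to/generated/models",  # assumed output folder
        "extractor_options": [],  # only read by the "repo" strategy
        "projects": [
            {"name": "some-project"},  # real entries likely need more fields
        ],
    }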