Bulk MAD generator: Support databases from DCA runs by MathiasVP · Pull Request #19627 · github/codeql · GitHub
[go: up one dir, main page]

Skip to content

Bulk MAD generator: Support databases from DCA runs #19627

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
May 30, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
cb0b566
C++: Put autogenerated models in the same folder structure as Rust.
MathiasVP May 29, 2025
40d937a
Bulk generator: Some imports we will need.
MathiasVP May 29, 2025
b87ba31
Bulk generator: Get rid of the hardcoded project list and move it int…
MathiasVP May 29, 2025
6ff2beb
Bulk generator: Add command-line arguments.
MathiasVP May 29, 2025
e721fc0
Bulk generator: Prepare for adding DCA support. This commits just gen…
MathiasVP May 29, 2025
5051790
Bulk generator: Add DCA support.
MathiasVP May 29, 2025
cb93870
Bulk generator: Rename file since it is no longer Rust specific.
MathiasVP May 29, 2025
7ecf8c8
Bulk generator: Format file and add a note at the top of the file spe…
MathiasVP May 30, 2025
566bf43
Bulk generator: Rename 'github' to 'get_json_from_github'.
MathiasVP May 30, 2025
b640474
Bulk generator: Remove 'Phase' part of log message.
MathiasVP May 30, 2025
5d79a8d
Update misc/scripts/models-as-data/bulk_generate_mad.py
MathiasVP May 30, 2025
7c89d6d
Bulk generator: Rename 'get_destination_for_project' to 'get_mad_dest…
MathiasVP May 30, 2025
7121f5c
Bulk generator: Use the 'Project' type throughout the file.
MathiasVP May 30, 2025
fc165db
Bulk generator: Specify 'with-summaries', 'with-sources', and 'with-s…
MathiasVP May 30, 2025
1228080
Bulk generator: Specify 'language' in the config file.
MathiasVP May 30, 2025
7c2612a
Bulk generator: Specify a path to the PAT instead of the PAT itself.
MathiasVP May 30, 2025
3ddca32
Update misc/scripts/models-as-data/bulk_generate_mad.py
MathiasVP May 30, 2025
cdd869a
Bulk generator: Autoformat.
MathiasVP May 30, 2025
bdf411a
Bulk generator: Make 'database_results' a map to simplify away the ex…
MathiasVP May 30, 2025
3444c98
Bulk generator: Fix field name.
MathiasVP May 30, 2025
0f30644
Bulk generator: Snake case things.
MathiasVP May 30, 2025
7cb9024
Bulk generator: Flip default values for summaries, sources, and sinks.
MathiasVP May 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Bulk generator: Prepare for adding DCA support. This commit just generalizes the existing functionality to be independent of Rust and instead depend on the configuration file and the command-line arguments.
  • Loading branch information
MathiasVP committed May 29, 2025
commit e721fc07aaef1ef91bff825dc95aa5c18176e3b4
103 changes: 63 additions & 40 deletions misc/scripts/models-as-data/rust_bulk_generate_mad.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,10 @@
)
build_dir = os.path.join(gitroot, "mad-generation-build")


def path_to_mad_directory(language: str, name: str) -> str:
return os.path.join(gitroot, f"{language}/ql/lib/ext/generated/{name}")


# A project to generate models for
class Project(TypedDict):
"""
Type definition for Rust projects to model.
Type definition for projects (acquired via a GitHub repo) to model.

Attributes:
name: The name of the project
Expand Down Expand Up @@ -139,13 +134,15 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
return project_dirs


def build_database(project: Project, project_dir: str) -> str | None:
def build_database(language: str, extractor_options, project: Project, project_dir: str) -> str | None:
"""
Build a CodeQL database for a project.

Args:
language: The language for which to build the database (e.g., "rust").
extractor_options: Additional options for the extractor.
project: A dictionary containing project information with 'name' and 'git_repo' keys.
project_dir: The directory containing the project source code.
project_dir: Path to the CodeQL database.

Returns:
The path to the created database directory.
Expand All @@ -158,17 +155,17 @@ def build_database(project: Project, project_dir: str) -> str | None:
# Only build the database if it doesn't already exist
if not os.path.exists(database_dir):
print(f"Building CodeQL database for {name}...")
extractor_options = [option for x in extractor_options for option in ("-O", x)]
try:
subprocess.check_call(
[
"codeql",
"database",
"create",
"--language=rust",
f"--language={language}",
"--source-root=" + project_dir,
"--overwrite",
"-O",
"cargo_features='*'",
*extractor_options,
"--",
database_dir,
]
Expand All @@ -184,40 +181,72 @@ def build_database(project: Project, project_dir: str) -> str | None:

return database_dir


def generate_models(args, name: str, database_dir: str) -> None:
    """
    Generate models for a project.

    Args:
        args: Command line arguments passed to this script.
        name: The name of the project.
        database_dir: Path to the CodeQL database.
    """
    # The language and the kinds of models to generate come from the command
    # line instead of being fixed to Rust with everything enabled.
    generator = mad.Generator(args.lang)
    generator.generateSinks = args.with_sinks
    generator.generateSources = args.with_sources
    generator.generateSummaries = args.with_summaries
    generator.setenvironment(database=database_dir, folder=name)
    generator.run()

def build_databases_from_projects(
    language: str, extractor_options, projects: List[Project]
) -> List[tuple[str, str | None]]:
    """
    Clone every project and build a CodeQL database for each clone.

    Cloning is delegated to `clone_projects`; the databases are then built
    one after another by calling `build_database` on each cloned project.

    Args:
        language: The language for which to build the databases (e.g., "rust").
        extractor_options: Additional options for the extractor.
        projects: List of projects to build databases for.

    Returns:
        List of (project_name, database_dir) pairs, where database_dir is None if the build failed.
    """
    # Phase 1: Clone projects
    print("=== Phase 1: Cloning projects ===")
    project_dirs = clone_projects(projects)

    # Phase 2: Build databases for all projects
    print("\n=== Phase 2: Building databases ===")
    return [
        (project["name"], build_database(language, extractor_options, project, project_dir))
        for project, project_dir in project_dirs
    ]

def get_destination_for_project(config, name: str) -> str:
    """
    Return the directory that holds the generated models for one project.

    Args:
        config: Configuration dictionary; its "destination" entry is the base directory.
        name: The name of the project.

    Returns:
        The "destination" directory joined with the project name.
    """
    return os.path.join(config["destination"], name)

def get_strategy(config) -> str:
    """
    Return the database acquisition strategy named in the configuration.

    Args:
        config: Configuration dictionary containing a "strategy" entry.

    Returns:
        The "strategy" value converted to lower case, so that callers can
        compare it without regard to how it was capitalized in the file.
    """
    return config["strategy"].lower()

def main() -> None:
def main(config, args) -> None:
"""
Process all projects in three distinct phases:
1. Clone projects (in parallel)
2. Build databases for projects
3. Generate models for successful database builds
Main function to handle the bulk generation of MaD models.
Args:
config: Configuration dictionary containing project details and other settings.
args: Command line arguments passed to this script.
"""

projects = config["targets"]
destination = config["destination"]
language = args.lang

# Create build directory if it doesn't exist
if not os.path.exists(build_dir):
os.makedirs(build_dir)

# Check if any of the MaD directories contain working directory changes in git
for project in projects:
mad_dir = path_to_mad_directory("rust", project["name"])
mad_dir = get_destination_for_project(config, project["name"])
if os.path.exists(mad_dir):
git_status_output = subprocess.check_output(
["git", "status", "-s", mad_dir], text=True
Expand All @@ -232,22 +261,17 @@ def main() -> None:
)
sys.exit(1)

# Phase 1: Clone projects in parallel
print("=== Phase 1: Cloning projects ===")
project_dirs = clone_projects(projects)

# Phase 2: Build databases for all projects
print("\n=== Phase 2: Building databases ===")
database_results = [
(project, build_database(project, project_dir))
for project, project_dir in project_dirs
]
database_results = []
match get_strategy(config):
case "repo":
extractor_options = config.get("extractor_options", [])
database_results = build_databases_from_projects(language, extractor_options, projects)

# Phase 3: Generate models for all projects
print("\n=== Phase 3: Generating models ===")

failed_builds = [
project["name"] for project, db_dir in database_results if db_dir is None
project for project, db_dir in database_results if db_dir is None
]
if failed_builds:
print(
Expand All @@ -257,15 +281,14 @@ def main() -> None:

# Delete the MaD directory for each project
for project, database_dir in database_results:
mad_dir = path_to_mad_directory("rust", project["name"])
mad_dir = get_destination_for_project(config, project)
if os.path.exists(mad_dir):
print(f"Deleting existing MaD directory at {mad_dir}")
subprocess.check_call(["rm", "-rf", mad_dir])

for project, database_dir in database_results:
if database_dir is not None:
generate_models(project, database_dir)

generate_models(args, project, database_dir)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand Down
5 changes: 5 additions & 0 deletions rust/misc/bulk_generation_targets.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"strategy": "repo",
"targets": [
{
"name": "libc",
Expand Down Expand Up @@ -65,5 +66,9 @@
"git_repo": "https://github.com/clap-rs/clap",
"git_tag": "v4.5.38"
}
],
"destination": "rust/ql/lib/ext/generated",
"extractor_options": [
"cargo_features='*'"
]
}
0