[CI] test upload: better check for if job is rerun disabled tests (#1… · pytorch/pytorch@2978771 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2978771

Browse files
clee2000 authored and
pytorchmergebot committed
[CI] test upload: better check for if job is rerun disabled tests (#148027)
Some disabled test runs weren't being uploaded as disabled tests because some dynamo tests are set to mark themselves as skipped if they are failing. This makes the script think that there are fewer retries than there actually are and that the job is not a rerun disabled tests job. Instead, query for the job name to see if it contains rerun disabled tests and fall back to counting the number of retries if querying fails Alternate options: relax the check for the number of tests Pull Request resolved: #148027 Approved by: https://github.com/huydhn
1 parent fc78192 commit 2978771

File tree

2 files changed

+59
-5
lines changed

2 files changed

+59
-5
lines changed

tools/stats/check_disabled_tests.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,9 @@ def main(repo: str, workflow_run_id: int, workflow_run_attempt: int) -> None:
245245

246246
# The scheduled workflow has both rerun disabled tests and memory leak check jobs.
247247
# We are only interested in the former here
248-
if not is_rerun_disabled_tests(tests):
248+
if not is_rerun_disabled_tests(
249+
report, workflow_run_id, workflow_run_attempt, tests
250+
):
249251
continue
250252

251253
for name, stats in tests.items():

tools/stats/upload_stats_lib.py

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import zipfile
1010
from functools import lru_cache
1111
from pathlib import Path
12-
from typing import Any, Callable, Optional
12+
from typing import Any, Callable, cast, Optional
1313

1414
import boto3 # type: ignore[import]
1515
import requests
@@ -245,15 +245,24 @@ def unzip(p: Path) -> None:
245245
zip.extractall(unzipped_dir)
246246

247247

248-
def is_rerun_disabled_tests(tests: dict[str, dict[str, int]]) -> bool:
248+
def is_rerun_disabled_tests(
249+
report: Path,
250+
workflow_run_id: int,
251+
workflow_run_attempt: int,
252+
tests: dict[str, dict[str, int]],
253+
) -> bool:
249254
"""
250255
Check if the test report is coming from rerun_disabled_tests workflow where
251256
each test is run multiple times
252257
"""
253-
return all(
258+
if all(
254259
t.get("num_green", 0) + t.get("num_red", 0) > MAX_RETRY_IN_NON_DISABLED_MODE
255260
for t in tests.values()
256-
)
261+
):
262+
return True
263+
job_id = get_job_id(report)
264+
job_name = get_job_name(job_id, workflow_run_id, workflow_run_attempt)
265+
return job_name is not None and "rerun_disabled_tests" in job_name
257266

258267

259268
def get_job_id(report: Path) -> int | None:
@@ -266,3 +275,46 @@ def get_job_id(report: Path) -> int | None:
266275
return int(report.parts[0].rpartition("_")[2])
267276
except ValueError:
268277
return None
278+
279+
280+
@lru_cache
281+
def get_job_name(
282+
id: int | None, workflow_id: int | None, workflow_run_attempt: int | None
283+
) -> str | None:
284+
if id is None:
285+
return None
286+
try:
287+
if workflow_id is None:
288+
response = requests.get(
289+
f"{PYTORCH_REPO}/actions/jobs/{id}",
290+
headers=_get_request_headers(),
291+
)
292+
if response.status_code != 200:
293+
return None
294+
return cast(str, response.json()["name"])
295+
else:
296+
297+
@lru_cache
298+
def _get_jobs(workflow_id: int) -> dict[int, str]:
299+
jobs: dict[int, str] = {}
300+
# Paginate
301+
page = 1
302+
while True:
303+
response = requests.get(
304+
f"{PYTORCH_REPO}/actions/runs/{workflow_id}/attempts/{workflow_run_attempt}/jobs",
305+
headers=_get_request_headers(),
306+
params={"page": page, "per_page": 100},
307+
)
308+
if response.status_code != 200:
309+
return jobs
310+
for job in response.json()["jobs"]:
311+
jobs[job["id"]] = job["name"]
312+
if "next" not in response.links:
313+
break
314+
page += 1
315+
return jobs
316+
317+
jobs = _get_jobs(workflow_id)
318+
return jobs[id]
319+
except Exception:
320+
return None

0 commit comments

Comments
 (0)
0