Move ROCm MI300 jobs to unstable to make CI green by ZainRizvi · Pull Request #145790 · pytorch/pytorch · GitHub
[go: up one dir, main page]

Skip to content

Move ROCm MI300 jobs to unstable to make CI green #145790

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 1 addition & 27 deletions .github/workflows/inductor-rocm-mi300.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,4 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-rocm6_3-py3_10-inductor-build:
name: rocm6.3-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.3-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

linux-focal-rocm6_3-py3_10-inductor-test:
permissions:
id-token: write
contents: read
name: rocm6.3-py3.10-inductor
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm6_3-py3_10-inductor-build
with:
build-environment: linux-focal-rocm6.3-py3.10
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }}
secrets: inherit
# Build and test jobs moved to unstable.yml to make CI green. Jobs will be moved back when MI300 runners are better isolated from each other.
46 changes: 8 additions & 38 deletions .github/workflows/rocm-mi300.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,44 +15,14 @@ concurrency:
permissions: read-all

jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read

linux-focal-rocm6_3-py3_10-build:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-focal-rocm6.3-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm6.3-py3.10
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: what is the point of doing something like this for a workflow that cannot be triggered by ciflow/XYZ labels and, as such, is not merge-blocking?

docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-rocm6_3-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm6.3-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm6_3-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm6.3-py3.10
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }}
secrets: inherit
# Build and test jobs moved to unstable.yml to make CI green. Jobs will be moved back when MI300 runners are better isolated from each other.
86 changes: 86 additions & 0 deletions .github/workflows/unstable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,89 @@ jobs:
echo
echo "Once the jobs are deemed stable enough (% red signal < 5% and TTS < 3h),"
echo " they can graduate and move back to pull or trunk."

target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read

get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

# Moved from rocm-mi300.yml

linux-focal-rocm6_3-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-focal-rocm6.3-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.3-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

linux-focal-rocm6_3-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm6.3-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm6_3-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm6.3-py3.10
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }}
secrets: inherit

# Moved from inductor-rocm-mi300.yml

linux-focal-rocm6_3-py3_10-inductor-build:
name: rocm6.3-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.3-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

linux-focal-rocm6_3-py3_10-inductor-test:
permissions:
id-token: write
contents: read
name: rocm6.3-py3.10-inductor
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm6_3-py3_10-inductor-build
with:
build-environment: linux-focal-rocm6.3-py3.10
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }}
secrets: inherit
Loading
0