Update base for Update on "[Cutlass] Integrate EVT into CUDACPPSchedu… · pytorch/pytorch@74f9e06 · GitHub
[go: up one dir, main page]

Skip to content

Commit 74f9e06

Browse files
committed
Update base for Update on "[Cutlass] Integrate EVT into CUDACPPScheduling"
Previously merged: * #151713 * #151405 * #150905 * #152306 * #152305 Allow epilogue nodes in cuda combined scheduling cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov [ghstack-poisoned]
2 parents 1f9858c + 5e96827 commit 74f9e06

File tree

196 files changed

+4167
-1779
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

196 files changed

+4167
-1779
lines changed

.ci/docker/build.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,11 +241,11 @@ case "$image" in
241241
CONDA_CMAKE=yes
242242
TRITON=yes
243243
;;
244-
pytorch-linux-focal-rocm-n-1-py3)
244+
pytorch-linux-jammy-rocm-n-1-py3)
245245
ANACONDA_PYTHON_VERSION=3.10
246246
GCC_VERSION=11
247247
VISION=yes
248-
ROCM_VERSION=6.2.4
248+
ROCM_VERSION=6.3
249249
NINJA_VERSION=1.9.0
250250
CONDA_CMAKE=yes
251251
TRITON=yes
@@ -254,11 +254,11 @@ case "$image" in
254254
UCC_COMMIT=${_UCC_COMMIT}
255255
INDUCTOR_BENCHMARKS=yes
256256
;;
257-
pytorch-linux-focal-rocm-n-py3)
257+
pytorch-linux-jammy-rocm-n-py3)
258258
ANACONDA_PYTHON_VERSION=3.10
259259
GCC_VERSION=11
260260
VISION=yes
261-
ROCM_VERSION=6.3
261+
ROCM_VERSION=6.4
262262
NINJA_VERSION=1.9.0
263263
CONDA_CMAKE=yes
264264
TRITON=yes

.ci/docker/common/install_rocm.sh

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,25 @@ EOF
6666
done
6767

6868
# ROCm 6.3 had a regression where initializing static code objects had significant overhead
69-
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
69+
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
70+
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]] || [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
71+
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
72+
HIP_BRANCH=rocm-6.3.x
73+
VER_STR=6.3
74+
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
75+
HIP_BRANCH=release/rocm-rel-6.4
76+
VER_STR=6.4
77+
fi
7078
# clr build needs CppHeaderParser but can only find it using conda's python
7179
/opt/conda/bin/python -m pip install CppHeaderParser
72-
git clone https://github.com/ROCm/HIP -b rocm-6.3.x
80+
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
7381
HIP_COMMON_DIR=$(readlink -f HIP)
74-
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix
82+
git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}-statco-hotfix
7583
mkdir -p clr/build
7684
pushd clr/build
7785
cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
7886
make -j
79-
cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.*
87+
cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
8088
popd
8189
rm -rf HIP clr
8290
fi

.github/actions/setup-rocm/action.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ description: Set up ROCm host for CI
55
runs:
66
using: composite
77
steps:
8+
- name: Runner ROCm version
9+
if: always()
10+
shell: bash
11+
run: |
12+
dpkg -l | grep -E " rocm"
13+
814
- name: Stop all running docker containers
915
if: always()
1016
shell: bash

.github/label_to_label.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
- "module: aotinductor"
4343
- "module: cudagraphs"
4444
- "oncall: export"
45-
- "module: startup-tracing-compile"
45+
- "module: compile-time"
4646
- "module: compiled autograd"
4747
- "module: flex attention"
4848
- "module: dynamic shapes"

.github/workflows/_link_check.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ jobs:
3030
&& git diff --name-only ${{ github.event.before }} ${{ github.sha }}
3131
) || {
3232
echo
33-
echo "URL lint failed. If this is a transient outage, you can bypass it by adding the \`skip-url-lint\` label to your PR."
33+
echo "URL lint failed."
34+
echo "If this is a transient outage, you can bypass it by adding the \`skip-url-lint\` label to your PR."
35+
echo "Or add \`@lint-ignore\` somewhere on the same line as the URL you want to skip checking."
3436
exit 1
3537
}
3638

.github/workflows/_linux-test.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,18 @@ on:
5555
required: false
5656
type: boolean
5757
default: false
58+
monitor-log-interval:
59+
description: |
60+
Set the interval for the monitor script to log utilization.
61+
required: false
62+
type: number
63+
default: 5
64+
monitor-data-collect-interval:
65+
description: |
66+
Set the interval for the monitor script to collect data.
67+
required: false
68+
type: number
69+
default: 1
5870
secrets:
5971
HUGGING_FACE_HUB_TOKEN:
6072
required: false
@@ -172,9 +184,11 @@ jobs:
172184
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
173185
WORKFLOW_NAME: ${{ github.workflow }}
174186
WORKFLOW_RUN_ID: ${{github.run_id}}
187+
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
188+
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
175189
run: |
176-
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 dataclasses_json==0.6.7
177-
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
190+
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
191+
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
178192
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
179193
180194
- name: Download build artifacts

.github/workflows/_mac-test.yml

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,18 @@ on:
3838
required: false
3939
type: boolean
4040
default: true
41+
monitor-log-interval:
42+
description: |
43+
Set the interval for the monitor script to log utilization.
44+
required: false
45+
type: number
46+
default: 5
47+
monitor-data-collect-interval:
48+
description: |
49+
Set the interval for the monitor script to collect data.
50+
required: false
51+
type: number
52+
default: 1
4153
secrets:
4254
HUGGING_FACE_HUB_TOKEN:
4355
required: false
@@ -93,12 +105,27 @@ jobs:
93105
- name: Checkout PyTorch
94106
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
95107

108+
- name: Get workflow job id
109+
id: get-job-id
110+
uses: ./.github/actions/get-workflow-job-id
111+
if: always()
112+
with:
113+
github-token: ${{ secrets.GITHUB_TOKEN }}
114+
96115
- name: Start monitoring script
97116
id: monitor-script
98117
if: ${{ !inputs.disable-monitor }}
99118
continue-on-error: true
119+
env:
120+
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
121+
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
122+
WORKFLOW_NAME: ${{ github.workflow }}
123+
WORKFLOW_RUN_ID: ${{github.run_id}}
124+
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
125+
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
100126
run: |
101-
${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
127+
${CONDA_RUN} python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
128+
${CONDA_RUN} python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
102129
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
103130
104131
- name: Download build artifacts
@@ -124,13 +151,6 @@ jobs:
124151
id: parse-ref
125152
run: .github/scripts/parse_ref.py
126153

127-
- name: Get workflow job id
128-
id: get-job-id
129-
uses: ./.github/actions/get-workflow-job-id
130-
if: always()
131-
with:
132-
github-token: ${{ secrets.GITHUB_TOKEN }}
133-
134154
- name: Check for keep-going label and re-enabled test issues
135155
# This uses the filter-test-configs action because it conveniently
136156
# checks for labels and re-enabled test issues. It does not actually do
@@ -237,6 +257,17 @@ jobs:
237257
schema-version: v3
238258
github-token: ${{ secrets.GITHUB_TOKEN }}
239259

260+
- name: Upload utilization stats
261+
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
262+
continue-on-error: true
263+
uses: ./.github/actions/upload-utilization-stats
264+
with:
265+
job_id: ${{ steps.get-job-id.outputs.job-id }}
266+
job_name: ${{ steps.get-job-id.outputs.job-name }}
267+
workflow_name: ${{ github.workflow }}
268+
workflow_run_id: ${{github.run_id}}
269+
workflow_attempt: ${{github.run_attempt}}
270+
240271
- name: Clean up disk space
241272
if: always()
242273
continue-on-error: true

.github/workflows/_rocm-test.yml

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,18 @@ on:
5050
required: false
5151
type: boolean
5252
default: true
53-
53+
monitor-log-interval:
54+
description: |
55+
Set the interval for the monitor script to log utilization.
56+
required: false
57+
type: number
58+
default: 5
59+
monitor-data-collect-interval:
60+
description: |
61+
Set the interval for the monitor script to collect data.
62+
required: false
63+
type: number
64+
default: 1
5465
env:
5566
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
5667

@@ -101,14 +112,28 @@ jobs:
101112
with:
102113
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
103114

115+
- name: Get workflow job id
116+
id: get-job-id
117+
uses: ./.github/actions/get-workflow-job-id
118+
if: always()
119+
with:
120+
github-token: ${{ secrets.GITHUB_TOKEN }}
121+
104122
- name: Start monitoring script
105123
id: monitor-script
124+
env:
125+
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
126+
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
127+
WORKFLOW_NAME: ${{ github.workflow }}
128+
WORKFLOW_RUN_ID: ${{github.run_id}}
129+
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
130+
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
106131
if: ${{ !inputs.disable-monitor }}
107132
shell: bash
108133
continue-on-error: true
109134
run: |
110-
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
111-
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
135+
python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
136+
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
112137
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
113138
114139
- name: Download build artifacts
@@ -124,13 +149,6 @@ jobs:
124149
id: parse-ref
125150
run: .github/scripts/parse_ref.py
126151

127-
- name: Get workflow job id
128-
id: get-job-id
129-
uses: ./.github/actions/get-workflow-job-id
130-
if: always()
131-
with:
132-
github-token: ${{ secrets.GITHUB_TOKEN }}
133-
134152
- name: Check for keep-going label and re-enabled test issues
135153
# This uses the filter-test-configs action because it conveniently
136154
# checks for labels and re-enabled test issues. It does not actually do
@@ -309,5 +327,16 @@ jobs:
309327
schema-version: v3
310328
github-token: ${{ secrets.GITHUB_TOKEN }}
311329

330+
- name: Upload utilization stats
331+
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
332+
continue-on-error: true
333+
uses: ./.github/actions/upload-utilization-stats
334+
with:
335+
job_id: ${{ steps.get-job-id.outputs.job-id }}
336+
job_name: ${{ steps.get-job-id.outputs.job-name }}
337+
workflow_name: ${{ github.workflow }}
338+
workflow_run_id: ${{github.run_id}}
339+
workflow_attempt: ${{github.run_attempt}}
340+
312341
- name: Teardown ROCm
313342
uses: ./.github/actions/teardown-rocm

.github/workflows/_win-test.yml

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,18 @@ on:
3636
required: false
3737
type: boolean
3838
default: true
39-
39+
monitor-log-interval:
40+
description: |
41+
Set the interval for the monitor script to log utilization.
42+
required: false
43+
type: number
44+
default: 5
45+
monitor-data-collect-interval:
46+
description: |
47+
Set the interval for the monitor script to collect data.
48+
required: false
49+
type: number
50+
default: 1
4051
env:
4152
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
4253

@@ -106,14 +117,29 @@ jobs:
106117
set -eu
107118
python3 -m pip install 'xdoctest>=1.1.0'
108119
120+
- name: Get workflow job id
121+
id: get-job-id
122+
uses: ./.github/actions/get-workflow-job-id
123+
if: always()
124+
with:
125+
github-token: ${{ secrets.GITHUB_TOKEN }}
126+
109127
- name: Start monitoring script
110128
id: monitor-script
129+
env:
130+
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
131+
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
132+
WORKFLOW_NAME: ${{ github.workflow }}
133+
WORKFLOW_RUN_ID: ${{github.run_id}}
134+
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
135+
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
111136
shell: bash
112137
if: ${{ !inputs.disable-monitor }}
113138
continue-on-error: true
114139
run: |
115140
# Windows conda doesn't have python3 binary, only python, but it's python3
116-
${CONDA_RUN} python -m tools.stats.monitor > usage_log.txt 2>&1 &
141+
${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84
142+
${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
117143
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
118144
119145
- name: Download PyTorch Build Artifacts
@@ -131,13 +157,6 @@ jobs:
131157
continue-on-error: true
132158
uses: ./.github/actions/download-td-artifacts
133159

134-
- name: Get workflow job id
135-
id: get-job-id
136-
uses: ./.github/actions/get-workflow-job-id
137-
if: always()
138-
with:
139-
github-token: ${{ secrets.GITHUB_TOKEN }}
140-
141160
- name: Check for keep-going label and re-enabled test issues
142161
# This uses the filter-test-configs action because it conveniently
143162
# checks for labels and re-enabled test issues. It does not actually do
@@ -236,6 +255,17 @@ jobs:
236255
with:
237256
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
238257

258+
- name: Upload utilization stats
259+
if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
260+
continue-on-error: true
261+
uses: ./.github/actions/upload-utilization-stats
262+
with:
263+
job_id: ${{ steps.get-job-id.outputs.job-id }}
264+
job_name: ${{ steps.get-job-id.outputs.job-name }}
265+
workflow_name: ${{ github.workflow }}
266+
workflow_run_id: ${{github.run_id}}
267+
workflow_attempt: ${{github.run_attempt}}
268+
239269
- name: Parse ref
240270
id: parse-ref
241271
shell: bash

0 commit comments

Comments
 (0)
0