Update on "[dtensor] fix scaled dot product flash attention sharding" · pytorch/pytorch@5a8391c · GitHub

Commit 5a8391c

Update on "[dtensor] fix scaled dot product flash attention sharding"
cc H-Huang awgu kwen2501 wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o [ghstack-poisoned]
2 parents b12a04e + 20de9aa commit 5a8391c


180 files changed: +3196 −1822 lines changed


.ci/docker/manywheel/Dockerfile_2014

Lines changed: 0 additions & 153 deletions
This file was deleted.

.ci/docker/manywheel/build.sh

Lines changed: 2 additions & 1 deletion
@@ -121,7 +121,8 @@ fi
 (
     set -x
 
-    if [ "$(uname -m)" != "s390x" ]; then
+    # Only activate this if in CI
+    if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
         # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
         # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
         sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
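A note on the new guard: bash's `[ -v CI ]` only tests that the `CI` variable is *set*, not that it holds a truthy value. The sketch below is a rough Python analogue of the same condition, added here for illustration only; it is not part of the commit.

```python
# Rough Python analogue of the guard above: apply the Docker systemd patch only
# when not on s390x AND a CI variable is set (any value counts, mirroring `[ -v CI ]`).
import os
import platform


def should_patch_docker_service() -> bool:
    return platform.machine() != "s390x" and "CI" in os.environ


if should_patch_docker_service():
    print("Would patch /usr/lib/systemd/system/docker.service (LimitNOFILE=1048576).")
else:
    print("Skipping the Docker patch outside CI or on s390x.")
```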

.ci/manywheel/build_cuda.sh

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ case ${CUDA_VERSION} in
         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
         ;;
     12.6)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
         ;;
     12.4)

.ci/pytorch/smoke_test/smoke_test.py

Lines changed: 4 additions & 0 deletions
@@ -166,6 +166,10 @@ def test_cuda_gds_errors_captured() -> None:
     major_version = int(torch.version.cuda.split(".")[0])
     minor_version = int(torch.version.cuda.split(".")[1])
 
+    if target_os == "windows":
+        print(f"{target_os} is not supported for GDS smoke test")
+        return
+
     if major_version < 12 or (major_version == 12 and minor_version < 6):
         print("CUDA version is not supported for GDS smoke test")
         return
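The hunk above places a Windows skip in front of the existing CUDA-version gate, so the GDS smoke test only runs on non-Windows platforms with CUDA 12.6 or newer. A self-contained sketch of the combined skip logic follows; the function name and inputs are illustrative, not the real test code.

```python
# Sketch of the combined gating: no GDS smoke test on Windows, and only on CUDA >= 12.6.
def should_run_gds_smoke_test(target_os: str, cuda_version: str) -> bool:
    if target_os == "windows":
        return False  # GDS is not exercised on Windows
    major, minor = (int(part) for part in cuda_version.split(".")[:2])
    return major > 12 or (major == 12 and minor >= 6)


assert should_run_gds_smoke_test("linux", "12.6") is True
assert should_run_gds_smoke_test("linux", "12.4") is False
assert should_run_gds_smoke_test("windows", "12.8") is False
```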

.github/scripts/label_utils.py

Lines changed: 3 additions & 3 deletions
@@ -63,9 +63,9 @@ def gh_get_labels(org: str, repo: str) -> list[str]:
     update_labels(labels, info)
 
     last_page = get_last_page_num_from_header(header)
-    assert (
-        last_page > 0
-    ), "Error reading header info to determine total number of pages of labels"
+    assert last_page > 0, (
+        "Error reading header info to determine total number of pages of labels"
+    )
     for page_number in range(2, last_page + 1):  # skip page 1
         _, info = request_for_labels(prefix + f"&page={page_number}")
         update_labels(labels, info)

.github/scripts/trymerge.py

Lines changed: 9 additions & 1 deletion
@@ -1224,9 +1224,17 @@ def merge_changes(
         if not self.is_ghstack_pr():
             msg = self.gen_commit_message()
             pr_branch_name = f"__pull-request-{self.pr_num}__init__"
-            repo.fetch(f"pull/{self.pr_num}/head", pr_branch_name)
+            repo.fetch(self.last_commit()["oid"], pr_branch_name)
             repo._run_git("merge", "--squash", pr_branch_name)
             repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
+
+            # Did the PR change since we started the merge?
+            pulled_sha = repo.show_ref(pr_branch_name)
+            latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
+            if pulled_sha != latest_pr_status.last_commit()["oid"]:
+                raise RuntimeError(
+                    "PR has been updated since CI checks last passed. Please rerun the merge command."
+                )
             return []
         else:
             return self.merge_ghstack_into(
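This change guards against a race where a PR gains new commits between CI passing and the merge landing: the merge is performed on the pinned commit SHA, and the PR head is re-read afterwards. Below is a minimal, standalone sketch of that final comparison; the function is a stand-in for illustration, not the actual trymerge.py helper.

```python
# Standalone sketch of the stale-PR guard: compare the SHA that was actually
# merged against the PR's current head and abort if they differ.
def check_pr_unchanged(merged_sha: str, current_head_sha: str) -> None:
    if merged_sha != current_head_sha:
        raise RuntimeError(
            "PR has been updated since CI checks last passed. "
            "Please rerun the merge command."
        )


check_pr_unchanged("b12a04e", "b12a04e")      # unchanged head: passes silently
try:
    check_pr_unchanged("b12a04e", "20de9aa")  # head moved: merge is aborted
except RuntimeError as err:
    print(err)
```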

.github/workflows/build-triton-wheel.yml

Lines changed: 2 additions & 0 deletions
@@ -62,6 +62,7 @@ jobs:
       DOCKER_IMAGE: ${{ matrix.device == 'rocm' && format('pytorch/manylinux2_28-builder:rocm{0}', matrix.rocm_version) || matrix.docker-image }}
       PY_VERS: ${{ matrix.py_vers }}
       BUILD_DEVICE: ${{ matrix.device }}
+      # TODO: We need to get rid of this when we remove builds that rely on the base manylinux-builder image
       PLATFORM: ${{ contains(matrix.docker-image, '2_28') && 'manylinux_2_28_x86_64' || 'manylinux2014_x86_64' }}
     steps:
       - name: Setup SSH (Click me for login details)
@@ -168,6 +169,7 @@ jobs:
       contents: read
     container:
       image: continuumio/miniconda3:4.12.0
+    environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
     steps:
       - uses: actions/checkout@v3

.lintrunner.toml

Lines changed: 2 additions & 2 deletions
@@ -1476,7 +1476,7 @@ init_command = [
     'black==23.12.1',
     'usort==1.0.8.post1',
     'isort==5.13.2',
-    'ruff==0.8.4', # sync with RUFF
+    'ruff==0.9.8', # sync with RUFF
 ]
 is_formatter = true
 
@@ -1561,7 +1561,7 @@ init_command = [
     'python3',
     'tools/linter/adapters/pip_init.py',
     '--dry-run={{DRYRUN}}',
-    'ruff==0.8.4', # sync with PYFMT
+    'ruff==0.9.8', # sync with PYFMT
 ]
 is_formatter = true

README.md

Lines changed: 13 additions & 1 deletion
@@ -305,7 +305,7 @@ If you want to build legacy python code, please refer to [Building on legacy cod
 
 **CPU-only builds**
 
-In this mode PyTorch computations will run on your CPU, not your GPU
+In this mode PyTorch computations will run on your CPU, not your GPU.
 
 ```cmd
 python setup.py develop
@@ -353,6 +353,18 @@ python setup.py develop
 
 ```
 
+**Intel GPU builds**
+
+In this mode PyTorch with Intel GPU support will be built.
+
+Please make sure [the common prerequisites](#prerequisites) as well as [the prerequisites for Intel GPU](#intel-gpu-support) are properly installed and the environment variables are configured prior to starting the build. For build tool support, `Visual Studio 2022` is required.
+
+Then PyTorch can be built with the command:
+
+```cmd
+python setup.py develop
+```
+
 ##### Adjust Build Options (Optional)
 
 You can adjust the configuration of cmake variables optionally (without building first), by doing
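For readers following the new Intel GPU instructions in this hunk, a quick post-build sanity check is sketched below. It is not part of the diff, and it assumes a supported Intel GPU plus its driver/oneAPI runtime are installed and the environment variables mentioned above are configured.

```python
# Quick sanity check for an Intel GPU ("XPU") build of PyTorch.
import torch

if torch.xpu.is_available():
    x = torch.ones(2, 2, device="xpu")
    print(x.device, (x + x).sum().item())  # expected: xpu:0 8.0
else:
    print("XPU backend not available; check drivers and environment variables.")
```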

aten/src/ATen/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -277,7 +277,6 @@ if(USE_XPU)
   list(APPEND ATen_XPU_DEPENDENCY_LIBS ${OCL_LIBRARY})
   list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/mkldnn/xpu)
   list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/mkldnn/xpu/detail)
-  list(APPEND ATen_XPU_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/ideep/mkl-dnn/include)
   list(APPEND ATen_XPU_INCLUDE ${XPU_MKLDNN_INCLUDE})
 
   list(APPEND ATen_XPU_INCLUDE ${SYCL_INCLUDE_DIR})

0 commit comments