Update on "[dtensor] PART 1: move DeviceMesh and placement to core di… · pytorch/pytorch@5f04dcb · GitHub
[go: up one dir, main page]

Skip to content

Commit 5f04dcb

Browse files
committed
Update on "[dtensor] PART 1: move DeviceMesh and placement to core distributed"
This PR creates the `torch.distributed._tensor` package and moves DeviceMesh and PlacementTypes into it. Part of #88838. [ghstack-poisoned]
2 parents 139a07e + d576b98 commit 5f04dcb

File tree

371 files changed

+9014
-5930
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

371 files changed

+9014
-5930
lines changed

.github/ci_commit_pins/vision.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
ffd5a567eb90abf6b5555063da434d3c130d540f
1+
b1f6c9e271368cd84837522af39e68dd4b5768a7

.github/ci_commit_pins/xla.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
7889d2d3be16675943d84e4a4133ed7c245a623f
1+
dd9b67ff0d6ba4da6a46ca1b22e35c98dbed0d77

.github/scripts/filter_test_configs.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@
3434
"xla",
3535
}}
3636

37+
# Supported modes when running periodically
38+
SUPPORTED_PERIODICAL_MODES = {
39+
"mem_leak_check",
40+
"rerun_disabled_tests",
41+
}
42+
43+
3744
def parse_args() -> Any:
3845
from argparse import ArgumentParser
3946
parser = ArgumentParser("Filter all test configurations and keep only requested ones")
@@ -109,6 +116,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
109116
return filtered_test_matrix
110117

111118

119+
def set_periodic_modes(test_matrix: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
120+
"""
121+
Apply all periodic modes when running under a schedule
122+
"""
123+
scheduled_test_matrix: Dict[str, List[Any]] = {
124+
"include": [],
125+
}
126+
127+
for config in test_matrix.get("include", []):
128+
for mode in SUPPORTED_PERIODICAL_MODES:
129+
cfg = config.copy()
130+
cfg[mode] = mode
131+
scheduled_test_matrix["include"].append(cfg)
132+
133+
return scheduled_test_matrix
134+
135+
112136
def set_output(name: str, val: Any) -> None:
113137
if os.getenv("GITHUB_OUTPUT"):
114138
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
@@ -163,8 +187,7 @@ def main() -> None:
163187
filtered_test_matrix = test_matrix
164188

165189
if args.event_name == "schedule":
166-
for config in filtered_test_matrix.get("include", []):
167-
config["mem_leak_check"] = "mem_leak_check"
190+
filtered_test_matrix = set_periodic_modes(filtered_test_matrix)
168191

169192
# Set the filtered test matrix as the output
170193
set_output("test-matrix", json.dumps(filtered_test_matrix))

.github/scripts/install_nvidia_utils_linux.sh

Lines changed: 0 additions & 131 deletions
This file was deleted.

.github/scripts/test_filter_test_configs.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,14 @@
44
import yaml
55
import json
66
from unittest import TestCase, main, mock
7-
from filter_test_configs import get_labels, filter, PREFIX, VALID_TEST_CONFIG_LABELS
7+
from filter_test_configs import (
8+
get_labels,
9+
filter,
10+
set_periodic_modes,
11+
PREFIX,
12+
VALID_TEST_CONFIG_LABELS,
13+
SUPPORTED_PERIODICAL_MODES
14+
)
815
import requests
916
from requests.models import Response
1017
from typing import Any, Dict
@@ -86,5 +93,26 @@ def test_filter_with_valid_label(self) -> None:
8693
self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
8794

8895

96+
def test_set_periodic_modes(self) -> None:
97+
testcases = [
98+
{
99+
"test_matrix": "{include: []}",
100+
"description": "Empty test matrix",
101+
},
102+
{
103+
"test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}',
104+
"descripion": "Replicate each periodic mode in a different config",
105+
},
106+
]
107+
108+
for case in testcases:
109+
test_matrix = yaml.safe_load(case["test_matrix"])
110+
scheduled_test_matrix = set_periodic_modes(test_matrix)
111+
self.assertEqual(
112+
len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES),
113+
len(scheduled_test_matrix["include"])
114+
)
115+
116+
89117
if __name__ == '__main__':
90118
main()

.github/workflows/_binary-test-linux.yml

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -171,17 +171,8 @@ jobs:
171171
path: "${{ runner.temp }}/artifacts/"
172172

173173
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
174-
uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
174+
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
175175
if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' }}
176-
with:
177-
timeout_minutes: 10
178-
max_attempts: 3
179-
command: |
180-
set -ex
181-
pushd pytorch
182-
bash .github/scripts/install_nvidia_utils_linux.sh
183-
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
184-
popd
185176

186177
- name: Pull Docker image
187178
uses: pytorch/test-infra/.github/actions/pull-docker-image@main

.github/workflows/_linux-test.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,8 @@ jobs:
7474
docker-image: ${{ inputs.docker-image }}
7575

7676
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
77-
uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
77+
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
7878
if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
79-
with:
80-
timeout_minutes: 10
81-
max_attempts: 3
82-
command: |
83-
set -ex
84-
bash .github/scripts/install_nvidia_utils_linux.sh
85-
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
8679

8780
- name: Start monitoring script
8881
id: monitor-script
@@ -122,7 +115,8 @@ jobs:
122115
DOCKER_IMAGE: ${{ inputs.docker-image }}
123116
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
124117
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
125-
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
118+
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
119+
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
126120
timeout-minutes: 240
127121
run: |
128122
set -x
@@ -177,6 +171,7 @@ jobs:
177171
-e XLA_CUDA \
178172
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
179173
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
174+
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
180175
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
181176
--ulimit stack=10485760:83886080 \
182177
--security-opt seccomp=unconfined \

.github/workflows/_mac-build.yml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,17 @@ jobs:
109109
brew link --force libomp
110110
111111
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
112+
uses: nick-fields/retry@v2.8.2
112113
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
113-
run: |
114-
sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
115-
sudo chmod +x /usr/local/bin/sccache
116-
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
117-
echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
114+
with:
115+
timeout_minutes: 5
116+
max_attempts: 3
117+
retry_wait_seconds: 90
118+
command: |
119+
sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
120+
sudo chmod +x /usr/local/bin/sccache
121+
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
122+
echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
118123
119124
- name: Get workflow job id
120125
id: get-job-id

.github/workflows/_mac-test-mps.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ jobs:
6666
${CONDA_RUN} python3 -mpip install --no-index --no-deps dist/*.whl
6767
6868
- name: Run MPS tests
69+
id: test
6970
env:
7071
ENV_NAME: conda-test-env-${{ github.run_id }}
7172
shell: arch -arch arm64 bash {0}
@@ -74,5 +75,18 @@ jobs:
7475
set -ex
7576
# TODO(https://github.com/pytorch/pytorch/issues/79293)
7677
77-
${CONDA_RUN} --cwd test python3 test_mps.py -v
78-
${CONDA_RUN} --cwd test python3 test_metal.py -v
78+
${CONDA_RUN} python3 test/run_test.py --mps --verbose
79+
80+
- name: Get workflow job id
81+
id: get-job-id
82+
uses: ./.github/actions/get-workflow-job-id
83+
if: always()
84+
with:
85+
github-token: ${{ secrets.GITHUB_TOKEN }}
86+
87+
- name: Upload test artifacts
88+
uses: ./.github/actions/upload-test-artifacts
89+
if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
90+
with:
91+
use-gha: true
92+
file-suffix: ${{ github.job }}-mps-1-1-macos-m1-12_${{ steps.get-job-id.outputs.job-id }}

.github/workflows/_mac-test.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ jobs:
129129
- name: Test
130130
id: test
131131
env:
132-
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
132+
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
133+
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
133134
run: |
134135
COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
135136

0 commit comments

Comments
 (0)
0