Update on "[PGNCCL] Launch kernel on current stream & remove `record_… · pytorch/pytorch@940cca6 · GitHub
[go: up one dir, main page]

Skip to content

Commit 940cca6

Browse files
committed
Update on "[PGNCCL] Launch kernel on current stream & remove record_stream entirely"
This PR has multiple changes to `ProcessGroupNCCL` (which unfortunately are related): 1. When async_op=False, we directly launch the collective on "current" stream, instead of a trampoline stream and join back. - Resolves #147729 - Resolves #146881 - Also saves two event syncs (which have overhead in case of HIP) and one pybind when we call `work.wait()` in distributed_c10d.py on behalf of user. 2. Entirely remove `record_stream` and use CPU-side stashing for managing tensor lifetime against recycling. - Resolves #147168 3. Remove tensor life management when async_op=False; only use it when async_op=True. 4. To guard against user not calling `work.wait()`, we ask watchdog to unstash tensors after detecting completion of collectives, to prevent us from holding reference to tensors forever. This is a safety net, rather than a service guarantee, see discussion [here](#147168 (comment)). 5. Profile in async_op=False mode would look different -- collective kernels would show up on the same line as compute kernels. Joint work with cenzhaometa who wants to remove the event sync overhead. Cc: ngimel awgu Aidyn-A skyw wconstab leonardo0lyj cc H-Huang awgu wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o [ghstack-poisoned]
2 parents ff1f4c4 + a90d7e1 commit 940cca6

File tree

162 files changed

+3975
-1330
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

162 files changed

+3975
-1330
lines changed

.ci/docker/build.sh

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,11 @@ fi
9999
# configuration, so we hardcode everything here rather than do it
100100
# from scratch
101101
case "$image" in
102-
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9)
102+
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11)
103103
CUDA_VERSION=12.6.3
104104
CUDNN_VERSION=9
105105
ANACONDA_PYTHON_VERSION=3.10
106-
GCC_VERSION=9
106+
GCC_VERSION=11
107107
PROTOBUF=yes
108108
DB=yes
109109
VISION=yes
@@ -158,6 +158,65 @@ case "$image" in
158158
TRITON=yes
159159
INDUCTOR_BENCHMARKS=yes
160160
;;
161+
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9)
162+
CUDA_VERSION=12.6.3
163+
CUDNN_VERSION=9
164+
ANACONDA_PYTHON_VERSION=3.10
165+
GCC_VERSION=9
166+
PROTOBUF=yes
167+
DB=yes
168+
VISION=yes
169+
KATEX=yes
170+
UCX_COMMIT=${_UCX_COMMIT}
171+
UCC_COMMIT=${_UCC_COMMIT}
172+
CONDA_CMAKE=yes
173+
TRITON=yes
174+
;;
175+
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
176+
CUDA_VERSION=12.6.3
177+
CUDNN_VERSION=9
178+
ANACONDA_PYTHON_VERSION=3.10
179+
GCC_VERSION=9
180+
PROTOBUF=yes
181+
DB=yes
182+
VISION=yes
183+
KATEX=yes
184+
UCX_COMMIT=${_UCX_COMMIT}
185+
UCC_COMMIT=${_UCC_COMMIT}
186+
CONDA_CMAKE=yes
187+
TRITON=yes
188+
INDUCTOR_BENCHMARKS=yes
189+
;;
190+
pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
191+
CUDA_VERSION=12.6.3
192+
CUDNN_VERSION=9
193+
ANACONDA_PYTHON_VERSION=3.12
194+
GCC_VERSION=9
195+
PROTOBUF=yes
196+
DB=yes
197+
VISION=yes
198+
KATEX=yes
199+
UCX_COMMIT=${_UCX_COMMIT}
200+
UCC_COMMIT=${_UCC_COMMIT}
201+
CONDA_CMAKE=yes
202+
TRITON=yes
203+
INDUCTOR_BENCHMARKS=yes
204+
;;
205+
pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
206+
CUDA_VERSION=12.6.3
207+
CUDNN_VERSION=9
208+
ANACONDA_PYTHON_VERSION=3.13
209+
GCC_VERSION=9
210+
PROTOBUF=yes
211+
DB=yes
212+
VISION=yes
213+
KATEX=yes
214+
UCX_COMMIT=${_UCX_COMMIT}
215+
UCC_COMMIT=${_UCC_COMMIT}
216+
CONDA_CMAKE=yes
217+
TRITON=yes
218+
INDUCTOR_BENCHMARKS=yes
219+
;;
161220
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
162221
CUDA_VERSION=11.8.0
163222
CUDNN_VERSION=9
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
e98b6fcb8df5b44eb0d0addb6767c573d37ba024
1+
cfb7d5314748542fed42d0837bef1a6d177da2d6

.ci/docker/manywheel/build_scripts/build_utils.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Script used only in CD pipeline
44

55
OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
6-
CURL_DOWNLOAD_URL=https://curl.askapache.com/download
6+
CURL_DOWNLOAD_URL=https://curl.se/download
77

88
AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf
99

.ci/pytorch/windows/arm64/bootstrap_buildtools.bat

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,32 +17,24 @@ curl -L -o "%INSTALLER_FILE%" %DOWNLOAD_URL%
1717
:: Install the Visual Studio Build Tools with C++ components
1818
echo Installing Visual Studio Build Tools with C++ components...
1919
echo Installing MSVC %MSVC_VERSION%
20-
if "%MSVC_VERSION%" == "latest" (
21-
"%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
22-
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
23-
--add Microsoft.VisualStudio.Component.VC.ASAN ^
24-
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
25-
--add Microsoft.VisualStudio.Component.VC.Tools.ARM64 ^
26-
--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64
27-
) else if "%MSVC_VERSION%" == "14.40" (
28-
"%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
29-
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
30-
--add Microsoft.VisualStudio.Component.VC.ASAN ^
31-
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
32-
--add Microsoft.VisualStudio.Component.VC.14.40.17.10.ARM64 ^
33-
--add Microsoft.VisualStudio.Component.VC.14.40.17.10.x86.x64
34-
) else if "%MSVC_VERSION%" == "14.36" (
35-
"%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
36-
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
37-
--add Microsoft.VisualStudio.Component.VC.ASAN ^
38-
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
39-
--add Microsoft.VisualStudio.Component.VC.14.36.17.6.ARM64 ^
40-
--add Microsoft.VisualStudio.Component.VC.14.36.17.6.x86.x64
41-
)
20+
"%INSTALLER_FILE%" --norestart --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
21+
--add Microsoft.VisualStudio.Workload.VCTools ^
22+
--add Microsoft.VisualStudio.Component.Windows10SDK ^
23+
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
24+
--add Microsoft.VisualStudio.Component.VC.ASAN ^
25+
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
26+
--add Microsoft.VisualStudio.Component.VC.CoreBuildTools ^
27+
--add Microsoft.VisualStudio.Component.VC.CoreIde ^
28+
--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest ^
29+
--add Microsoft.VisualStudio.Component.VC.Tools.ARM64EC ^
30+
--add Microsoft.VisualStudio.Component.VC.Tools.ARM64 ^
31+
--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64
32+
33+
echo exitcode = %errorlevel%
4234

4335
:: Check if installation was successful
4436
if %errorlevel% neq 0 (
45-
echo "Failed to install Visual Studio Build Tools with C++ components. (exitcode = %errorlevel%)"
37+
echo Failed to install Visual Studio Build Tools with C++ components.
4638
exit /b 1
4739
)
4840

.ci/pytorch/windows/arm64/bootstrap_python.bat

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,25 @@ echo Dependency Python installation started.
66
if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR%
77
if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%
88

9-
if "%PYTHON_VERSION%"=="Python312" (
10-
echo Python version is set to Python312
11-
set DOWNLOAD_URL="https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe"
12-
) else if "%PYTHON_VERSION%"=="Python311" (
13-
echo Python version is set to Python311
14-
set DOWNLOAD_URL="https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe"
9+
if "%DESIRED_PYTHON%" == "3.13" (
10+
echo Python version is set to 3.13
11+
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.13.2/python-3.13.2-arm64.exe
12+
) else if "%DESIRED_PYTHON%" == "3.12" (
13+
echo Python version is set to 3.12
14+
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe
15+
) else if "%DESIRED_PYTHON%" == "3.11" (
16+
echo Python version is set to 3.11
17+
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe
1518
) else (
16-
echo PYTHON_VERSION not defined, Python version is set to Python312
17-
set DOWNLOAD_URL="https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe"
19+
echo DESIRED_PYTHON not defined, Python version is set to 3.12
20+
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe
1821
)
1922

2023
set INSTALLER_FILE=%DOWNLOADS_DIR%\python-installer.exe
2124

2225
:: Download installer
2326
echo Downloading Python...
24-
curl -L -o "%INSTALLER_FILE%" %DOWNLOAD_URL%
27+
curl -L -o "%INSTALLER_FILE%" "%DOWNLOAD_URL%"
2528

2629
:: Install Python
2730
echo Installing Python...

.ci/pytorch/windows/arm64/bootstrap_tests.bat

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ where python
1414
:: install dependencies
1515
python -m pip install --upgrade pip
1616
pip install -r requirements.txt
17-
pip install pytest numpy
17+
pip install pytest numpy protobuf expecttest hypothesis
1818

1919
:: find file name for pytorch wheel
2020
for /f "delims=" %%f in ('dir /b "%PYTORCH_FINAL_PACKAGE_DIR%" ^| findstr "torch-"') do set "TORCH_WHEEL_FILENAME=%PYTORCH_FINAL_PACKAGE_DIR%\%%f"
Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,14 @@
11
@echo off
22
setlocal
33

4-
set "ORIG_PATH=%PATH%"
5-
64
if "%PACKAGE_TYPE%" == "wheel" goto wheel
75
if "%PACKAGE_TYPE%" == "libtorch" goto libtorch
86

97
echo "unknown package type"
108
exit /b 1
119

1210
:wheel
13-
echo "install wheel package"
14-
15-
echo Running pip install...
16-
pip install -q --pre numpy protobuf
17-
echo Error level after pip install: %ERRORLEVEL%
18-
if errorlevel 1 exit /b 1
19-
20-
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do pip install "%%i"
21-
if errorlevel 1 exit /b 1
22-
23-
goto smoke_test
24-
25-
:smoke_test
26-
python -c "import torch"
27-
if ERRORLEVEL 1 exit /b 1
11+
call %PYTORCH_ROOT%\.ci\pytorch\windows\arm64\bootstrap_tests.bat
2812

2913
echo Running python rnn_smoke.py...
3014
python %PYTORCH_ROOT%\.ci\pytorch\test_example_code\rnn_smoke_win_arm64.py
@@ -39,10 +23,12 @@ goto end
3923
:libtorch
4024
echo "install and test libtorch"
4125

42-
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *-latest.zip') do tar -xf "%%i" -C tmp
26+
if not exist tmp mkdir tmp
27+
28+
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *-latest.zip') do C:\Windows\System32\tar.exe -xf "%%i" -C tmp
4329
if ERRORLEVEL 1 exit /b 1
4430

45-
pushd tmp\libtorch
31+
pushd tmp
4632

4733
set VC_VERSION_LOWER=14
4834
set VC_VERSION_UPPER=36
@@ -60,6 +46,4 @@ if ERRORLEVEL 1 exit /b 1
6046
.\simple-torch-test.exe
6147
if ERRORLEVEL 1 exit /b 1
6248

63-
:end
64-
set "PATH=%ORIG_PATH%"
65-
popd
49+
:end
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
2+
set -eux -o pipefail
3+
4+
source "${BINARY_ENV_FILE:-/c/w/env}"
5+
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
6+
7+
export USE_SCCACHE=1
8+
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
9+
10+
echo "Free space on filesystem before build:"
11+
df -h
12+
13+
export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
14+
15+
if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
16+
pytorch/.ci/pytorch/windows/arm64/build_libtorch.bat
17+
elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
18+
pytorch/.ci/pytorch/windows/arm64/build_pytorch.bat
19+
fi
20+
21+
echo "Free space on filesystem after build:"
22+
df -h
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
2+
set -eux -o pipefail
3+
4+
source "${BINARY_ENV_FILE:-/c/w/env}"
5+
6+
pytorch/.ci/pytorch/windows/arm64/smoke_test.bat

.github/actionlint.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ self-hosted-runner:
4343
- windows.8xlarge.nvidia.gpu
4444
- windows.8xlarge.nvidia.gpu.nonephemeral
4545
- windows.g5.4xlarge.nvidia.gpu
46+
# Windows ARM64 runners
47+
- windows-11-arm64
4648
# Organization-wide AMD hosted runners
4749
- linux.rocm.gpu
4850
- linux.rocm.gpu.2

.github/scripts/generate_binary_build_matrix.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -279,11 +279,15 @@ def generate_libtorch_matrix(
279279
gpu_arch_type, gpu_arch_version
280280
),
281281
"libtorch_variant": libtorch_variant,
282-
"libtorch_config": abi_version if os == "windows" else "",
283-
"devtoolset": abi_version if os != "windows" else "",
282+
"libtorch_config": abi_version
283+
if os in ("windows", "windows-arm64")
284+
else "",
285+
"devtoolset": abi_version
286+
if os not in ("windows", "windows-arm64")
287+
else "",
284288
"container_image": (
285289
LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
286-
if os != "windows"
290+
if os not in ("windows", "windows-arm64")
287291
else ""
288292
),
289293
"package_type": "libtorch",

.github/scripts/generate_ci_workflows.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
9696
class OperatingSystem:
9797
LINUX = "linux"
9898
WINDOWS = "windows"
99+
WINDOWS_ARM64 = "windows-arm64"
99100
MACOS = "macos"
100101
MACOS_ARM64 = "macos-arm64"
101102
LINUX_AARCH64 = "linux-aarch64"
@@ -261,6 +262,52 @@ class OperatingSystem:
261262
),
262263
]
263264

265+
WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
266+
BinaryBuildWorkflow(
267+
os=OperatingSystem.WINDOWS_ARM64,
268+
package_type="wheel",
269+
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
270+
OperatingSystem.WINDOWS_ARM64,
271+
arches=["cpu"],
272+
python_versions=["3.12"],
273+
),
274+
ciflow_config=CIFlowConfig(
275+
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
276+
isolated_workflow=True,
277+
),
278+
),
279+
BinaryBuildWorkflow(
280+
os=OperatingSystem.WINDOWS_ARM64,
281+
package_type="libtorch",
282+
abi_version=generate_binary_build_matrix.RELEASE,
283+
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
284+
OperatingSystem.WINDOWS_ARM64,
285+
generate_binary_build_matrix.RELEASE,
286+
arches=["cpu"],
287+
libtorch_variants=["shared-with-deps"],
288+
),
289+
ciflow_config=CIFlowConfig(
290+
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
291+
isolated_workflow=True,
292+
),
293+
),
294+
BinaryBuildWorkflow(
295+
os=OperatingSystem.WINDOWS_ARM64,
296+
package_type="libtorch",
297+
abi_version=generate_binary_build_matrix.DEBUG,
298+
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
299+
OperatingSystem.WINDOWS_ARM64,
300+
generate_binary_build_matrix.DEBUG,
301+
arches=["cpu"],
302+
libtorch_variants=["shared-with-deps"],
303+
),
304+
ciflow_config=CIFlowConfig(
305+
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
306+
isolated_workflow=True,
307+
),
308+
),
309+
]
310+
264311
MACOS_BINARY_BUILD_WORKFLOWS = [
265312
BinaryBuildWorkflow(
266313
os=OperatingSystem.MACOS_ARM64,
@@ -355,6 +402,10 @@ def main() -> None:
355402
jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
356403
WINDOWS_BINARY_SMOKE_WORKFLOWS,
357404
),
405+
(
406+
jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"),
407+
WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS,
408+
),
358409
(
359410
jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
360411
MACOS_BINARY_BUILD_WORKFLOWS,
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
@echo on
2+
3+
set PYTHON_PREFIX=%PY_VERS:.=%
4+
set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py%
5+
call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
6+
:: Create a new conda environment
7+
if "%PY_VERS%" == "3.13t" (
8+
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13
9+
) else (
10+
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
11+
)
12+
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake setuptools==72.1.0 ninja
13+
14+
dir "%VC_INSTALL_PATH%"
15+
16+
call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64
17+
call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE%

0 commit comments

Comments
 (0)
0