8000 Update · pytorch/pytorch@41c7207 · GitHub
[go: up one dir, main page]

Skip to content

Commit 41c7207

Browse files
committed
Update
[ghstack-poisoned]
2 parents 7fbe418 + 266186b commit 41c7207

File tree

536 files changed

+22909
-6277
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

536 files changed

+22909
-6277
lines changed

.ci/docker/build.sh

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ CMAKE_VERSION=3.18.5
8686

8787
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
8888
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
89+
if [[ "$image" == *rocm* ]]; then
90+
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
91+
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
92+
fi
8993

9094
# It's annoying to rename jobs every time you want to rewrite a
9195
# configuration, so we hardcode everything here rather than do it
@@ -206,25 +210,33 @@ case "$image" in
206210
;;
207211
pytorch-linux-focal-rocm-n-1-py3)
208212
ANACONDA_PYTHON_VERSION=3.10
209-
GCC_VERSION=9
213+
GCC_VERSION=11
210214
PROTOBUF=yes
211215
DB=yes
212216
VISION=yes
213217
ROCM_VERSION=6.2.4
214218
NINJA_VERSION=1.9.0
215219
CONDA_CMAKE=yes
216220
TRITON=yes
221+
KATEX=yes
222+
UCX_COMMIT=${_UCX_COMMIT}
223+
UCC_COMMIT=${_UCC_COMMIT}
224+
INDUCTOR_BENCHMARKS=yes
217225
;;
218226
pytorch-linux-focal-rocm-n-py3)
219227
ANACONDA_PYTHON_VERSION=3.10
220-
GCC_VERSION=9
228+
GCC_VERSION=11
221229
PROTOBUF=yes
222230
DB=yes
223231
VISION=yes
224232
ROCM_VERSION=6.3
225233
NINJA_VERSION=1.9.0
226234
CONDA_CMAKE=yes
227235
TRITON=yes
236+
KATEX=yes
237+
UCX_COMMIT=${_UCX_COMMIT}
238+
UCC_COMMIT=${_UCC_COMMIT}
239+
INDUCTOR_BENCHMARKS=yes
228240
;;
229241
pytorch-linux-jammy-xpu-2024.0-py3)
230242
ANACONDA_PYTHON_VERSION=3.9
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
41e7ffa8b7ff09206aa5b9b5c1bbd82b9e0ff277
1+
f936c9992f9730e4996fd6ac16ed6cdc723190af
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
v2.21.5-1
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
v2.25.1-1

.ci/docker/common/install_base.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,12 @@ install_ubuntu() {
3232

3333
# HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes
3434
# See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729
35+
# TODO: Eliminate this hack, we should not rely on apt-get installation
36+
# See https://github.com/pytorch/pytorch/issues/144768
3537
if [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "11.8"* ]]; then
3638
maybe_libnccl_dev="libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8 --allow-downgrades --allow-change-held-packages"
39+
elif [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "12.4"* ]]; then
40+
maybe_libnccl_dev="libnccl2=2.25.1-1+cuda12.4 libnccl-dev=2.25.1-1+cuda12.4 --allow-downgrades --allow-change-held-packages"
3741
else
3842
maybe_libnccl_dev=""
3943
fi

.ci/docker/common/install_cuda.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
set -ex
44

5-
NCCL_VERSION=v2.21.5-1
5+
NCCL_VERSION=v2.25.1-1
66
CUDNN_VERSION=9.5.1.17
77

88
function install_cusparselt_040 {
@@ -40,6 +40,7 @@ function install_cusparselt_063 {
4040

4141
function install_118 {
4242
CUDNN_VERSION=9.1.0.70
43+
NCCL_VERSION=v2.21.5-1
4344
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
4445
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
4546
# install CUDA 11.8.0 in the same container
@@ -239,7 +240,7 @@ function prune_126 {
239240
}
240241

241242
function install_128 {
242-
CUDNN_VERSION=9.7.0.66
243+
CUDNN_VERSION=9.7.1.26
243244
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
244245
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
245246
# install CUDA 12.8.0 in the same container
@@ -288,4 +289,4 @@ do
288289
;;
289290
esac
290291
shift
291-
done
292+
done

.ci/docker/common/install_cuda_aarch64.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ function prune_126 {
161161
}
162162

163163
function install_128 {
164-
CUDNN_VERSION=9.7.0.66
164+
CUDNN_VERSION=9.7.1.26
165165
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
166166
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
167167
# install CUDA 12.8.0 in the same container

.ci/docker/common/install_cudnn.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
44
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
55
mkdir tmp_cudnn
66
pushd tmp_cudnn
7-
if [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
7+
if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
8+
CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
9+
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
810
CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
911
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
1012
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"

.ci/docker/common/install_onnx.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ pip_install ml_dtypes
3939
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
4040
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
4141
IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py"
42-
as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}"
42+
as_jenkins echo 'import transformers; transformers.GPTJForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gptj");' > "${IMPORT_SCRIPT_FILENAME}"
4343

4444
# Need a PyTorch version for transformers to work
4545
pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu

.ci/docker/common/install_ucc.sh

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ else
88
with_cuda=no
99
fi
1010

11+
if [[ -d "/opt/rocm" ]]; then
12+
with_rocm=/opt/rocm
13+
else
14+
with_rocm=no
15+
fi
16+
1117
function install_ucx() {
1218
set -ex
1319
git clone --recursive https://github.com/openucx/ucx.git
@@ -19,6 +25,7 @@ function install_ucx() {
1925
./configure --prefix=$UCX_HOME \
2026
--enable-mt \
2127
--with-cuda=$with_cuda \
28+
--with-rocm=$with_rocm \
2229
--enable-profiling \
2330
--enable-stats
2431
time make -j
@@ -36,12 +43,29 @@ function install_ucc() {
3643
git submodule update --init --recursive
3744

3845
./autogen.sh
46+
3947
# We only run distributed tests on Tesla M60 and A10G
4048
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
49+
50+
if [[ -n "$ROCM_VERSION" ]]; then
51+
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
52+
amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
53+
else
54+
amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
55+
fi
56+
for arch in $amdgpu_targets; do
57+
HIP_OFFLOAD="$HIP_OFFLOAD --offload-arch=$arch"
58+
done
59+
else
60+
HIP_OFFLOAD="all-arch-no-native"
61+
fi
62+
4163
./configure --prefix=$UCC_HOME \
4264
--with-ucx=$UCX_HOME \
4365
--with-cuda=$with_cuda \
44-
--with-nvcc-gencode="${NVCC_GENCODE}"
66+
--with-nvcc-gencode="${NVCC_GENCODE}" \
67+
--with-rocm=$with_rocm \
68+
--with-rocm-arch="${HIP_OFFLOAD}"
4569
time make -j
4670
sudo make install
4771

0 commit comments

Comments
 (0)
0