10000 Update · pytorch/pytorch@e870dbc · GitHub
[go: up one dir, main page]

Skip to content

Commit e870dbc

Browse files
committed
Update
[ghstack-poisoned]
2 parents 12ba31f + 1b5cea2 commit e870dbc

File tree

908 files changed

+22640
-15462
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

908 files changed

+22640
-15462
lines changed

.ci/aarch64_linux/aarch64_wheel_ci_build.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def parse_arguments():
204204
else:
205205
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
206206
elif branch.startswith(("v1.", "v2.")):
207-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
207+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
208208

209209
if enable_mkldnn:
210210
build_ArmComputeLibrary()

.ci/aarch64_linux/build_aarch64_wheel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ def start_build(
761761
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
762762
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
763763
if branch.startswith(("v1.", "v2.")):
764-
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
764+
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
765765
if host.using_docker():
766766
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
767767
if enable_mkldnn:

.ci/docker/common/install_conda.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
6666

6767
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
6868
if [[ $(uname -m) == "aarch64" ]]; then
69-
conda_install "openblas==0.3.28=*openmp*"
69+
conda_install "openblas==0.3.29=*openmp*"
7070
else
7171
conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
7272
fi

.ci/docker/common/install_openblas.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
set -ex
55

66
cd /
7-
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
7+
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules
88

99

1010
OPENBLAS_BUILD_FLAGS="

.ci/docker/common/install_rocm_drm.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644
115115
if (!fp) {
116116
- fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
117117
- strerror(errno));
118-
+ fprintf(stderr, "amdgpu.ids: No such file or directory\n");
118+
+ //fprintf(stderr, "amdgpu.ids: No such file or directory\n");
119119
return;
120120
}
121121

.ci/docker/common/install_triton.sh

+3-3
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
6060
# Triton needs at least gcc-9 to build
6161
apt-get install -y g++-9
6262

63-
CXX=g++-9 pip_install -e .
63+
CXX=g++-9 pip_install .
6464
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
6565
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
6666
add-apt-repository -y ppa:ubuntu-toolchain-r/test
6767
apt-get install -y g++-9
6868

69-
CXX=g++-9 pip_install -e .
69+
CXX=g++-9 pip_install .
7070
else
71-
pip_install -e .
71+
pip_install .
7272
fi
7373

7474
if [ -n "${CONDA_CMAKE}" ]; then

.ci/docker/libtorch/build.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ case ${GPU_ARCH_TYPE} in
3939
BASE_TARGET=rocm
4040
DOCKER_TAG=rocm${GPU_ARCH_VERSION}
4141
GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
42-
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
42+
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102;gfx942"
4343
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
4444
;;
4545
*)

.ci/docker/manywheel/Dockerfile_2014

-153
This file was deleted.

.ci/docker/manywheel/build.sh

+3-2
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ case ${GPU_ARCH_TYPE} in
9797
DEVTOOLSET_VERSION="11"
9898
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
9999
fi
100-
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
100+
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"
101101
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
102102
;;
103103
xpu)
@@ -121,7 +121,8 @@ fi
121121
(
122122
set -x
123123

124-
if [ "$(uname -m)" != "s390x" ]; then
124+
# Only activate this if in CI
125+
if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
125126
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
126127
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
127128
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service

.ci/magma/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
1212
-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
1313
-e DESIRED_CUDA=${DESIRED_CUDA} \
1414
-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
15-
"pytorch/manylinux-builder:cuda${DESIRED_CUDA}-main" \
15+
"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
1616
magma/build_magma.sh
1717

1818
.PHONY: all

.ci/manywheel/build_cuda.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
5454
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
5555
case ${CUDA_VERSION} in
5656
12.8)
57-
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
57+
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
5858
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
5959
;;
6060
12.6)
61-
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
61+
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
6262
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
6363
;;
6464
12.4)

.ci/pytorch/build.sh

+6-3
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
173173
source /opt/intel/oneapi/compiler/latest/env/vars.sh
174174
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
175175
export USE_KINETO=0
176+
export TORCH_XPU_ARCH_LIST=pvc
176177
fi
177178

178179
# sccache will fail for CUDA builds if all cores are used for compiling
@@ -191,7 +192,7 @@ fi
191192

192193
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
193194
# memory to build and will OOM
194-
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
195+
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
195196
echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
196197
echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
197198
export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@@ -377,8 +378,10 @@ else
377378
# This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
378379
# is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
379380
# 16 CPUs
380-
MAX_JOBS=$(nproc --ignore=4)
381-
export MAX_JOBS
381+
if [ -z "$MAX_JOBS_OVERRIDE" ]; then
382+
MAX_JOBS=$(nproc --ignore=4)
383+
export MAX_JOBS
384+
fi
382385

383386
# NB: Install outside of source directory (at the same level as the root
384387
# pytorch folder) so that it doesn't get cleaned away prior to docker push.

.ci/pytorch/smoke_test/max_autotune.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def train(args, model, device, train_loader, optimizer, epoch):
4646
optimizer.step()
4747
if batch_idx % args.log_interval == 0:
4848
print(
49-
f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}" # noqa: B950
49+
f"Train Epoch: {epoch} "
50+
f"[{batch_idx * len(data)}/{len(train_loader.dataset)} "
51+
f"({100.0 * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"
5052
)
5153
if args.dry_run:
5254
break
@@ -71,7 +73,9 @@ def test(model, device, test_loader):
7173
test_loss /= len(test_loader.dataset)
7274

7375
print(
74-
f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n" # noqa: B950
76+
f"\nTest set: Average loss: {test_loss:.4f}, "
77+
f"Accuracy: {correct}/{len(test_loader.dataset)} "
78+
f"({100.0 * correct / len(test_loader.dataset):.0f}%)\n"
7579
)
7680

7781

.ci/pytorch/smoke_test/smoke_test.py

+4
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,10 @@ def test_cuda_gds_errors_captured() -> None:
166166
major_version = int(torch.version.cuda.split(".")[0])
167167
minor_version = int(torch.version.cuda.split(".")[1])
168168

169+
if target_os == "windows":
170+
print(f"{target_os} is not supported for GDS smoke test")
171+
return
172+
169173
if major_version < 12 or (major_version == 12 and minor_version < 6):
170174
print("CUDA version is not supported for GDS smoke test")
171175
return

0 commit comments

Comments (0)