From c82e73756f25939875003c3a78ff1fcece9b7b99 Mon Sep 17 00:00:00 2001
From: Ting Lu <tingl@nvidia.com>
Date: Wed, 5 Mar 2025 13:03:18 -0800
Subject: [PATCH 1/6] Move away from 12.4 to add 12.6 eager tests

---
 .ci/docker/build.sh                 |  8 +++---
 .github/workflows/docker-builds.yml |  2 +-
 .github/workflows/pull.yml          | 40 ++++++++++++++---------------
 .github/workflows/trunk.yml         | 30 +++++++++++-----------
 4 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 5f038ac4e8e5..d099fe08be78 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -99,8 +99,8 @@ fi
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.1
+  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.6.3
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
@@ -326,7 +326,7 @@ case "$image" in
     EXECUTORCH=yes
     ;;
   pytorch-linux-jammy-py3.12-halide)
-    CUDA_VERSION=12.4
+    CUDA_VERSION=12.6
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=11
     CONDA_CMAKE=yes
@@ -334,7 +334,7 @@ case "$image" in
     TRITON=yes
     ;;
   pytorch-linux-jammy-py3.12-triton-cpu)
-    CUDA_VERSION=12.4
+    CUDA_VERSION=12.6
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=11
     CONDA_CMAKE=yes
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index b6125c0bd2aa..a550be99f948 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -49,7 +49,7 @@ jobs:
       matrix:
         runner: [linux.12xlarge]
         docker-image-name: [
-          pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9,
+          pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9,
           pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks,
           pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks,
           pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks,
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 7888ac6123e9..b717f5e70d05 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -278,14 +278,14 @@ jobs:
       test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-build.outputs.test-matrix }}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-build:
-    name: linux-focal-cuda12.4-py3.10-gcc9
+  linux-focal-cuda12_6-py3_10-gcc9-build:
+    name: linux-focal-cuda12.6-py3.10-gcc9
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
@@ -296,17 +296,17 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-test:
-    name: linux-focal-cuda12.4-py3.10-gcc9
+  linux-focal-cuda12_6-py3_10-gcc9-test:
+    name: linux-focal-cuda12.6-py3.10-gcc9
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-focal-cuda12_4-py3_10-gcc9-build
+      - linux-focal-cuda12_6-py3_10-gcc9-build
       - target-determination
     with:
       timeout-minutes: 360
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-build.outputs.test-matrix }}
     secrets: inherit
 
   linux-jammy-py3-clang12-mobile-build:
@@ -430,14 +430,14 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-sm89-build:
-    name: linux-focal-cuda12.4-py3.10-gcc9-sm89
+  linux-focal-cuda12_6-py3_10-gcc9-sm89-build:
+    name: linux-focal-cuda12.6-py3.10-gcc9-sm89
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm89
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm89
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       cuda-arch-list: 8.9
       test-matrix: |
         { include: [
@@ -469,16 +469,16 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-sm89-test:
-    name: linux-focal-cuda12.4-py3.10-gcc9-sm89
+  linux-focal-cuda12_6-py3_10-gcc9-sm89-test:
+    name: linux-focal-cuda12.6-py3.10-gcc9-sm89
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-focal-cuda12_4-py3_10-gcc9-sm89-build
+      - linux-focal-cuda12_6-py3_10-gcc9-sm89-build
       - target-determination
     with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm89
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm89-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm89-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm89
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-sm89-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-sm89-build.outputs.test-matrix }}
     secrets: inherit
 
   linux-jammy-py3-clang12-executorch-build:
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 6ad5f9944cc7..0e5087d37464 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -45,13 +45,13 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}
 
-  libtorch-linux-focal-cuda12_4-py3_10-gcc9-debug-build:
-    name: libtorch-linux-focal-cuda12.4-py3.10-gcc9-debug
+  libtorch-linux-focal-cuda12_6-py3_10-gcc9-debug-build:
+    name: libtorch-linux-focal-cuda12.6-py3.10-gcc9-debug
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: libtorch-linux-focal-cuda12.4-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: libtorch-linux-focal-cuda12.6-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       build-generates-artifacts: false
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runner: "linux.4xlarge"
@@ -62,14 +62,14 @@ jobs:
     secrets: inherit
 
   # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
-  linux-focal-cuda12_4-py3_10-gcc9-no-ops-build:
-    name: linux-focal-cuda12.4-py3.10-gcc9-no-ops
+  linux-focal-cuda12_6-py3_10-gcc9-no-ops-build:
+    name: linux-focal-cuda12.6-py3.10-gcc9-no-ops
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-no-ops
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-no-ops
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 1 },
@@ -199,7 +199,7 @@ jobs:
       tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
+  linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build:
     if: false # See https://github.com/pytorch/pytorch/issues/138750
     name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
     uses: ./.github/workflows/_linux-build.yml
@@ -224,16 +224,16 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build-test:
-    name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build-test
+  linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build-test:
+    name: linux-focal-cuda12.6-py3.10-gcc9-experimental-split-build-test
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build
+      - linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build
       - target-determination
     with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-experimental-split-build
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
     secrets: inherit
 
   # NB: Keep this in sync with inductor-perf-test-nightly.yml

From 80d2b1b512c605455b954db42b322821e1731832 Mon Sep 17 00:00:00 2001
From: Ting Lu <tingl@nvidia.com>
Date: Wed, 5 Mar 2025 16:05:07 -0800
Subject: [PATCH 2/6] Add missing slow tests

---
 .github/workflows/periodic.yml | 70 +++++++++++++++++-----------------
 .github/workflows/pull.yml     | 32 ++++++++--------
 .github/workflows/slow.yml     | 20 +++++-----
 .github/workflows/trunk.yml    | 14 +++----
 4 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 1de8fcc28122..fd52a953b4b9 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -49,14 +49,14 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}
 
-  linux-focal-cuda12_4-py3_10-gcc9-build:
-    name: linux-focal-cuda12.4-py3.10-gcc9
+  linux-focal-cuda12_6-py3_10-gcc9-build:
+    name: linux-focal-cuda12.6-py3.10-gcc9
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       test-matrix: |
         { include: [
           { config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@@ -67,16 +67,16 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-test:
-    name: linux-focal-cuda12.4-py3.10-gcc9
+  linux-focal-cuda12_6-py3_10-gcc9-test:
+    name: linux-focal-cuda12.6-py3.10-gcc9
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-focal-cuda12_4-py3_10-gcc9-build
+      - linux-focal-cuda12_6-py3_10-gcc9-build
       - target-determination
     with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-build.outputs.test-matrix }}
     secrets: inherit
 
   linux-focal-cuda11_8-py3_9-gcc9-build:
@@ -170,16 +170,16 @@ jobs:
       test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
-    name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
+  linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build:
+    name: linux-focal-cuda12.6-py3.10-gcc9-experimental-split-build
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     if: false # See https://github.com/pytorch/pytorch/issues/138750
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       use_split_build: true
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       test-matrix: |
         { include: [
           { config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@@ -190,16 +190,16 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build-test:
-    name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
+  linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build-test:
+    name: linux-focal-cuda12.6-py3.10-gcc9-experimental-split-build
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build
+      - linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build
       - target-determination
     with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-experimental-split-build
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
     secrets: inherit
 
 
@@ -265,14 +265,14 @@ jobs:
       test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3-gcc9-slow-gradcheck-build:
-    name: linux-focal-cuda12.4-py3-gcc9-slow-gradcheck
+  linux-focal-cuda12_6-py3-gcc9-slow-gradcheck-build:
+    name: linux-focal-cuda12.6-py3-gcc9-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3-gcc9-slow-gradcheck
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3-gcc9-slow-gradcheck
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
@@ -287,28 +287,28 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3-gcc9-slow-gradcheck-test:
-    name: linux-focal-cuda12.4-py3-gcc9-slow-gradcheck
+  linux-focal-cuda12_6-py3-gcc9-slow-gradcheck-test:
+    name: linux-focal-cuda12.6-py3-gcc9-slow-gradcheck
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-focal-cuda12_4-py3-gcc9-slow-gradcheck-build
+      - linux-focal-cuda12_6-py3-gcc9-slow-gradcheck-build
       - target-determination
     with:
-      build-environment: linux-focal-cuda12.4-py3-gcc9-slow-gradcheck
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3-gcc9-slow-gradcheck-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3-gcc9-slow-gradcheck-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3-gcc9-slow-gradcheck
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3-gcc9-slow-gradcheck-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3-gcc9-slow-gradcheck-build.outputs.test-matrix }}
       timeout-minutes: 300
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-bazel-test:
-    name: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
+  linux-focal-cuda12_6-py3_10-gcc9-bazel-test:
+    name: linux-focal-cuda12.6-py3.10-gcc9-bazel-test
     uses: ./.github/workflows/_bazel-build-test.yml
     needs: get-label-type
     with:
       runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
-      cuda-version: "12.4"
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-bazel-test
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
+      cuda-version: "12.6"
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index b717f5e70d05..858a8e3b55c0 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -387,8 +387,8 @@ jobs:
     needs: get-label-type
     with:
       runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-bazel-test
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       cuda-version: cpu
       test-matrix: |
         { include: [
@@ -449,16 +449,16 @@ jobs:
         ]}
     secrets: inherit
 
-  unstable-linux-focal-cuda12_4-py3_10-gcc9-sm89-build-xfail:
+  unstable-linux-focal-cuda12_6-py3_10-gcc9-sm89-build-xfail:
     # A version of the build that sets a larger number of jobs for a build.  May
     # OOM
-    name: unstable-linux-focal-cuda12.4-py3.10-gcc9-sm89-xfail
+    name: unstable-linux-focal-cuda12.6-py3.10-gcc9-sm89-xfail
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm89
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm89
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       cuda-arch-list: 8.9
       max-jobs: 4
       # Doesn't actually run tests, but need this in order to prevent the build
@@ -537,14 +537,14 @@ jobs:
       timeout-minutes: 600
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
-    name: cuda12.4-py3.10-gcc9-sm75
+  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
+    name: cuda12.6-py3.10-gcc9-sm75
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm75
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
       cuda-arch-list: '7.5'
       test-matrix: |
         { include: [
@@ -552,14 +552,14 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
-    name: cuda12.4-py3.10-gcc9-sm75
+  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
+    name: cuda12.6-py3.10-gcc9-sm75
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
+    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
     with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm75
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
     secrets: inherit
 
   linux-jammy-xpu-2025_0-py3_9-build:
diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml
index 209bedbba337..bf0d33ebeb20 100644
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@@ -47,14 +47,14 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}
 
-  linux-focal-cuda12_4-py3_10-gcc9-sm86-build:
-    name: linux-focal-cuda12.4-py3.10-gcc9-sm86
+  linux-focal-cuda12_6-py3_10-gcc9-sm86-build:
+    name: linux-focal-cuda12.6-py3.10-gcc9-sm86
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
@@ -64,16 +64,16 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_4-py3_10-gcc9-sm86-test:
-    name: linux-focal-cuda12.4-py3.10-gcc9-sm86
+  linux-focal-cuda12_6-py3_10-gcc9-sm86-test:
+    name: linux-focal-cuda12.6-py3.10-gcc9-sm86
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-focal-cuda12_4-py3_10-gcc9-sm86-build
+      - linux-focal-cuda12_6-py3_10-gcc9-sm86-build
       - target-determination
     with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-sm86-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-sm86-build.outputs.test-matrix }}
     secrets: inherit
 
   linux-focal-py3_9-clang10-build:
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 0e5087d37464..5f718ea8ffdd 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -201,14 +201,14 @@ jobs:
 
   linux-focal-cuda12_6-py3_10-gcc9-experimental-split-build:
     if: false # See https://github.com/pytorch/pytorch/issues/138750
-    name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
+    name: linux-focal-cuda12.6-py3.10-gcc9-experimental-split-build
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       use_split_build: true
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9
       test-matrix: |
         { include: [
           { config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@@ -237,13 +237,13 @@ jobs:
     secrets: inherit
 
   # NB: Keep this in sync with inductor-perf-test-nightly.yml
-  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
-    name: cuda12.4-py3.10-gcc9-sm80
+  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
+    name: cuda12.6-py3.10-gcc9-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
+      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
       cuda-arch-list: '8.0'
     secrets: inherit
 

From 02320f021e06fc90f0afa015447f3980d96e66be Mon Sep 17 00:00:00 2001
From: Ting Lu <tingl@nvidia.com>
Date: Wed, 5 Mar 2025 19:49:03 -0800
Subject: [PATCH 3/6] Don't change inductor jobs

---
 .github/workflows/pull.yml  | 20 ++++++++++----------
 .github/workflows/trunk.yml |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 858a8e3b55c0..8946de141a8b 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -537,14 +537,14 @@ jobs:
       timeout-minutes: 600
     secrets: inherit
 
-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
-    name: cuda12.6-py3.10-gcc9-sm75
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
+    name: cuda12.4-py3.10-gcc9-sm75
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm75
-      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
       cuda-arch-list: '7.5'
       test-matrix: |
         { include: [
@@ -552,14 +552,14 @@ jobs:
         ]}
     secrets: inherit
 
-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
-    name: cuda12.6-py3.10-gcc9-sm75
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
+    name: cuda12.4-py3.10-gcc9-sm75
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
     with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm75
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
     secrets: inherit
 
   linux-jammy-xpu-2025_0-py3_9-build:
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 5f718ea8ffdd..bf379174266c 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -237,13 +237,13 @@ jobs:
     secrets: inherit
 
   # NB: Keep this in sync with inductor-perf-test-nightly.yml
-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
-    name: cuda12.6-py3.10-gcc9-sm80
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
+    name: cuda12.4-py3.10-gcc9-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
       cuda-arch-list: '8.0'
     secrets: inherit
 

From 44958d0c3c0dec273f1c0b181696dedbd67998e6 Mon Sep 17 00:00:00 2001
From: atalman <atalman@fb.com>
Date: Thu, 6 Mar 2025 08:00:53 -0800
Subject: [PATCH 4/6] Skip foreach tensor-of-scalarlist overload test on CUDA 12.6

---
 test/test_foreach.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/test/test_foreach.py b/test/test_foreach.py
index 9c0fc438d13a..71611a60a722 100644
--- a/test/test_foreach.py
+++ b/test/test_foreach.py
@@ -12,7 +12,7 @@
 import torch
 from torch.testing import make_tensor
 from torch.testing._comparison import default_tolerances
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -87,9 +87,9 @@ def __call__(self, inputs, is_cuda, expect_fastpath, **kwargs):
                 actual = self.func(*inputs, **kwargs)
             keys = tuple([e.key for e in p.key_averages()])
             mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
-            assert mta_called == (expect_fastpath and (not zero_size)), (
-                f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
-            )
+            assert (
+                mta_called == (expect_fastpath and (not zero_size))
+            ), f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
         else:
             actual = self.func(*inputs, **kwargs)
         if self.is_inplace:
@@ -355,6 +355,9 @@ def clone(arg):
 
     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
+    @unittest.skipIf(
+        _get_torch_cuda_version() >= (12, 6), "Test is fixed on cuda 12.1 update 1."
+    )
     def test_pointwise_op_with_tensor_of_scalarlist_overload(
         self, device, dtype, op, is_fastpath
     ):

From 3e551bda110b37b7ac11a16c2de8fbe518612743 Mon Sep 17 00:00:00 2001
From: atalman <atalman@fb.com>
Date: Thu, 6 Mar 2025 08:02:17 -0800
Subject: [PATCH 5/6] Correct skip reason for foreach test on CUDA 12.6

---
 test/test_foreach.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/test_foreach.py b/test/test_foreach.py
index 71611a60a722..fbf7e9715773 100644
--- a/test/test_foreach.py
+++ b/test/test_foreach.py
@@ -12,7 +12,6 @@
 import torch
 from torch.testing import make_tensor
 from torch.testing._comparison import default_tolerances
-from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -42,7 +41,10 @@
     TEST_WITH_ROCM,
     TestCase,
 )
-
+from torch.testing._internal.common_cuda import (
+    _get_torch_cuda_version,
+    TEST_MULTIGPU,
+)
 
 _BOOL_SUB_ERR_MSG = "Subtraction, the `-` operator"
 
@@ -87,9 +89,9 @@ def __call__(self, inputs, is_cuda, expect_fastpath, **kwargs):
                 actual = self.func(*inputs, **kwargs)
             keys = tuple([e.key for e in p.key_averages()])
             mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
-            assert (
-                mta_called == (expect_fastpath and (not zero_size))
-            ), f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
+            assert mta_called == (expect_fastpath and (not zero_size)), (
+                f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
+            )
         else:
             actual = self.func(*inputs, **kwargs)
         if self.is_inplace:
@@ -355,9 +357,7 @@ def clone(arg):
 
     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
-    @unittest.skipIf(
-        _get_torch_cuda_version() >= (12, 6), "Test is fixed on cuda 12.1 update 1."
-    )
+    @unittest.skipIf(_get_torch_cuda_version() >= (12, 6), "This test is failing on CUDA 12.6")
     def test_pointwise_op_with_tensor_of_scalarlist_overload(
         self, device, dtype, op, is_fastpath
     ):

From 69d52510b2554898d8d641a1df453fd9e3ee8b19 Mon Sep 17 00:00:00 2001
From: atalman <atalman@fb.com>
Date: Thu, 6 Mar 2025 08:41:56 -0800
Subject: [PATCH 6/6] Add tracking issue reference for CUDA 12.6 foreach test skip

---
 test/test_foreach.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/test_foreach.py b/test/test_foreach.py
index fbf7e9715773..8531c3a3422d 100644
--- a/test/test_foreach.py
+++ b/test/test_foreach.py
@@ -12,6 +12,7 @@
 import torch
 from torch.testing import make_tensor
 from torch.testing._comparison import default_tolerances
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -41,10 +42,7 @@
     TEST_WITH_ROCM,
     TestCase,
 )
-from torch.testing._internal.common_cuda import (
-    _get_torch_cuda_version,
-    TEST_MULTIGPU,
-)
+
 
 _BOOL_SUB_ERR_MSG = "Subtraction, the `-` operator"
 
@@ -357,7 +355,8 @@ def clone(arg):
 
     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
-    @unittest.skipIf(_get_torch_cuda_version() >= (12, 6), "This test is failing on CUDA 12.6")
+    # TODO: Remove skip CUDA 12.6 once resolved: https://github.com/pytorch/pytorch/issues/148681
+    @unittest.skipIf(_get_torch_cuda_version() >= (12, 6), "Failure on CUDA 12.6")
     def test_pointwise_op_with_tensor_of_scalarlist_overload(
         self, device, dtype, op, is_fastpath
     ):