pytorch
diff --git a/‎.ci/aarch64_linux/aarch64_wheel_ci_build.py
Lines changed: 1 addition & 1 deletion b/‎.ci/aarch64_linux/aarch64_wheel_ci_build.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/aarch64_linux/build_aarch64_wheel.py
Lines changed: 1 addition & 1 deletion b/‎.ci/aarch64_linux/build_aarch64_wheel.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/docker/common/install_rocm_drm.sh
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/common/install_rocm_drm.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/docker/libtorch/build.sh
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/libtorch/build.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/docker/manywheel/build.sh
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/manywheel/build.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/pytorch/smoke_test/max_autotune.py
Lines changed: 6 additions & 2 deletions b/‎.ci/pytorch/smoke_test/max_autotune.py
Lines changed: 6 additions & 2 deletions
diff --git a/‎.ci/pytorch/test.sh
Lines changed: 54 additions & 4 deletions b/‎.ci/pytorch/test.sh
Lines changed: 54 additions & 4 deletions
diff --git a/‎.github/merge_rules.yaml
Lines changed: 3 additions & 0 deletions b/‎.github/merge_rules.yaml
Lines changed: 3 additions & 0 deletions
diff --git a/‎.github/scripts/github_utils.py
Lines changed: 4 additions & 4 deletions b/‎.github/scripts/github_utils.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎.github/scripts/trymerge.py
Lines changed: 2 additions & 2 deletions b/‎.github/scripts/trymerge.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/scripts/trymerge_explainer.py
Lines changed: 1 addition & 1 deletion b/‎.github/scripts/trymerge_explainer.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/_binary-upload.yml
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/_binary-upload.yml
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/workflows/build-triton-wheel.yml
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/build-triton-wheel.yml
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/workflows/inductor-perf-test-nightly.yml
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/inductor-perf-test-nightly.yml
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/periodic.yml
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/periodic.yml
Lines changed: 3 additions & 3 deletions
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 26 additions & 0 deletions b/‎.github/workflows/trunk.yml
Lines changed: 26 additions & 0 deletions
diff --git a/‎aten/src/ATen/Context.cpp
Lines changed: 16 additions & 1 deletion b/‎aten/src/ATen/Context.cpp
Lines changed: 16 additions & 1 deletion
diff --git a/‎aten/src/ATen/Context.h
Lines changed: 14 additions & 0 deletions b/‎aten/src/ATen/Context.h
Lines changed: 14 additions & 0 deletions
diff --git a/‎aten/src/ATen/cpu/vec/vec_base.h
Lines changed: 1 addition & 1 deletion b/‎aten/src/ATen/cpu/vec/vec_base.h
Lines changed: 1 addition & 1 deletion
@@ -204,7 +204,7 @@ def parse_arguments():
         else:
             build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
     elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
 
     if enable_mkldnn:
         build_ArmComputeLibrary()
 
@@ -761,7 +761,7 @@ def start_build(
         version = host.check_output("cat pytorch/version.txt").strip()[:-2]
         build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
     if branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
     if host.using_docker():
         build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
     if enable_mkldnn:
 
@@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644
  	if (!fp) {
 -		fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
 -			strerror(errno));
-+		fprintf(stderr, "amdgpu.ids: No such file or directory\n");
++		//fprintf(stderr, "amdgpu.ids: No such file or directory\n");
  		return;
  	}
 
 
@@ -39,7 +39,7 @@ case ${GPU_ARCH_TYPE} in
         BASE_TARGET=rocm
         DOCKER_TAG=rocm${GPU_ARCH_VERSION}
         GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102;gfx942"
         DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
         ;;
     *)
 
@@ -97,7 +97,7 @@ case ${GPU_ARCH_TYPE} in
             DEVTOOLSET_VERSION="11"
             GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
         fi
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"
         DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
         ;;
     xpu)
 
@@ -46,7 +46,9 @@ def train(args, model, device, train_loader, optimizer, epoch):
         optimizer.step()
         if batch_idx % args.log_interval == 0:
             print(
-                f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"  # noqa: B950
+                f"Train Epoch: {epoch} "
+                f"[{batch_idx * len(data)}/{len(train_loader.dataset)} "
+                f"({100.0 * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"
             )
             if args.dry_run:
                 break
@@ -71,7 +73,9 @@ def test(model, device, test_loader):
     test_loss /= len(test_loader.dataset)
 
     print(
-        f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"  # noqa: B950
+        f"\nTest set: Average loss: {test_loss:.4f}, "
+        f"Accuracy: {correct}/{len(test_loader.dataset)} "
+        f"({100.0 * correct / len(test_loader.dataset):.0f}%)\n"
     )
 
 
 
@@ -494,8 +494,53 @@ test_cachebench() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
 
-  $TASKSET python "benchmarks/dynamo/cachebench.py" --mode training --benchmark torchbench --output "$TEST_REPORTS_DIR/cachebench_training.json"
-  $TASKSET python "benchmarks/dynamo/cachebench.py" --mode inference --benchmark torchbench --output "$TEST_REPORTS_DIR/cachebench_inference.json"
+  local BENCHMARK
+  if [[ "${SHARD_NUMBER}" == 1 ]]; then
+    local BENCHMARK=torchbench
+  elif [[ "${SHARD_NUMBER}" == 2 ]]; then
+    local BENCHMARK=huggingface
+  else
+    echo "invalid SHARD_NUMBER: ${SHARD_NUMBER}"
+    exit 1
+  fi
+
+  local mode_options=("training" "inference")
+
+  for mode in "${mode_options[@]}"; do
+    $TASKSET python "benchmarks/dynamo/cachebench.py" \
+        --mode "$mode" \
+        --device cuda \
+        --benchmark "$BENCHMARK" \
+        --repeat 3 \
+        --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}.json"
+
+    $TASKSET python "benchmarks/dynamo/cachebench.py" \
+        --mode "$mode" \
+        --dynamic \
+        --device cuda \
+        --benchmark "$BENCHMARK" \
+        --repeat 3 \
+        --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}_dynamic.json"
+  done
+}
+
+test_verify_cachebench() {
+  TMP_TEST_REPORTS_DIR=$(mktemp -d)
+  TEST_OUTPUT="$TMP_TEST_REPORTS_DIR/test.json"
+
+  $TASKSET python "benchmarks/dynamo/cachebench.py" \
+      --mode training \
+      --device cpu \
+      --model nanogpt \
+      --benchmark torchbench \
+      --output "$TEST_OUTPUT"
+
+  # -s checks file exists and is non empty
+  if [[ ! -s "$TEST_OUTPUT" ]]; then
+    echo "Cachebench failed to produce an output."
+    echo "Run 'python benchmarks/dynamo/cachebench.py' to make sure it works"
+    exit 1
+  fi
 }
 
 test_perf_for_dashboard() {
@@ -1449,7 +1494,7 @@ test_executorch() {
 test_linux_aarch64() {
   python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
         test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops \
+        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
         --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
 
   # Dynamo tests
@@ -1520,8 +1565,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
 elif [[ "${TEST_CONFIG}" == cachebench ]]; then
   install_torchaudio cuda
   install_torchvision
-  checkout_install_torchbench nanogpt BERT_pytorch resnet50
+  checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
   PYTHONPATH=$(pwd)/torchbench test_cachebench
+elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
+  install_torchaudio cpu
+  install_torchvision
+  checkout_install_torchbench nanogpt
+  PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   if [[ "${TEST_CONFIG}" == *cpu* ]]; then
     install_torchaudio cpu
 
@@ -334,6 +334,7 @@
   - XiaobingSuper
   - jgong5
   - mingfeima
+  - EikanWang
   mandatory_checks_name:
   - EasyCLA
   - Lint
@@ -366,6 +367,7 @@
   - jgong5
   - vfdev-5
   - leslie-fang-intel
+  - EikanWang
   mandatory_checks_name:
   - EasyCLA
   - Lint
@@ -379,6 +381,7 @@
   approved_by:
   - leslie-fang-intel
   - jgong5
+  - EikanWang
   mandatory_checks_name:
   - EasyCLA
   - Lint
 
@@ -57,10 +57,10 @@ def gh_fetch_url_and_headers(
             print(
                 f"""{url}
                 Rate limit exceeded:
-                Used: {err.headers['X-RateLimit-Used']}
-                Limit: {err.headers['X-RateLimit-Limit']}
-                Remaining: {err.headers['X-RateLimit-Remaining']}
-                Resets at: {err.headers['x-RateLimit-Reset']}"""
+                Used: {err.headers["X-RateLimit-Used"]}
+                Limit: {err.headers["X-RateLimit-Limit"]}
+                Remaining: {err.headers["X-RateLimit-Remaining"]}
+                Resets at: {err.headers["x-RateLimit-Reset"]}"""
             )
         else:
             print(f"Error fetching {url} {err}")
 
@@ -485,7 +485,7 @@ def get_check_run_name_prefix(workflow_run: Any) -> str:
     if workflow_run is None:
         return ""
     else:
-        return f'{workflow_run["workflow"]["name"]} / '
+        return f"{workflow_run['workflow']['name']} / "
 
 
 def is_passing_status(status: Optional[str]) -> bool:
@@ -545,7 +545,7 @@ def add_conclusions(edges: Any) -> None:
                     if not isinstance(checkrun_node, dict):
                         warn(f"Expected dictionary, but got {type(checkrun_node)}")
                         continue
-                    checkrun_name = f'{get_check_run_name_prefix(workflow_run)}{checkrun_node["name"]}'
+                    checkrun_name = f"{get_check_run_name_prefix(workflow_run)}{checkrun_node['name']}"
                     existing_checkrun = workflow_obj.jobs.get(checkrun_name)
                     if existing_checkrun is None or not is_passing_status(
                         existing_checkrun.status
 
@@ -79,7 +79,7 @@ def get_merge_message(
             (
                 "<details><summary>Advanced Debugging</summary>",
                 "Check the merge workflow status ",
-                f"<a href=\"{os.getenv('GH_RUN_URL')}\">here</a>",
+                f'<a href="{os.getenv("GH_RUN_URL")}">here</a>',
                 "</details>",
             )
         )
 
@@ -66,7 +66,6 @@ on:
 jobs:
   upload:
     runs-on: ubuntu-22.04
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
     container:
       image: continuumio/miniconda3:4.12.0
     env:
 
@@ -168,7 +168,6 @@ jobs:
       contents: read
     container:
       image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
     steps:
       - uses: actions/checkout@v3
 
 
@@ -105,7 +105,8 @@ jobs:
           { config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
           { config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
           { config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
-          { config: "cachebench", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
+          { config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
+          { config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
         ]}
       selected-test-configs: ${{ inputs.benchmark_configs }}
     secrets: inherit
 
@@ -149,9 +149,9 @@ jobs:
       docker-image-name: pytorch-linux-focal-rocm-n-py3
       test-matrix: |
         { include: [
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.2", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.2", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.2", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
         ]}
     secrets: inherit
 
 
@@ -246,3 +246,29 @@ jobs:
       docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
       cuda-arch-list: '8.0'
     secrets: inherit
+
+  verify-cachebench-cpu-build:
+    name: verify-cachebench-cpu-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.9-gcc11
+      docker-image-name: pytorch-linux-jammy-py3.9-gcc11
+      test-matrix: |
+        { include: [
+          { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+        ]}
+    secrets: inherit
+
+  verify-cachebench-cpu-test:
+    name: verify-cachebench-cpu-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - verify-cachebench-cpu-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.9-gcc11
+      docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
+      test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
+    secrets: inherit
@@ -332,7 +332,10 @@ at::BlasBackend Context::blasPreferredBackend() {
       static const std::vector<std::string> archs = {
           "gfx90a", "gfx942",
 #if ROCM_VERSION >= 60300
-          "gfx1100", "gfx1101"
+          "gfx1100", "gfx1101", "gfx1200", "gfx1201", // trailing comma required: without it "gfx1201" and "gfx950" concatenate when ROCM_VERSION >= 60500
+#endif
+#if ROCM_VERSION >= 60500
+          "gfx950"
 #endif
       };
       for (auto index: c10::irange(getNumGPUs())) {
@@ -430,6 +433,18 @@ void Context::setAllowFP16AccumulationCuBLAS(bool b) {
   allow_fp16_accumulation_cublas = b;
 }
 
+std::optional<int32_t> Context::_SMCarveout_EXPERIMENTAL() const {
+  return sm_carveout;
+}
+
+void Context::_setSMCarveout_EXPERIMENTAL(std::optional<int32_t> c) {
+  if (c.has_value()) {
+    TORCH_WARN_ONCE(
+      "Setting the SM carveout for matmuls is a temporary experimental mitigation for performance issues, "
+      "while more robust solutions are developed. It may be removed at any moment without notice.");
+  }
+  sm_carveout = c;
+}
 
 bool Context::hasMKL() {
 #if AT_MKL_ENABLED()
 
@@ -345,6 +345,19 @@ class TORCH_API Context {
   void setAllowBF16ReductionCuBLAS(bool);
   bool allowFP16AccumulationCuBLAS() const;
   void setAllowFP16AccumulationCuBLAS(bool);
+
+  // Matmuls can use a so-called "persistent" kernel which launches one CUDA
+  // block for each SM on the GPU, and each block then iterates over multiple
+  // output tiles. This allows software pipelining to hide the begin/end
+  // latencies (e.g., epilogue), especially when only one tile fits per SM.
+  // However, if some SMs are busy (e.g., with a background NCCL kernel), the
+  // matmul's blocks will be scheduled in two waves and, in the absence of some
+  // smart load balancing, the kernel will take twice as long. This flag lets
+  // matmuls target only a subset of the SMs, so they can fully schedule
+  // even next to a comms kernel, and only be a few percent slower.
+  std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
+  void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
+
   at::QEngine qEngine() const;
   void setQEngine(at::QEngine e);
   static const std::vector<at::QEngine>& supportedQEngines();
@@ -423,6 +436,7 @@ class TORCH_API Context {
   bool allow_fp16_reduction_cublas = true;
   bool allow_bf16_reduction_cublas = true;
   bool allow_fp16_accumulation_cublas = false;
+  std::optional<int32_t> sm_carveout = std::nullopt;
   bool enabled_mkldnn = true;
   bool allow_tf32_onednn = false;
   bool enabled_nnpack = true;
 
@@ -50,7 +50,7 @@
 /*
 https://learn.microsoft.com/en-us/cpp/overview/compiler-versions?view=msvc-170
 Use _MSC_FULL_VER to identify current compiler is msvc,
-Windows llvm will not have this defination.
+Windows llvm will not have this definition.
 */
 #define __msvc_cl__
 #endif
Original file line number	Diff line number	Diff line change
`@@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644`
`115`	`115`	`if (!fp) {`
`116`	`116`	`- fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,`
`117`	`117`	`- strerror(errno));`
`118`		`-+ fprintf(stderr, "amdgpu.ids: No such file or directory\n");`
	`118`	`++ //fprintf(stderr, "amdgpu.ids: No such file or directory\n");`
`119`	`119`	`return;`
`120`	`120`	`}`
`121`	`121`
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ def get_merge_message(`
`79`	`79`	`(`
`80`	`80`	`"<details><summary>Advanced Debugging</summary>",`
`81`	`81`	`"Check the merge workflow status ",`
`82`		`- f"<a href=\"{os.getenv('GH_RUN_URL')}\">here</a>",`
	`82`	`+ f'<a href="{os.getenv("GH_RUN_URL")}">here</a>',`
`83`	`83`	`"</details>",`
`84`	`84`	`)`
`85`	`85`	`)`