Update on "[Cutlass] Remove unused dtype conversion map" · pytorch/pytorch@735ab65 · GitHub

Commit 735ab65

Update on "[Cutlass] Remove unused dtype conversion map"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov [ghstack-poisoned]
2 parents b652a21 + 41797f6 commit 735ab65

File tree

110 files changed: +1313, -1002 lines


.ci/docker/common/install_inductor_benchmark_deps.sh

Lines changed: 0 additions & 7 deletions
@@ -14,13 +14,6 @@ function install_timm() {
   local commit
   commit=$(get_pinned_commit timm)
 
-  # TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
-  # I'm using nightly here instead. We just need to package to be able to install
-  # TIMM. Removing this once vision has a release on 3.13
-  if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
-    pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
-  fi
-
   pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
   # Clean up
   conda_run pip uninstall -y cmake torch torchvision triton

.ci/magma/Makefile

Lines changed: 1 addition & 7 deletions
@@ -12,13 +12,12 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
 	-e DESIRED_CUDA=${DESIRED_CUDA} \
 	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
-	"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
+	"pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \
 	magma/build_magma.sh
 
 .PHONY: all
 all: magma-cuda128
 all: magma-cuda126
-all: magma-cuda124
 all: magma-cuda118
 
 .PHONY:
@@ -37,11 +36,6 @@ magma-cuda126: DESIRED_CUDA := 12.6
 magma-cuda126:
 	$(DOCKER_RUN)
 
-.PHONY: magma-cuda124
-magma-cuda124: DESIRED_CUDA := 12.4
-magma-cuda124:
-	$(DOCKER_RUN)
-
 .PHONY: magma-cuda118
 magma-cuda118: DESIRED_CUDA := 11.8
 magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37

.github/workflows/build-magma-linux.yml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ jobs:
       id-token: write
     strategy:
       matrix:
-        cuda_version: ["128", "126", "124", "118"]
+        cuda_version: ["128", "126", "118"]
     steps:
       - name: Checkout PyTorch
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

.github/workflows/llm_td_retrieval.yml

Lines changed: 5 additions & 9 deletions
@@ -50,18 +50,13 @@ jobs:
           ref: v0.0.2
           path: llm-target-determinator
 
-      - name: Setup miniconda
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
-        with:
-          python-version: "3.9"
-
       - name: Install requirements
         shell: bash
         run: |
           set -euxo pipefail
-          ${CONDA_RUN} pip install -r llm-target-determinator/requirements.txt
+          python3 -m pip install -r llm-target-determinator/requirements.txt
           cd "${GITHUB_WORKSPACE}/codellama"
-          ${CONDA_RUN} pip install -e .
+          python3 -m pip install -e .
 
       - name: Fetch CodeLlama Checkpoint
         shell: bash
@@ -80,7 +75,7 @@ jobs:
           shell: bash
           command: |
             set -euxo pipefail
-            ${CONDA_RUN} python -m pip install awscli==1.29.40
+            python3 -m pip install awscli==1.29.40
             cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets
             aws s3 cp "s3://target-determinator-assets/indexes/latest" . --recursive
 
@@ -94,7 +89,8 @@ jobs:
         run: |
           set -euxo pipefail
           cd "${GITHUB_WORKSPACE}"/llm-target-determinator
-          ${CONDA_RUN} torchrun \
+          export PATH="$HOME/.local/bin:$PATH"
+          torchrun \
             --standalone \
             --nnodes=1 \
             --nproc-per-node=1 \

.github/workflows/upload-test-stats-while-running.yml

Lines changed: 2 additions & 7 deletions
@@ -24,17 +24,12 @@ jobs:
       - name: Setup Linux
        uses: ./.github/actions/setup-linux
 
-      - name: Setup miniconda
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
-        with:
-          python-version: "3.10"
-
       - name: Install requirements
         run: |
-          ${CONDA_RUN} pip install requests==2.32.2 boto3==1.35.42
+          python3 -m pip install requests==2.32.2 boto3==1.35.42
 
       - name: Upload test stats
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          ${CONDA_RUN} python -m tools.stats.upload_test_stats_running_jobs
+          python3 -m tools.stats.upload_test_stats_running_jobs

aten/src/ATen/core/List_test.cpp

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 #include <gtest/gtest.h>
 
 using namespace c10;
+using std::string;
 
 // NOLINTBEGIN(performance-move-const-arg, bugprone-use-after-move, *analyzer*Move)
 TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) {

aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp

Lines changed: 2 additions & 2 deletions
@@ -519,7 +519,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithDictInput
   EXPECT_EQ(2, captured_dict_size);
 }
 
-string kernelWithDictInputWithOutput(Dict<string, string> input1) {
+std::string kernelWithDictInputWithOutput(Dict<string, string> input1) {
   return input1.at("key2");
 }
 
@@ -581,7 +581,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithUnordered
   EXPECT_EQ(2, captured_dict_size);
 }
 
-string kernelWithUnorderedMapInputWithOutput(std::unordered_map<string, string> input1) {
+std::string kernelWithUnorderedMapInputWithOutput(std::unordered_map<string, string> input1) {
   return input1.at("key2");
 }

aten/src/ATen/core/boxing/impl/kernel_function_test.cpp

Lines changed: 1 addition & 1 deletion
@@ -468,7 +468,7 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithDictInput_witho
   EXPECT_EQ(2, captured_dict_size);
 }
 
-string kernelWithDictInputWithOutput(Dict<string, string> input1) {
+std::string kernelWithDictInputWithOutput(Dict<string, string> input1) {
   return input1.at("key2");
 }

aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp

Lines changed: 6 additions & 6 deletions
@@ -463,7 +463,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withou
 }
 
 struct KernelWithDictInputWithOutput final : OperatorKernel {
-  string operator()(Dict<string, string> input1) {
+  std::string operator()(Dict<string, std::string> input1) {
     return input1.at("key2");
   }
 };
@@ -475,7 +475,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu
   auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_input", ""});
   ASSERT_TRUE(op.has_value());
 
-  Dict<string, string> dict;
+  Dict<string, std::string> dict;
   dict.insert("key1", "value1");
   dict.insert("key2", "value2");
   auto outputs = callOp(*op, dict);
@@ -484,7 +484,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu
 }
 
 struct KernelWithDictOutput final : OperatorKernel {
-  Dict<string, string> operator()(Dict<string, string> input) {
+  Dict<string, std::string> operator()(Dict<string, std::string> input) {
     return input;
   }
 };
@@ -496,12 +496,12 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictOutput_whenR
   auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_output", ""});
   ASSERT_TRUE(op.has_value());
 
-  Dict<string, string> dict;
+  Dict<string, std::string> dict;
   dict.insert("key1", "value1");
   dict.insert("key2", "value2");
   auto outputs = callOp(*op, dict);
   EXPECT_EQ(1, outputs.size());
-  auto output = c10::impl::toTypedDict<string, string>(outputs[0].toGenericDict());
+  auto output = c10::impl::toTypedDict<string, std::string>(outputs[0].toGenericDict());
 
   EXPECT_EQ(2, output.size());
   EXPECT_EQ("value1", output.at("key1"));
@@ -520,7 +520,7 @@ class KernelWithCache final : public OperatorKernel {
 };
 
 struct KernelWithTupleInput final : OperatorKernel {
-  string operator()(std::tuple<string, int64_t, double> input1) {
+  std::string operator()(std::tuple<string, int64_t, double> input1) {
     return std::get<0>(input1);
   }
 };

aten/src/ATen/native/MathBitsFallback.h

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ namespace at::native {
 
 // NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit.
 struct MathOpFallback {
-  MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {}
+  MathOpFallback(DispatchKey key_, std::string op_name_) : key(key_), op_name(std::move(op_name_)) {}
   virtual bool is_bit_set(const Tensor&) = 0;
   void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
     /*
@@ -151,7 +151,7 @@ struct MathOpFallback {
   virtual ~MathOpFallback() = default;
 
   DispatchKey key;
-  string op_name;
+  std::string op_name;
 };
 
 } // namespace at::native

aten/src/ATen/native/cuda/Indexing.cu

Lines changed: 1 addition & 1 deletion
@@ -1735,7 +1735,7 @@ Tensor& index_select_out_cuda(
     int64_t dim,
     const Tensor& index,
     Tensor& out) {
-  static constexpr string_view DIM_WARNING =
+  static constexpr std::string_view DIM_WARNING =
       "Tensor too large or too many (> 25) dimensions";
   TORCH_CHECK(
       at::cuda::check_device({out, self, index}),

aten/src/ATen/native/hip/ck_gemm_bfloat16.hip

Lines changed: 1 addition & 1 deletion
@@ -772,7 +772,7 @@ void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
 template <>
 void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
   auto dprops = at::cuda::getCurrentDeviceProperties();
-  c10::string_view arch(dprops->gcnArchName);
+  std::string_view arch(dprops->gcnArchName);
   if (arch == "gfx1100") {
     dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
   } else{

aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp

Lines changed: 3 additions & 3 deletions
@@ -101,11 +101,11 @@ void quantized_matmul(
     std::optional<at::Tensor> other, // extra input for binary-post-op
     double other_scale,
     int64_t other_zero_point,
-    const c10::string_view& binary_post_op,
+    const std::string_view& binary_post_op,
     double binary_alpha,
-    const c10::string_view& unary_post_op,
+    const std::string_view& unary_post_op,
     torch::List<std::optional<at::Scalar>>& unary_post_op_args,
-    c10::string_view unary_post_op_algorithm,
+    std::string_view unary_post_op_algorithm,
     bool m2_trans) {
   // [Note] Quantized Matrix Multiplication at XPU
   // The following code integrates oneDNN quantized gemm. The quantization

aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h

Lines changed: 3 additions & 3 deletions
@@ -156,11 +156,11 @@ void quantized_matmul(
     std::optional<at::Tensor> other, // extra input for binary-post-op
     double other_scale,
     int64_t other_zero_point,
-    const c10::string_view& binary_post_op,
+    const std::string_view& binary_post_op,
     double binary_alpha,
-    const c10::string_view& unary_post_op,
+    const std::string_view& unary_post_op,
     torch::List<std::optional<at::Scalar>>& unary_post_op_args,
-    c10::string_view unary_post_op_algorithm,
+    std::string_view unary_post_op_algorithm,
     bool m2_trnas);
 
 void gpu_float_sdpa(

aten/src/ATen/native/mkldnn/xpu/qlinear.cpp

Lines changed: 6 additions & 6 deletions
@@ -151,11 +151,11 @@ static Tensor q_linear_pointwise_binary(
     std::optional<c10::ScalarType> output_dtype,
     double other_scale,
     int64_t other_zero_point,
-    c10::string_view binary_post_op,
+    std::string_view binary_post_op,
     double binary_alpha,
-    c10::string_view unary_post_op,
+    std::string_view unary_post_op,
     torch::List<std::optional<at::Scalar>> unary_post_op_args,
-    c10::string_view unary_post_op_algorithm) {
+    std::string_view unary_post_op_algorithm) {
   TORCH_CHECK(
       act.device() == weight.device() &&
       act.device() == weight_scales.device() &&
@@ -222,11 +222,11 @@ static Tensor q_linear_pointwise_binary_tensor(
     std::optional<c10::ScalarType> output_dtype,
     double other_scale,
     int64_t other_zero_point,
-    c10::string_view binary_post_op,
+    std::string_view binary_post_op,
     double binary_alpha,
-    c10::string_view unary_post_op,
+    std::string_view unary_post_op,
     torch::List<std::optional<at::Scalar>> unary_post_op_args,
-    c10::string_view unary_post_op_algorithm) {
+    std::string_view unary_post_op_algorithm) {
   return q_linear_pointwise_binary(
       act,
       act_scale.item().toDouble(),

aten/src/ATen/native/mps/MetalShaderLibrary.h

Lines changed: 7 additions & 0 deletions
@@ -29,6 +29,9 @@ struct TensorIteratorBase;
 
 namespace at::native::mps {
 
+// Forward declaration of MPSScalar - for exec_binary_alpha_kernel()
+struct MPSScalar;
+
 namespace detail {
 template <typename T>
 class has_size_type {
@@ -138,6 +141,10 @@ class MetalShaderLibrary {
       const std::string& name,
       std::optional<int64_t> extra = std::nullopt);
   void exec_binary_kernel(TensorIteratorBase& iter, const std::string& name);
+  void exec_binary_alpha_kernel(
+      TensorIteratorBase& iter,
+      const std::string& name,
+      const MPSScalar& alpha);
 
  protected:
   virtual MTLLibrary_t getLibrary();

aten/src/ATen/native/mps/OperationUtils.h

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph* mpsGraph, const TensorBase&
 MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType);
 MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, const Scalar& scalar);
 
-string get_mem_format_string(c10::MemoryFormat memory_format);
+std::string get_mem_format_string(c10::MemoryFormat memory_format);
 
 using MPSCacheKey = uint64_t;

0 commit comments
