Update · pytorch/pytorch@106c7b6 · GitHub

Commit 106c7b6 ("Update")
[ghstack-poisoned]
2 parents 59cf6e5 + 781446b

58 files changed: +1632 −907 lines. (GitHub hides part of large commits by default; only the files below are shown.)

aten/src/ATen/cuda/tunable/Tunable.cpp

Lines changed: 8 additions & 0 deletions
@@ -31,7 +31,11 @@
 
 // for validators
 #ifdef USE_ROCM
+#ifdef _WIN32
+#include <hip/hip_version.h>
+#else
 #include <rocm-core/rocm_version.h>
+#endif
 #define ROCBLAS_BETA_FEATURES_API
 #include <rocblas/rocblas.h>
 #include <hipblaslt/hipblaslt.h>

@@ -218,7 +222,11 @@ TuningResultsValidator::TuningResultsValidator() {
 #ifdef USE_ROCM
   // rocm
   {
+#ifdef _WIN32
+    std::string rocm_version = HIP_VERSION_BUILD_NAME;
+#else
    std::string rocm_version = ROCM_BUILD_INFO;
+#endif
     RegisterValidator(
         "ROCM_VERSION",
         [rocm_version]() { return rocm_version; },
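On Windows the ROCm version string comes from the HIP SDK header instead of rocm-core. A minimal standalone sketch of the pattern, with hypothetical placeholder values for the two macros (the real ones are supplied by <hip/hip_version.h> and <rocm-core/rocm_version.h>):

// Standalone sketch of the platform-conditional version lookup above.
// Assumption: HIP_VERSION_BUILD_NAME is defined by <hip/hip_version.h> on
// Windows and ROCM_BUILD_INFO by <rocm-core/rocm_version.h> elsewhere; the
// literals below are hypothetical stand-ins so the sketch compiles anywhere.
#include <iostream>
#include <string>

#ifndef HIP_VERSION_BUILD_NAME
#define HIP_VERSION_BUILD_NAME "hip-6.x-win-build" // placeholder
#endif
#ifndef ROCM_BUILD_INFO
#define ROCM_BUILD_INFO "rocm-6.x-linux-build" // placeholder
#endif

int main() {
#ifdef _WIN32
  std::string rocm_version = HIP_VERSION_BUILD_NAME; // Windows: HIP SDK header
#else
  std::string rocm_version = ROCM_BUILD_INFO; // Linux: rocm-core header
#endif
  // TunableOp records this string under the "ROCM_VERSION" validator key, so
  // cached tuning results can be rejected when the toolkit version changes.
  std::cout << rocm_version << '\n';
}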

aten/src/ATen/native/mps/OperationUtils.mm

Lines changed: 35 additions & 31 deletions
@@ -638,28 +638,28 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) {
   switch (type) {
     case ScalarType::Double:
     case ScalarType::Float:
-      return {.value.f = scalar.to<float>(), .size = sizeof(float), .type = type};
+      return {.size = sizeof(float), .type = type, .value.f = scalar.to<float>()};
     case ScalarType::Half:
-      return {.value.h = scalar.to<Half>(), .size = sizeof(short), .type = type};
+      return {.size = sizeof(short), .type = type, .value.h = scalar.to<Half>()};
     case ScalarType::BFloat16:
-      return {.value.bf16 = scalar.to<BFloat16>(), .size = sizeof(short), .type = type};
+      return {.size = sizeof(short), .type = type, .value.bf16 = scalar.to<BFloat16>()};
     case ScalarType::Long:
-      return {.value.i = scalar.to<int64_t>(), .size = sizeof(int64_t), .type = type};
+      return {.size = sizeof(int64_t), .type = type, .value.i = scalar.to<int64_t>()};
     case ScalarType::Int:
-      return {.value.i = scalar.to<int32_t>(), .size = sizeof(int32_t), .type = type};
+      return {.size = sizeof(int32_t), .type = type, .value.i = scalar.to<int32_t>()};
     case ScalarType::Short:
-      return {.value.i = scalar.to<int16_t>(), .size = sizeof(int16_t), .type = type};
+      return {.size = sizeof(int16_t), .type = type, .value.i = scalar.to<int16_t>()};
     case ScalarType::Char:
-      return {.value.i = scalar.to<int8_t>(), .size = sizeof(int8_t), .type = type};
+      return {.size = sizeof(int8_t), .type = type, .value.i = scalar.to<int8_t>()};
     case ScalarType::Byte:
-      return {.value.i = scalar.to<uint8_t>(), .size = sizeof(uint8_t), .type = type};
+      return {.size = sizeof(uint8_t), .type = type, .value.i = scalar.to<uint8_t>()};
     case ScalarType::Bool:
-      return {.value.b = scalar.to<bool>(), .size = sizeof(bool), .type = type};
+      return {.size = sizeof(bool), .type = type, .value.b = scalar.to<bool>()};
     case ScalarType::ComplexHalf:
-      return {.value.ch = scalar.to<c10::complex<Half>>(), .size = sizeof(int32_t), .type = type};
+      return {.size = sizeof(int32_t), .type = type, .value.ch = scalar.to<c10::complex<Half>>()};
     case ScalarType::ComplexFloat:
     case ScalarType::ComplexDouble:
-      return {.value.cf = scalar.to<c10::complex<float>>(), .size = sizeof(int64_t), .type = type};
+      return {.size = sizeof(int64_t), .type = type, .value.cf = scalar.to<c10::complex<float>>()};
     default:
       TORCH_INTERNAL_ASSERT(false, "Unsupported scalar type '", type, "' on MPS backend.");
   }

@@ -965,45 +965,49 @@ static dispatch_data_t getSectionData(const std::string& name) {
                               std::optional<int64_t> extra) {
   auto inputTensor = iter.input(0);
   auto outputTensor = iter.output(0);
-  bool is_dense_strided = is_dense_in_storage(inputTensor) && inputTensor.strides().equals(outputTensor.strides());
-  bool needs_output_copy = false;
-  uint32_t length = outputTensor.numel();
+  bool is_storage_dense = is_dense_in_storage(inputTensor) && inputTensor.strides().equals(outputTensor.strides());
+  uint32_t length = iter.numel();
   if (length == 0) {
     return;
   }
   using namespace mps;
   @autoreleasepool {
     id<MTLComputePipelineState> cplState = nil;
-    cplState = getPipelineStateForFunc(fmt::format(
-        "{}_dense_{}_{}", name, scalarToMetalTypeString(outputTensor), scalarToMetalTypeString(inputTensor)));
-
-    if (!is_dense_strided) {
-      inputTensor = inputTensor.contiguous();
-      if (!outputTensor.is_contiguous()) {
-        outputTensor = outputTensor.contiguous();
-        needs_output_copy = true;
-      }
-    }
+    cplState = getPipelineStateForFunc(fmt::format("{}_{}_{}_{}",
+                                                   name,
+                                                   is_storage_dense ? "dense" : "strided",
+                                                   scalarToMetalTypeString(outputTensor),
+                                                   scalarToMetalTypeString(inputTensor)));

     MPSStream* mpsStream = getCurrentMPSStream();
     dispatch_sync(mpsStream->queue(), ^() {
-      id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
+      auto computeEncoder = mpsStream->commandEncoder();

       getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor});

       [computeEncoder setComputePipelineState:cplState];
-      mtl_setArgs(computeEncoder, outputTensor, inputTensor);
-      if (extra) {
-        mtl_setBytes(computeEncoder, *extra, 2);
+      if (is_storage_dense) {
+        mtl_setArgs(computeEncoder, outputTensor, inputTensor);
+        if (extra) {
+          mtl_setBytes(computeEncoder, *extra, 2);
+        }
+      } else {
+        mtl_setArgs(computeEncoder,
+                    outputTensor,
+                    inputTensor,
+                    outputTensor.sizes(),
+                    inputTensor.strides(),
+                    outputTensor.strides(),
+                    inputTensor.ndimension());
+        if (extra) {
+          mtl_setBytes(computeEncoder, *extra, 6);
+        }
       }
       mtl_dispatch1DJob(computeEncoder, cplState, length);

       getMPSProfiler().endProfileKernel(cplState);
     });
   }
-  if (needs_output_copy) {
-    iter.output(0).copy_(outputTensor);
-  }
 }

 MetalShaderLibrary& MetalShaderLibrary::getBundledLibrary() {
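Two separate fixes land in this file. The getMPSScalar change reorders the designated initializers to match MPSScalar's member declaration order; C++20 requires designators to appear in declaration order, so the old value-first spelling is rejected by newer compilers. A compilable sketch of the rule, using a hypothetical stand-in struct (the nested .value.f designator is a clang extension, matching how this Objective-C++ file is built):

// Sketch: designated initializers must follow declaration order in C++20.
// MPSScalarish is a hypothetical stand-in for at::native::mps::MPSScalar.
#include <cstddef>

struct MPSScalarish {
  size_t size;
  int type; // stand-in for ScalarType
  union { float f; long long i; bool b; } value;
};

MPSScalarish ok() {
  // Members named in declaration order (size, type, value): well-formed.
  return {.size = sizeof(float), .type = 0, .value.f = 1.5f};
}

// MPSScalarish bad() {
//   // value named before size/type: ill-formed in C++20, clang rejects it.
//   return {.value.f = 1.5f, .size = sizeof(float), .type = 0};
// }

The exec_unary_kernel change replaces the contiguous-copy fallback (and its needs_output_copy write-back) with direct dispatch to a "strided" kernel variant: the kernel name now selects dense or strided at runtime, and the strided path binds sizes, strides, and ndim as extra buffer arguments, which is why the optional extra scalar moves from buffer index 2 to index 6.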

aten/src/ATen/native/mps/kernels/UnaryKernel.metal

Lines changed: 38 additions & 9 deletions
@@ -99,7 +99,7 @@ INSTANTIATE_UNARY_KERNELS_VEC2(half);
 INSTANTIATE_UNARY_KERNELS_VEC2(float);

 template <typename T>
-kernel void round_decimals_kernel(
+kernel void round_decimals_dense(
     device T* output [[buffer(0)]],
     constant T* input [[buffer(1)]],
     constant long& ndigits [[buffer(2)]],

@@ -108,14 +108,43 @@ kernel void round_decimals_kernel(
       rint(exp10(float(ndigits)) * input[index]) * exp10(float(-ndigits)));
 }

-#define INSTANTIATE_ROUND_DECIMALS(DTYPE)                                    \
-  template                                                                   \
-      [[host_name("round_decimals_dense_" #DTYPE "_" #DTYPE)]] kernel void   \
-      round_decimals_kernel(                                                 \
-          device DTYPE* output [[buffer(0)]],                                \
-          constant DTYPE* input [[buffer(1)]],                               \
-          constant long& ndigits [[buffer(2)]],                              \
-          uint id [[thread_position_in_grid]])
+template <typename T>
+kernel void round_decimals_strided(
+    device T* output [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant long* sizes [[buffer(2)]],
+    constant long* input_strides [[buffer(3)]],
+    constant long* output_strides [[buffer(4)]],
+    constant uint& ndim [[buffer(5)]],
+    constant long& ndigits [[buffer(6)]],
+    uint index [[thread_position_in_grid]]) {
+  int pos[max_ndim];
+  pos_from_thread_index(int(index), pos, sizes, ndim);
+  const auto input_offs = offset_from_coord(pos, input_strides, ndim);
+  const auto output_offs = offset_from_coord(pos, output_strides, ndim);
+  output[output_offs] = static_cast<T>(
+      rint(exp10(float(ndigits)) * input[input_offs]) * exp10(float(-ndigits)));
+}
+
+#define INSTANTIATE_ROUND_DECIMALS(DTYPE)                                    \
+  template                                                                   \
+      [[host_name("round_decimals_dense_" #DTYPE "_" #DTYPE)]] kernel void   \
+      round_decimals_dense(                                                  \
+          device DTYPE* output [[buffer(0)]],                                \
+          constant DTYPE* input [[buffer(1)]],                               \
+          constant long& ndigits [[buffer(2)]],                              \
+          uint index [[thread_position_in_grid]]);                           \
+  template                                                                   \
+      [[host_name("round_decimals_strided_" #DTYPE "_" #DTYPE)]] kernel void \
+      round_decimals_strided(                                                \
+          device DTYPE* output [[buffer(0)]],                                \
+          constant DTYPE* input [[buffer(1)]],                               \
+          constant long* sizes,                                              \
+          constant long* input_strides,                                      \
+          constant long* output_strides,                                     \
+          constant uint& ndim,                                               \
+          constant long& ndigits [[buffer(6)]],                              \
+          uint index)

 INSTANTIATE_ROUND_DECIMALS(float);
 INSTANTIATE_ROUND_DECIMALS(half);
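Both the dense and strided kernels implement round-to-ndigits as rint(10^d · x) · 10^(−d). A host-side C++ check of the same arithmetic (plain C++ rather than Metal, with exp10 spelled via std::pow):

// Host-side sketch of the round_decimals math: round(x, d) = rint(10^d * x) / 10^d.
#include <cmath>
#include <cstdio>

float round_decimals(float x, long ndigits) {
  const float scale = std::pow(10.0f, float(ndigits));
  return std::rint(scale * x) * std::pow(10.0f, float(-ndigits));
}

int main() {
  std::printf("%f\n", round_decimals(3.14159f, 2));  // 3.140000
  std::printf("%f\n", round_decimals(1234.5f, -2)); // 1200.000000
}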

benchmarks/dynamo/pr_time_benchmarks/check_results.py

Lines changed: 12 additions & 2 deletions
@@ -210,9 +210,19 @@ def log(event_name):
         writer.writerow([])
         writer.writerow([])

-    print("new expected results file content if needed:")
+    print("=" * 80)
+    print("=" * 80)
+    print("=" * 80)
+    print("To update expected results, run the following command:")
+    print()
+    print("cat > benchmarks/dynamo/pr_time_benchmarks/expected_results.csv << EOF")
     with open(reference_expected_results_path) as f:
-        print(f.read())
+        print(f.read().rstrip())
+    print("EOF")
+    print()
+    print("=" * 80)
+    print("=" * 80)
+    print("=" * 80)

     if fail:
         print(
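The rstrip() is what makes the printed heredoc copy-pasteable: f.read() already ends in a newline, so stripping it keeps the EOF terminator on the line immediately after the last CSV row instead of appending stray blank lines to the regenerated expected_results.csv.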

c10/metal/indexing.h

Lines changed: 35 additions & 9 deletions
@@ -52,15 +52,41 @@ kernel void unary_dense(
   output[index] = f(input[index]);
 }

-#define REGISTER_UNARY_OP(NAME, DTYPE0, DTYPE1)                               \
-  static_assert(                                                              \
-      ::metal::                                                               \
-          is_same_v<DTYPE1, ::c10::metal::result_of<NAME##_functor, DTYPE0>>, \
-      "Output dtype mismatch for unary op " #NAME " and input " #DTYPE0);     \
-  template [[host_name(#NAME "_dense_" #DTYPE1 "_" #DTYPE0)]] kernel void ::  \
-      c10::metal::unary_dense<DTYPE0, NAME##_functor>(                        \
-          device ::c10::metal::result_of<NAME##_functor, DTYPE0> * output,    \
-          constant DTYPE0 * input,                                            \
+template <typename T, typename F>
+kernel void unary_strided(
+    device result_of<F, T>* output [[buffer(0)]],
+    constant T* input [[buffer(1)]],
+    constant long* sizes [[buffer(2)]],
+    constant long* input_strides [[buffer(3)]],
+    constant long* output_strides [[buffer(4)]],
+    constant uint& ndim [[buffer(5)]],
+    uint index [[thread_position_in_grid]]) {
+  F f;
+  int pos[max_ndim];
+  pos_from_thread_index(int(index), pos, sizes, ndim);
+  const auto input_offs = offset_from_coord(pos, input_strides, ndim);
+  const auto output_offs = offset_from_coord(pos, output_strides, ndim);
+  output[output_offs] = f(input[input_offs]);
+}
+
+#define REGISTER_UNARY_OP(NAME, DTYPE0, DTYPE1)                               \
+  static_assert(                                                              \
+      ::metal::                                                               \
+          is_same_v<DTYPE1, ::c10::metal::result_of<NAME##_functor, DTYPE0>>, \
+      "Output dtype mismatch for unary op " #NAME " and input " #DTYPE0);     \
+  template [[host_name(#NAME "_dense_" #DTYPE1 "_" #DTYPE0)]] kernel void ::  \
+      c10::metal::unary_dense<DTYPE0, NAME##_functor>(                        \
+          device ::c10::metal::result_of<NAME##_functor, DTYPE0> * output,    \
+          constant DTYPE0 * input,                                            \
+          uint index);                                                        \
+  template [[host_name(#NAME "_strided_" #DTYPE1 "_" #DTYPE0)]] kernel void ::\
+      c10::metal::unary_strided<DTYPE0, NAME##_functor>(                      \
+          device ::c10::metal::result_of<NAME##_functor, DTYPE0> * output,    \
+          constant DTYPE0 * input,                                            \
+          constant long* sizes,                                               \
+          constant long* input_strides,                                       \
+          constant long* output_strides,                                      \
+          constant uint& ndim,                                                \
           uint index)

 #define DEFINE_UNARY_FLOATING_FUNCTOR(NAME) \
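unary_strided first unravels the flat thread index into a multi-dimensional coordinate, then folds that coordinate against each tensor's strides to get independent input and output offsets. A plain-C++ sketch of what pos_from_thread_index and offset_from_coord plausibly compute (their real definitions live elsewhere in this header; the fastest-varying-dimension order is an assumption):

#include <cstdint>
#include <cstdio>

constexpr int max_ndim = 16; // assumption: mirrors the Metal-side constant

// Unravel a flat thread index into per-dimension coordinates.
// Assumption: dimension 0 varies fastest.
void pos_from_thread_index(int idx, int pos[max_ndim], const long* sizes, uint32_t ndim) {
  for (uint32_t d = 0; d < ndim; ++d) {
    pos[d] = idx % int(sizes[d]);
    idx /= int(sizes[d]);
  }
}

// Dot product of coordinates with strides yields the element offset.
long offset_from_coord(const int pos[max_ndim], const long* strides, uint32_t ndim) {
  long offs = 0;
  for (uint32_t d = 0; d < ndim; ++d) {
    offs += long(pos[d]) * strides[d];
  }
  return offs;
}

int main() {
  // A 2x3 tensor read through a transposed view (strides {1, 3}) and written
  // densely (strides {3, 1}): the strided kernel needs no contiguous copy.
  const long sizes[] = {2, 3};
  const long in_strides[] = {1, 3};
  const long out_strides[] = {3, 1};
  int pos[max_ndim];
  for (int i = 0; i < 6; ++i) {
    pos_from_thread_index(i, pos, sizes, 2);
    std::printf("thread %d -> in %ld, out %ld\n", i,
                offset_from_coord(pos, in_strides, 2),
                offset_from_coord(pos, out_strides, 2));
  }
}

This is the layout contract behind the host-side change in OperationUtils.mm, which binds sizes, input strides, output strides, and ndim at buffer indices 2 through 5 for the strided variant.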

cmake/Codegen.cmake

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ if(INTERN_BUILD_ATEN_OPS)
     if(EXISTING_ARCH_FLAGS MATCHES ".*compute_90.*")
       list(APPEND ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "-gencode;arch=compute_90a,code=sm_90a")
     endif()
-    if(EXISTING_ARCH_FLAGS MATCHES ".*compute_100a.*")
+    if(EXISTING_ARCH_FLAGS MATCHES ".*compute_100.*")
       list(APPEND ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "-gencode;arch=compute_100a,code=sm_100a")
     endif()
   endif()
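Dropping the trailing "a" from the match means any compute_100 variant in the existing arch flags (compute_100 or compute_100a) now triggers the extra "-gencode;arch=compute_100a,code=sm_100a" flag for the rowwise-scaled-MM sources, making the sm_100 (Blackwell) branch behave like the compute_90 branch above it.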

setup.py

Lines changed: 1 addition & 1 deletion
@@ -219,7 +219,7 @@
 #   Builds libtorch.so and its dependencies as a wheel
 #
 # BUILD_PYTHON_ONLY
-#   Builds pytorch as a wheel using libtorch.so from a seperate wheel
+#   Builds pytorch as a wheel using libtorch.so from a separate wheel

 import os
 import sys

test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp

Lines changed: 3 additions & 0 deletions
@@ -363,6 +363,9 @@ class TestDebugInfoWriter : public c10d::DebugInfoWriter {
 };

 TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) {
+  // Note (kwen2501) 03/07/2025
+  // TODO: re-enable
+  GTEST_SKIP() << "Skipping test as the trace write seems unstable.";
   int heartBeatIntervalInSec = 2;
   std::string timeInterval = std::to_string(heartBeatIntervalInSec);
   ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "0", 1) == 0);
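GTEST_SKIP() returns from the test body immediately and reports the test as skipped rather than failed, so the unstable trace-write path below stays in place but is never exercised until the TODO is resolved.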

test/distributed/test_c10d_common.py

Lines changed: 41 additions & 0 deletions
@@ -1559,6 +1559,11 @@ def wait(self, timeout=5.0):


 class DummyProcessGroup(dist.ProcessGroup):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._aborted = False
+        self._shutdown = False
+
     def getBackendName(self):
         return "Dummy"


@@ -1622,6 +1627,12 @@ def recv(self, tensor_list, src, tag=0):

         return DummyWork()

+    def abort(self) -> None:
+        self._aborted = True
+
+    def shutdown(self) -> None:
+        self._shutdown = True
+

 class PythonProcessGroupExtensionTest(MultiProcessTestCase):
     def setUp(self):

@@ -1794,6 +1805,36 @@ def test_send_recv(self):
         # intentionally not calling into `destroy_process_group` as not all
         # user applications would explicitly that.

+    def test_shutdown(self) -> None:
+        dist.Backend.register_backend(
+            "dummy", PythonProcessGroupExtensionTest.create_dummy
+        )
+
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "6789"
+        dist.init_process_group("dummy", rank=self.rank, world_size=self.world_size)
+
+        pg = c10d._get_default_group()
+
+        dist.destroy_process_group()
+
+        self.assertTrue(pg._shutdown)
+
+    def test_abort(self) -> None:
+        dist.Backend.register_backend(
+            "dummy", PythonProcessGroupExtensionTest.create_dummy
+        )
+
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "6789"
+        dist.init_process_group("dummy", rank=self.rank, world_size=self.world_size)
+
+        pg = c10d._get_default_group()
+
+        c10d._abort_process_group()
+
+        self.assertTrue(pg._aborted)
+

 instantiate_parametrized_tests(CommonDistributedDataParallelTest)
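Taken together with the DummyProcessGroup additions above, these tests pin down the new backend contract: dist.destroy_process_group() is expected to reach the backend's shutdown() hook and c10d._abort_process_group() its abort() hook, so pure-Python backends can observe (or override) both teardown paths.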

test/distributed/test_c10d_pypg.py

Lines changed: 6 additions & 0 deletions
@@ -191,6 +191,12 @@ def test_attr_overrides(self):
         pg._set_group_desc("desc")
         self.assertEqual(pg.group_desc, "py:desc")

+    def test_abort_shutdown(self) -> None:
+        # verify this are noops
+        pg = DummyAttrProcessGroup(0, 1)
+        pg.abort()
+        pg.shutdown()
+

 if __name__ == "__main__":
     run_tests()
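This complements the tests in test_c10d_common.py: a Python process group that does not override abort()/shutdown() should inherit safe no-op defaults from dist.ProcessGroup rather than raising.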
