8000 Avoid overwriting COW data in MPS code · pytorch/pytorch@c1e0870 · GitHub

Commit c1e0870

Avoid overwriting COW data in MPS code
ghstack-source-id: 671272f
Pull Request resolved: #150721
1 parent a8604ea commit c1e0870
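
What changed, in brief: getMTLBufferStorage previously bit-cast storage().data() — the non-materializing copy-on-write (COW) accessor — into an id<MTLBuffer> that Metal kernels could write through, silently overwriting bytes still shared with other tensors. It now uses storage().mutable_data(), which materializes COW storage before any write can land, and a new ConstMTLBufferTensor wrapper lets genuinely read-only call sites keep the cheap non-materializing access.

For readers unfamiliar with PyTorch COW storage: torch._lazy_clone creates a tensor that shares its data buffer with the source until one of them is written to. Reading must never write through the shared buffer; only a write should trigger materialization. A minimal sketch of that invariant, using the same private hooks the new test below relies on (assumes an MPS-enabled build; these are not public APIs):

import torch

orig = torch.rand(4, device="mps")
snapshot = orig.detach().clone()   # eager copy of the original bytes

cow = torch._lazy_clone(orig)      # shares the buffer; both become COW
assert torch._C._is_cow_tensor(orig) and torch._C._is_cow_tensor(cow)

_ = cow * 2                        # read-only use of the COW input

# Materializing `cow` would be acceptable, but the bytes that `orig`
# sees must be unchanged. Lazy-clone before comparing so the comparison
# itself cannot materialize `orig`.
assert torch.allclose(torch._lazy_clone(orig), snapshot, rtol=0, atol=0)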

File tree

8 files changed, +257 -12 lines changed


aten/src/ATen/mps/MPSGeneratorImpl.mm

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ Generator createMPSGenerator(uint64_t seed_val) {
 
   auto state_tensor = at::detail::empty_cpu(
       {(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
-  auto rng_state = state_tensor.data_ptr<uint8_t>();
+  auto rng_state = state_tensor.mutable_data_ptr<uint8_t>();
   auto current_seed = this->current_seed();
   auto current_offset = this->get_offset();

aten/src/ATen/native/AutogradComposite.cpp

Lines changed: 3 additions & 0 deletions
@@ -171,6 +171,9 @@ Tensor _lazy_clone(Tensor const& self, std::optional<c10::Device> device_opt) {
     if (self.device().type() == c10::kMPS) {
       at::detail::getMPSHooks().deviceSynchronize();
     }
+  } else if (self.device().type() == c10::kMPS) {
+    // CHECK: Do we always need to sync for MPS?
+    at::detail::getMPSHooks().deviceSynchronize();
   }
   return Tensor(std::move(tensor));
 }
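
The new else-if branch makes a same-device lazy clone of an MPS tensor synchronize as well, so the clone cannot share a buffer whose contents an in-flight kernel is still producing (the in-code CHECK comment records that the necessity of always syncing is an open question). A rough sketch of the user-visible contract (assumes an MPS-enabled build):

import torch

t = torch.ones(1024, device="mps")
t += 1                     # MPS kernels run asynchronously
c = torch._lazy_clone(t)   # with this hunk, _lazy_clone waits for the
                           # device before sharing t's buffer
assert torch.equal(c.cpu(), torch.full((1024,), 2.0))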

aten/src/ATen/native/mps/OperationUtils.h

Lines changed: 48 additions & 2 deletions
@@ -108,9 +108,32 @@ MPSShape* getMPSShape(const TensorBase& t, c10::MemoryFormat memory_format = Mem
 MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
 
 static inline id<MTLBuffer> getMTLBufferStorage(const TensorBase& tensor) {
-  return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
+  return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().mutable_data());
 }
 
+// This class wraps a tensor with an API that can obtain the underlying
+// `id<MTLBuffer>` while preventing COW materialization and attempting to
+// prevent mutations to the data. Unfortunately, there is no way to make the
+// compiler actually prevent mutating the data in the MPS code because Metal
+// APIs operate on `id<MTLBuffer>`, which resolves to `struct objc_object *`, a
+// pointer to non-const data.
+class ConstMTLBufferTensor {
+ public:
+  ConstMTLBufferTensor(const TensorBase& tensor) : _tensor(tensor) {}
+
+  // WARNING: Do not write to the buffer returned by this function.
+  id<MTLBuffer> mtl_buffer_unsafe() const {
+    return __builtin_bit_cast(id<MTLBuffer>, _tensor.storage().data());
+  }
+
+  const TensorBase& tensor() const {
+    return _tensor;
+  }
+
+ private:
+  const TensorBase& _tensor;
+};
+
 class Placeholder {
  public:
   Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {}

@@ -355,7 +378,7 @@ static inline void mtl_setBuffer(encoder_t encoder, const TensorBase& t, unsigne
   if (C10_UNLIKELY(t.device().type() == kCPU)) {
     if constexpr (std::is_same_v<id<MTLComputeCommandEncoder>, encoder_t>) {
       TORCH_CHECK(t.dim() == 0, "Passed CPU tensor to MPS op");
-      [encoder setBytes:t.storage().data() length:t.element_size() atIndex:idx];
+      [encoder setBytes:t.storage().mutable_data() length:t.element_size() atIndex:idx];
     } else {
       TORCH_CHECK(false, "Passed CPU tensor to MPS op");
     }

@@ -364,6 +387,25 @@ static inline void mtl_setBuffer(encoder_t encoder, const TensorBase& t, unsigne
   [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() atIndex:idx];
 }
 
+template <typename encoder_t,
+          typename = std::enable_if_t<std::is_same_v<id<MTLComputeCommandEncoder>, encoder_t> ||
+                                      std::is_same_v<id<MTLArgumentEncoder>, encoder_t>>>
+static inline void mtl_setBuffer(encoder_t encoder, ConstMTLBufferTensor b, unsigned idx) {
+  const TensorBase& t = b.tensor();
+  if (C10_UNLIKELY(t.device().type() == kCPU)) {
+    if constexpr (std::is_same_v<id<MTLComputeCommandEncoder>, encoder_t>) {
+      TORCH_CHECK(t.dim() == 0, "Passed CPU tensor to MPS op");
+      [encoder setBytes:b.mtl_buffer_unsafe() length:t.element_size() atIndex:idx];
+      // [encoder setBytes:getMTLBufferStorage(t) length:t.element_size() atIndex:idx];
+    } else {
+      TORCH_CHECK(false, "Passed CPU tensor to MPS op");
+    }
+    return;
+  }
+  [encoder setBuffer:b.mtl_buffer_unsafe() offset:t.storage_offset() * t.element_size() atIndex:idx];
+  // [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() atIndex:idx];
+}
+
 // Implementation of setBytes for containers vs trivially copiable types must be separate
 // Containers like `std::array` could have been uploaded directly, but `c10::ArrayRef`,
 // while trivially copiable, includes padding which if copied as Metal shader parameters

@@ -395,6 +437,10 @@ inline void mtl_setArg(id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> val,
   [encoder setBuffer:val offset:0 atIndex:idx];
 }
 
+inline void mtl_setArg(id<MTLComputeCommandEncoder> encoder, ConstMTLBufferTensor val, unsigned idx) {
+  mtl_setBuffer(encoder, val, idx);
+}
+
 template <>
 inline void mtl_setArg(id<MTLComputeCommandEncoder> encoder, const Tensor& val, unsigned idx) {
   mtl_setBuffer(encoder, val, idx);
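
The intended division of labor after this change: writers keep calling getMTLBufferStorage, which now materializes COW storage via storage().mutable_data() before the GPU can touch it, while read-only inputs are wrapped in ConstMTLBufferTensor and keep sharing their buffer. The Python-visible effect might look like the following (a hedged illustration — whether a given op preserves COW-ness depends on the kernel path it takes; the new TestCOWInputs test below is the authoritative check):

import torch

src = torch.arange(6.0, device="mps")
cow = torch._lazy_clone(src)

# index_select only reads its input; with this commit the MPS index
# kernel binds it as ConstMTLBufferTensor(inputTensor) rather than the
# materializing getMTLBufferStorage(inputTensor).
out = cow.index_select(0, torch.tensor([0, 2], device="mps"))

assert torch._C._is_cow_tensor(cow)
assert torch._C._is_cow_tensor(src)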

aten/src/ATen/native/mps/OperationUtils.mm

Lines changed: 2 additions & 1 deletion
@@ -388,7 +388,8 @@ void printTensorNDArray(const TensorBase& t) {
   auto selfDType = getMPSDataType(t.scalar_type());
 
   // Initialize data
-  id<MTLBuffer> selfBuf = getMTLBufferStorage(t);
+  id<MTLBuffer> selfBuf = ConstMTLBufferTensor(t).mtl_buffer_unsafe();
+  // id<MTLBuffer> selfBuf = getMTLBufferStorage(t);
   MPSGraphTensorData* tdata = [[[MPSGraphTensorData alloc] initWithMTLBuffer:selfBuf shape:selfShape
                                                                     dataType:selfDType] autorelease];
   C10_CLANG_DIAGNOSTIC_PUSH()

aten/src/ATen/native/mps/operations/Copy.mm

Lines changed: 5 additions & 3 deletions
@@ -112,7 +112,7 @@ static void copy_cast_mps(at::Tensor& dst,
   MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
   NSUInteger alignedLength = 0;
 
-  const void* host_dst = static_cast<const char*>(dst.storage().data()) + dst.storage_offset() * dst.itemsize();
+  void* host_dst = static_cast<char*>(dst.storage().mutable_data()) + dst.storage_offset() * dst.itemsize();
   void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)dst_tensor_nbytes, &alignedLength);
   NSUInteger destOffset = (uintptr_t(host_dst) - uintptr_t(alignedPtr));
   // 4 bytes alignment required on macos for blits.

@@ -258,7 +258,8 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
     src = src_;
   }
   id<MTLBuffer> destBuffer = getMTLBufferStorage(dst_);
-  id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
+  id<MTLBuffer> sourceBuffer = ConstMTLBufferTensor(src).mtl_buffer_unsafe();
+  /// id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
 
   // Scatter to `dst` if the memory is not contiguous
   // If the memory is not contiguous, it means that the tensor has strides and we would not be

@@ -295,7 +296,8 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   } else if (dst_byte_offset) {
     auto maybeCastedSource =
         at::empty(dst_.sizes(), dst_.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
-    auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource);
+    auto maybeCastedSourceBuffer = ConstMTLBufferTensor(maybeCastedSource).mtl_buffer_unsafe();
+    // auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource);
     copy_cast_mps(maybeCastedSource, src, maybeCastedSourceBuffer, sourceBuffer);
 
     uint64_t profile_id = getMPSProfiler().beginProfileCopy(
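
Note the asymmetry above: the destination buffer still comes from the materializing getMTLBufferStorage(dst_), while the source side switches to the non-materializing ConstMTLBufferTensor, matching the direction of data flow in a blit. A hedged Python-level consequence (which path a copy takes depends on dtype, contiguity, and offsets):

import torch

src = torch.rand(16, device="mps")
cow_src = torch._lazy_clone(src)
dst = torch.empty(16, device="mps")

dst.copy_(cow_src)   # the blit only reads the source buffer

assert torch._C._is_cow_tensor(cow_src)   # no materialization needed
assert torch._C._is_cow_tensor(src)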

aten/src/ATen/native/mps/operations/Indexing.mm

Lines changed: 9 additions & 2 deletions
@@ -162,8 +162,15 @@ static bool dispatchIndexKernel(TensorIteratorBase& iter,
     getMPSProfiler().beginProfileKernel(indexSelectPSO, indexFunction, {inputTensor});
 
     [computeEncoder setComputePipelineState:indexSelectPSO];
-    mtl_setArgs(
-        computeEncoder, indexAB, index_size, index_stride, kernelDataOffsets, inputTensor, outputTensor, num_indices);
+    mtl_setArgs(computeEncoder,
+                indexAB,
+                index_size,
+                index_stride,
+                kernelDataOffsets,
+                ConstMTLBufferTensor(inputTensor),
+                // inputTensor,
+                outputTensor,
+                num_indices);
     MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
     if (serial_index_put) {
       mtl_setBytes(computeEncoder, numIters, 7);

aten/src/ATen/native/mps/operations/View.mm

Lines changed: 9 additions & 2 deletions
@@ -146,7 +146,13 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) {
   }
 
   [computeEncoder setComputePipelineState:gatherPSO];
-  mtl_setArgs(computeEncoder, src, dst.has_storage() ? dst : output, src_sizes, src_strides, numThreads);
+  mtl_setArgs(computeEncoder,
+              ConstMTLBufferTensor(src),
+              // src,
+              dst.has_storage() ? dst : output,
+              src_sizes,
+              src_strides,
+              numThreads);
   if (src.dim() > 4) {
     mtl_setBytes<int32_t>(computeEncoder, src.dim(), 5);
   }

@@ -192,7 +198,8 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) {
   }
 
   [computeEncoder setComputePipelineState:scatterPSO];
-  mtl_setArgs(computeEncoder, src, output, output_sizes, output_strides, numThreads);
+  mtl_setArgs(computeEncoder, ConstMTLBufferTensor(src), output, output_sizes, output_strides, numThreads);
+  // mtl_setArgs(computeEncoder, src, output, output_sizes, output_strides, numThreads);
   if (output.dim() > 4) {
     mtl_setBytes<int32_t>(computeEncoder, output.dim(), 5);
   }

test/test_mps.py

Lines changed: 180 additions & 1 deletion
@@ -20,7 +20,7 @@
 from collections import defaultdict
 from torch import inf
 from torch.nn import Buffer, Parameter
-from torch.testing._internal import opinfo
+from torch.testing._internal import composite_compliance, opinfo
 from torch.testing._internal.common_utils import \
     (gradcheck, gradgradcheck, parametrize, run_tests, TestCase, download_file, MACOS_VERSION, IS_CI,
      NoTest, skipIfSlowGradcheckEnv, suppress_warnings, serialTest, instantiate_parametrized_tests)

@@ -48,6 +48,7 @@
 import operator
 
 test_consistency_op_db = copy.deepcopy(op_db)
+test_cow_inputs_op_db = copy.deepcopy(op_db)
 test_error_inputs_op_db = copy.deepcopy(op_db)
 
 # Add bicubic2d_aa to test_consistency_op_db

@@ -12049,6 +12050,183 @@ def test_fmax_mixed_dtypes(self, device):
         self.assertEqual(op(x, y[0]), op(x.to("mps"), y.to("mps")[0]).cpu())
 
 
+class TestCOWInputs(TestCase):
+    # Tests that MPS ops do not mutate the underlying data of COW inputs.
+    # Materialization is allowed, but the original data buffer should never be
+    # written to.
+    # TODO: When we enable the `test_cow_input` test from `test_ops.py` for MPS,
+    # we can remove this test.
+    @ops(test_cow_inputs_op_db, allowed_dtypes=(torch.float,))
+    def test_cow_input_not_mutated(self, device, dtype, op):
+        samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd)
+
+        def is_strided_tensor(arg):
+            return torch.is_tensor(arg) and arg.layout == torch.strided
+
+        def check_cow_input(
+            arg_copy,
+            arg_raw,
+            idx_or_kw,
+            backward_or_forward="forward",
+        ):
+            arg_name = (
+                f"Argument {idx_or_kw}"
+                if isinstance(idx_or_kw, int)
+                else f"Keyword argument '{idx_or_kw}'"
+            ) + f" during {backward_or_forward} call"
+
+            if is_strided_tensor(arg_raw):
+                self.assertTrue(
+                    torch._C._is_cow_tensor(arg_raw),
+                    msg=(
+                        f"{arg_name} raw input should remain COW, but it "
+                        "unexpectedly materialized."
+                    ),
+                )
+                # TODO: Make `torch.allclose` avoid materializing. We have to
+                # lazy clone arg_raw here before the comparison to prevent it
+                # from materializing and messing up subsequent checks.
+                arg_lazy_cloned = torch._lazy_clone(arg_raw)
+                print('------------------------------')
+                print('original value:')
+                print(arg_copy)
+                print('value after op:')
+                print(arg_lazy_cloned)
+                print('------------------------------')
+                self.assertTrue(
+                    torch.allclose(
+                        arg_lazy_cloned, arg_copy, rtol=0, atol=0, equal_nan=True
+                    ),
+                    msg=(
+                        f"{arg_name} COW input data was mutated."
+                    ),
+                )
+
+        for sample in samples:
+            args_raw = [sample.input] + list(sample.args)
+            kwargs_raw = sample.kwargs
+
+            # Eagerly cloned inputs used to keep track of the original values of
+            # inputs
+            args_copy = []
+            kwargs_copy = {}
+
+            # The lazy cloned inputs to be passed to the op.
+            args_lazy_cloned = []
+            kwargs_lazy_cloned = {}
+
+            # In order to keep the original args/kwargs_raw COW in cases where
+            # the op materializes the input, we need to start with three sets of
+            # COW inputs.
+            args_lazy_cloned_2 = []
+            kwargs_lazy_cloned_2 = {}
+
+            leaf_tensors = composite_compliance.gather_leaf_tensors(args_raw, kwargs_raw)
+
+            # Convert strided tensor inputs to COW tensors and make copies of
+            # all inputs
+            for idx, arg in enumerate(args_raw):
+                if is_strided_tensor(arg):
+                    args_copy.append(arg.detach().clone())
+                    args_lazy_cloned.append(torch._lazy_clone(arg))
+                    args_lazy_cloned_2.append(torch._lazy_clone(arg))
+                else:
+                    if torch.is_tensor(arg):
+                        args_copy.append(arg.detach().clone())
+                    else:
+                        args_copy.append(copy.deepcopy(arg))
+                    args_lazy_cloned.append(arg)
+                    args_lazy_cloned_2.append(arg)
+
+            for kw, arg in kwargs_raw.items():
+                if is_strided_tensor(arg):
+                    kwargs_copy[kw] = arg.detach().clone()
+                    kwargs_lazy_cloned[kw] = torch._lazy_clone(arg)
+                    kwargs_lazy_cloned_2[kw] = torch._lazy_clone(arg)
+                else:
+                    if torch.is_tensor(arg):
+                        kwargs_copy[kw] = arg.detach().clone()
+                    else:
+                        kwargs_copy[kw] = copy.deepcopy(arg)
+                    kwargs_lazy_cloned[kw] = arg
+                    kwargs_lazy_cloned_2[kw] = arg
+
+            # Call forward op
+            try:
+                results_raw = op.get_op()(*args_lazy_cloned, **kwargs_lazy_cloned)
+            except NotImplementedError:
+                raise unittest.SkipTest("Op not implemented") from None
+
+            # Check that COW inputs remain COW after the forward op is executed
+            for idx, arg in enumerate(args_lazy_cloned):
+                check_cow_input(args_copy[idx], args_raw[idx], idx)
+
+            for kw, arg in kwargs_lazy_cloned.items():
+                check_cow_input(kwargs_copy[kw], kwargs_raw[kw], kw)
+
+            # Call backward op if it is supported. This part of the test is
+            # based on `composite_compliance.check_backward_formula`
+            if (
+                op.supports_autograd
+                and len(leaf_tensors) > 0
+                and not op.skip_cow_input_backward
+            ):
+                if sample.output_process_fn_grad is not None:
+                    results_raw = sample.output_process_fn_grad(results_raw)
+
+                leaf_results = pytree.tree_leaves(results_raw)
+                results = [
+                    r
+                    for r in leaf_results
+                    if isinstance(r, torch.Tensor) and r.requires_grad
+                ]
+
+                all_results_strided = all(
+                    is_strided_tensor(result) for result in results
+                )
+
+                # Only test backward if the results are strided tensors
+                if all_results_strided:
+                    output_grads_raw = [
+                        torch.ones(r.shape, device=r.device, dtype=r.dtype)
+                        for r in results
+                    ]
+                    output_grads_copy = []
+                    output_grads_lazy_cloned = []
+                    output_grads_lazy_cloned_2 = []
+
+                    # Convert output grads to COW tensors and make copies
+                    for output_grad in output_grads_raw:
+                        output_grads_copy.append(output_grad.detach().clone())
+                        output_grads_lazy_cloned.append(torch._lazy_clone(output_grad))
+                        output_grads_lazy_cloned_2.append(torch._lazy_clone(output_grad))
+
+                    torch.autograd.grad(
+                        results,
+                        leaf_tensors,
+                        output_grads_lazy_cloned,
+                        allow_unused=True,
+                        retain_graph=True,
+                    )
+
+                    # Check that COW inputs remain COW after the backward op is
+                    # executed
+                    for idx, arg in enumerate(args_lazy_cloned):
+                        check_cow_input(
+                            args_copy[idx],
+                            args_raw[idx],
+                            idx,
+                            backward_or_forward="backward",
+                        )
+
+                    # Check that COW output grads remain COW after the backward
+                    # op is executed
+                    for idx, output_grad in enumerate(output_grads_lazy_cloned):
+                        check_cow_input(
+                            output_grads_copy[idx],
+                            output_grads_raw[idx],
+                            f"output grad {idx}",
+                            backward_or_forward="backward",
+                        )
+
 
 class TestErrorInputs(TestCase):
     _ignore_not_implemented_error = True

@@ -12342,6 +12520,7 @@ def test_metal_capture(self):
 instantiate_device_type_tests(TestErrorInputs, globals(), allow_mps=True, only_for="mps")
 instantiate_device_type_tests(TestCommon, globals(), allow_mps=True, only_for="mps")
 instantiate_device_type_tests(TestLinalgMPS, globals(), allow_mps=True, only_for="mps")
+instantiate_device_type_tests(TestCOWInputs, globals(), allow_mps=True, only_for="mps")
 instantiate_parametrized_tests(TestLogical)
 instantiate_parametrized_tests(TestMPS)
 instantiate_parametrized_tests(TestSDPA)
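
Distilled, each sample in TestCOWInputs runs the pattern below (a simplified sketch; the real test also covers kwargs, non-tensor args, and the backward pass, and keeps extra lazy clones so the raw inputs stay COW even when the op materializes its own copy):

import torch

def check_no_cow_mutation(op, arg):
    snapshot = arg.detach().clone()   # eager copy of the original bytes
    cow_in = torch._lazy_clone(arg)   # COW handle passed to the op

    op(cow_in)                        # the op may materialize cow_in...

    # ...but the raw input must still be COW (its buffer was never
    # claimed for writing)...
    assert torch._C._is_cow_tensor(arg)
    # ...and its bytes must be unchanged. Lazy-clone before comparing so
    # allclose itself cannot materialize `arg`.
    assert torch.allclose(
        torch._lazy_clone(arg), snapshot, rtol=0, atol=0, equal_nan=True
    )

check_no_cow_mutation(torch.sin, torch.rand(8, device="mps"))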
