Avoid overwriting COW data in MPS code · pytorch/pytorch@12c6960 · GitHub

Commit 12c6960

Avoid overwriting COW data in MPS code
ghstack-source-id: a1708b8
Pull Request resolved: #150721
1 parent 01930db commit 12c6960
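
The hazard this commit guards against, as a minimal sketch (illustrative only; at::_lazy_clone and getMTLBufferStorage appear in the diff below, but the scenario itself is not from the commit):

#include <ATen/ATen.h>
#include <ATen/native/mps/OperationUtils.h> // internal header; provides getMTLBufferStorage

void cow_overwrite_hazard_sketch() {
  at::Tensor a = at::randn({16}, at::kMPS);
  at::Tensor b = at::_lazy_clone(a); // a and b now share one COW storage
  // getMTLBufferStorage reads storage().data() and does NOT materialize the
  // COW copy, so a Metal kernel writing through this buffer would silently
  // rewrite a as well -- the overwrite this commit avoids.
  id<MTLBuffer> buf = at::native::mps::getMTLBufferStorage(b);
  (void)buf;
}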

File tree: 10 files changed, +302 −10 lines

aten/src/ATen/mps/MPSAllocator.mm
Lines changed: 11 additions & 0 deletions

@@ -853,7 +853,18 @@ void copy_data(void* dest, const void* src, std::size_t count) const final {
   } else if (isSharedBufferCPUPtr(dest)) {
     TORCH_INTERNAL_ASSERT(isSharedBufferCPUPtr(src));
   }
+  // CHECK: Do we need to sync here?
+  auto stream = getDefaultMPSStream();
+  dispatch_sync(stream->queue(), ^() {
+    stream->synchronize(SyncType::COMMIT_AND_WAIT);
+  });
+
   default_copy_data(dest, src, count);
+
+  // CHECK: Do we need to sync here?
+  dispatch_sync(stream->queue(), ^() {
+    stream->synchronize(SyncType::COMMIT_AND_WAIT);
+  });
 }

 void* get_cpu_ptr_from_device_ptr(void* device_ptr) const override {
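
For context, a hedged sketch of how this copy_data path is reached (the accessors are existing ATen API; the exact internal call chain is my reading of the c10 COW machinery):

#include <ATen/ATen.h>

void materialize_sketch() {
  at::Tensor a = at::randn({4}, at::kMPS);
  at::Tensor b = at::_lazy_clone(a); // a and b share a COW storage
  // The first mutable access materializes b: fresh MPS memory is allocated
  // and the allocator's copy_data runs -- now bracketed by the stream syncs
  // above so in-flight GPU work on the shared buffer finishes first.
  float* p = b.mutable_data_ptr<float>();
  (void)p;
}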

aten/src/ATen/mps/MPSGeneratorImpl.mm
Lines changed: 1 addition & 1 deletion

@@ -70,7 +70,7 @@ Generator createMPSGenerator(uint64_t seed_val) {

   auto state_tensor = at::detail::empty_cpu(
       {(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
-  auto rng_state = state_tensor.data_ptr<uint8_t>();
+  auto rng_state = state_tensor.mutable_data_ptr<uint8_t>();
   auto current_seed = this->current_seed();
   auto current_offset = this->get_offset();
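
The one-line change above follows the general COW discipline: reads go through the const accessor, writes through the mutable one, which materializes a private copy first. A minimal sketch of the distinction (both accessors are existing ATen API; the function is illustrative only):

#include <ATen/ATen.h>

void accessor_sketch(at::Tensor& t) {
  const float* r = t.const_data_ptr<float>(); // read-only: never materializes
  float* w = t.mutable_data_ptr<float>();     // write: materializes COW first
  (void)r;
  (void)w;
}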

aten/src/ATen/native/AutogradComposite.cpp
Lines changed: 3 additions & 0 deletions

@@ -171,6 +171,9 @@ Tensor _lazy_clone(Tensor const& self, std::optional<c10::Device> device_opt) {
     if (self.device().type() == c10::kMPS) {
       at::detail::getMPSHooks().deviceSynchronize();
     }
+  } else if (self.device().type() == c10::kMPS) {
+    // CHECK: Do we always need to sync for MPS?
+    at::detail::getMPSHooks().deviceSynchronize();
   }
   return Tensor(std::move(tensor));
 }
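
In user-visible terms, the hunk means lazily cloning an MPS tensor may now synchronize the device before the storages are aliased. A hedged sketch (the single-argument at::_lazy_clone overload is existing API; the device-taking signature comes from the hunk header):

#include <ATen/ATen.h>

void lazy_clone_sync_sketch() {
  at::Tensor a = at::ones({8}, at::kMPS);
  // Kernels may still be writing `a` here; the branch added above calls
  // getMPSHooks().deviceSynchronize() so those writes land before the
  // storage is aliased.
  at::Tensor b = at::_lazy_clone(a); // afterwards both a and b are COW
  (void)b;
}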

aten/src/ATen/native/mps/OperationUtils.h
Lines changed: 48 additions & 0 deletions

@@ -108,9 +108,33 @@ MPSShape* getMPSShape(const TensorBase& t, c10::MemoryFormat memory_format = Mem
 MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);

 static inline id<MTLBuffer> getMTLBufferStorage(const TensorBase& tensor) {
+  // return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().mutable_data());
   return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
 }

+// This class wraps a tensor with an API that can obtain the underlying
+// `id<MTLBuffer>` while preventing COW materialization and attempting to
+// prevent mutations to the data. Unfortunately, there is no way to make the
+// compiler actually prevent mutating the data in the MPS code because Metal
+// APIs operate on `id<MTLBuffer>`, which resolves to `struct objc_object *`, a
+// pointer to non-const data.
+class ConstMTLBufferTensor {
+ public:
+  ConstMTLBufferTensor(const TensorBase& tensor) : _tensor(tensor) {}
+
+  // WARNING: Do not write to the buffer returned by this function.
+  id<MTLBuffer> mtl_buffer_unsafe() const {
+    return __builtin_bit_cast(id<MTLBuffer>, _tensor.storage().data());
+  }
+
+  const TensorBase& tensor() const {
+    return _tensor;
+  }
+
+ private:
+  const TensorBase& _tensor;
+};
+
 class Placeholder {
  public:
   Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {}

@@ -355,6 +379,7 @@ static inline void mtl_setBuffer(encoder_t encoder, const TensorBase& t, unsigne
   if (C10_UNLIKELY(t.device().type() == kCPU)) {
     if constexpr (std::is_same_v<id<MTLComputeCommandEncoder>, encoder_t>) {
       TORCH_CHECK(t.dim() == 0, "Passed CPU tensor to MPS op");
+      // [encoder setBytes:t.storage().mutable_data() length:t.element_size() atIndex:idx];
       [encoder setBytes:t.storage().data() length:t.element_size() atIndex:idx];
     } else {
       TORCH_CHECK(false, "Passed CPU tensor to MPS op");

@@ -364,6 +389,25 @@ static inline void mtl_setBuffer(encoder_t encoder, const TensorBase& t, unsigne
   [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() atIndex:idx];
 }

+template <typename encoder_t,
+          typename = std::enable_if_t<std::is_same_v<id<MTLComputeCommandEncoder>, encoder_t> ||
+                                      std::is_same_v<id<MTLArgumentEncoder>, encoder_t>>>
+static inline void mtl_setBuffer(encoder_t encoder, ConstMTLBufferTensor b, unsigned idx) {
+  const TensorBase& t = b.tensor();
+  if (C10_UNLIKELY(t.device().type() == kCPU)) {
+    if constexpr (std::is_same_v<id<MTLComputeCommandEncoder>, encoder_t>) {
+      TORCH_CHECK(t.dim() == 0, "Passed CPU tensor to MPS op");
+      [encoder setBytes:b.mtl_buffer_unsafe() length:t.element_size() atIndex:idx];
+      // [encoder setBytes:getMTLBufferStorage(t) length:t.element_size() atIndex:idx];
+    } else {
+      TORCH_CHECK(false, "Passed CPU tensor to MPS op");
+    }
+    return;
+  }
+  [encoder setBuffer:b.mtl_buffer_unsafe() offset:t.storage_offset() * t.element_size() atIndex:idx];
+  // [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() atIndex:idx];
+}
+
 // Implementation of setBytes for containers vs trivially copiable types must be separate
 // Containers like `std::array` could have been uploaded directly, but `c10::ArrayRef`,
 // while trivially copiable, includes padding which if copied as Metal shader parameters

@@ -395,6 +439,10 @@ inline void mtl_setArg(id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> val,
   [encoder setBuffer:val offset:0 atIndex:idx];
 }

+inline void mtl_setArg(id<MTLComputeCommandEncoder> encoder, ConstMTLBufferTensor val, unsigned idx) {
+  mtl_setBuffer(encoder, val, idx);
+}
+
 template <>
 inline void mtl_setArg(id<MTLComputeCommandEncoder> encoder, const Tensor& val, unsigned idx) {
   mtl_setBuffer(encoder, val, idx);
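
The intended call pattern, mirrored in the Indexing.mm and View.mm hunks below: wrap read-only inputs in ConstMTLBufferTensor so their buffer is bound without COW materialization, and pass writable outputs as plain tensors. A minimal sketch (the helper function is hypothetical; mtl_setArgs and the wrapper come from this header):

#include <ATen/native/mps/OperationUtils.h>

// Hypothetical helper: bind one read-only input and one writable output.
static void bindKernelArgs(id<MTLComputeCommandEncoder> encoder,
                           const at::Tensor& input,
                           const at::Tensor& output) {
  using at::native::mps::ConstMTLBufferTensor;
  // input: bound without COW materialization; output: plain overload, writable.
  at::native::mps::mtl_setArgs(encoder, ConstMTLBufferTensor(input), output);
}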

aten/src/ATen/native/mps/OperationUtils.mm
Lines changed: 2 additions & 1 deletion

@@ -387,7 +387,8 @@ void printTensorNDArray(const TensorBase& t) {
   auto selfDType = getMPSDataType(t.scalar_type());

   // Initialize data
-  id<MTLBuffer> selfBuf = getMTLBufferStorage(t);
+  id<MTLBuffer> selfBuf = ConstMTLBufferTensor(t).mtl_buffer_unsafe();
+  // id<MTLBuffer> selfBuf = getMTLBufferStorage(t);
   MPSGraphTensorData* tdata = [[[MPSGraphTensorData alloc] initWithMTLBuffer:selfBuf shape:selfShape
                                                                     dataType:selfDType] autorelease];
 C10_CLANG_DIAGNOSTIC_PUSH()

aten/src/ATen/native/mps/operations/Copy.mm
Lines changed: 6 additions & 3 deletions

@@ -112,7 +112,8 @@ static void copy_cast_mps(at::Tensor& dst,
   MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
   NSUInteger alignedLength = 0;

-  const void* host_dst = static_cast<const char*>(dst.storage().data()) + dst.storage_offset() * dst.itemsize();
+  // void* host_dst = static_cast<char*>(dst.storage().mutable_data()) + dst.storage_offset() * dst.itemsize();
+  void* host_dst = static_cast<char*>(dst.storage().data()) + dst.storage_offset() * dst.itemsize();
   void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)dst_tensor_nbytes, &alignedLength);
   NSUInteger destOffset = (uintptr_t(host_dst) - uintptr_t(alignedPtr));
   // 4 bytes alignment required on macos for blits.

@@ -258,7 +259,8 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
     src = src_;
   }
   id<MTLBuffer> destBuffer = getMTLBufferStorage(dst_);
-  id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
+  id<MTLBuffer> sourceBuffer = ConstMTLBufferTensor(src).mtl_buffer_unsafe();
+  /// id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);

   // Scatter to `dst` if the memory is not contiguous
   // If the memory is not contiguous, it means that the tensor has strides and we would not be

@@ -295,7 +297,8 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   } else if (dst_byte_offset) {
     auto maybeCastedSource =
         at::empty(dst_.sizes(), dst_.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
-    auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource);
+    auto maybeCastedSourceBuffer = ConstMTLBufferTensor(maybeCastedSource).mtl_buffer_unsafe();
+    // auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource);
     copy_cast_mps(maybeCastedSource, src, maybeCastedSourceBuffer, sourceBuffer);

     uint64_t profile_id = getMPSProfiler().beginProfileCopy(

aten/src/ATen/native/mps/operations/Indexing.mm
Lines changed: 9 additions & 2 deletions

@@ -162,8 +162,15 @@ static bool dispatchIndexKernel(TensorIteratorBase& iter,
       getMPSProfiler().beginProfileKernel(indexSelectPSO, indexFunction, {inputTensor});

       [computeEncoder setComputePipelineState:indexSelectPSO];
-      mtl_setArgs(
-          computeEncoder, indexAB, index_size, index_stride, kernelDataOffsets, inputTensor, outputTensor, num_indices);
+      mtl_setArgs(computeEncoder,
+                  indexAB,
+                  index_size,
+                  index_stride,
+                  kernelDataOffsets,
+                  ConstMTLBufferTensor(inputTensor),
+                  // inputTensor,
+                  outputTensor,
+                  num_indices);
       MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
       if (serial_index_put) {
         mtl_setBytes(computeEncoder, numIters, 7);

aten/src/ATen/native/mps/operations/View.mm
Lines changed: 9 additions & 2 deletions

@@ -146,7 +146,13 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) {
   }

   [computeEncoder setComputePipelineState:gatherPSO];
-  mtl_setArgs(computeEncoder, src, dst.has_storage() ? dst : output, src_sizes, src_strides, numThreads);
+  mtl_setArgs(computeEncoder,
+              ConstMTLBufferTensor(src),
+              // src,
+              dst.has_storage() ? dst : output,
+              src_sizes,
+              src_strides,
+              numThreads);
   if (src.dim() > 4) {
     mtl_setBytes<int32_t>(computeEncoder, src.dim(), 5);
   }

@@ -192,7 +198,8 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) {
   }

   [computeEncoder setComputePipelineState:scatterPSO];
-  mtl_setArgs(computeEncoder, src, output, output_sizes, output_strides, numThreads);
+  mtl_setArgs(computeEncoder, ConstMTLBufferTensor(src), output, output_sizes, output_strides, numThreads);
+  // mtl_setArgs(computeEncoder, src, output, output_sizes, output_strides, numThreads);
   if (output.dim() > 4) {
     mtl_setBytes<int32_t>(computeEncoder, output.dim(), 5);
   }

test/test_lazy_clone.py
Lines changed: 31 additions & 0 deletions

@@ -208,6 +208,37 @@ def test_interdevice_read(self, device, case):
         self.assertTrue(torch._C._is_cow_tensor(b))
         self.assertEqual(torch._C._data_address_resolve_unified(b), orig_data_ptr)

+    def test_clone_after_lazy_clone(self, device):
+        a = torch.randn(10, device=device)
+        orig_data_ptr = torch._C._data_address_resolve_unified(a)
+        b = torch._lazy_clone(a)
+
+        self.assertTrue(torch._C._is_cow_tensor(a))
+        self.assertTrue(torch._C._is_cow_tensor(b))
+        self.assertEqual(torch._C._data_address_resolve_unified(a), orig_data_ptr)
+        self.assertEqual(torch._C._data_address_resolve_unified(b), orig_data_ptr)
+
+        c = b.clone()
+
+        self.assertTrue(torch._C._is_cow_tensor(a))
+        self.assertTrue(torch._C._is_cow_tensor(b))
+        self.assertFalse(torch._C._is_cow_tensor(c))
+        self.assertEqual(torch._C._data_address_resolve_unified(a), orig_data_ptr)
+        self.assertEqual(torch._C._data_address_resolve_unified(b), orig_data_ptr)
+
+        self.assertEqual(b.clone(), c)
+        self.assertEqual(a.clone(), c)
+
+        self.assertTrue(torch._C._is_cow_tensor(a))
+        self.assertTrue(torch._C._is_cow_tensor(b))
+        self.assertFalse(torch._C._is_cow_tensor(c))
+        self.assertEqual(torch._C._data_address_resolve_unified(a), orig_data_ptr)
+        self.assertEqual(torch._C._data_address_resolve_unified(b), orig_data_ptr)
+
+        self.assertEqual(a, b)
+        self.assertEqual(a, c)
+        self.assertEqual(b, c)
+

 instantiate_device_type_tests(TestLazyCloneDeviceType, globals(), allow_mps=True)
