pytorch
diff --git a/‎aten/src/ATen/native/AutogradComposite.cpp
Lines changed: 11 additions & 3 deletions b/‎aten/src/ATen/native/AutogradComposite.cpp
Lines changed: 11 additions & 3 deletions
diff --git a/‎aten/src/ATen/native/TensorConversions.cpp
Lines changed: 26 additions & 0 deletions b/‎aten/src/ATen/native/TensorConversions.cpp
Lines changed: 26 additions & 0 deletions
diff --git a/‎aten/src/ATen/native/native_functions.yaml
Lines changed: 3 additions & 2 deletions b/‎aten/src/ATen/native/native_functions.yaml
Lines changed: 3 additions & 2 deletions
diff --git a/‎c10/core/DispatchKey.h
Lines changed: 1 addition & 1 deletion b/‎c10/core/DispatchKey.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎c10/core/impl/COW.cpp
Lines changed: 50 additions & 15 deletions b/‎c10/core/impl/COW.cpp
Lines changed: 50 additions & 15 deletions
diff --git a/‎c10/core/impl/COW.h
Lines changed: 7 additions & 1 deletion b/‎c10/core/impl/COW.h
Lines changed: 7 additions & 1 deletion
diff --git a/‎c10/core/impl/COWDeleter.cpp
Lines changed: 11 additions & 2 deletions b/‎c10/core/impl/COWDeleter.cpp
Lines changed: 11 additions & 2 deletions
diff --git a/‎c10/core/impl/COWDeleter.h
Lines changed: 9 additions & 1 deletion b/‎c10/core/impl/COWDeleter.h
Lines changed: 9 additions & 1 deletion
diff --git a/‎test/test_torch.py
Lines changed: 47 additions & 0 deletions b/‎test/test_torch.py
Lines changed: 47 additions & 0 deletions
diff --git a/‎tools/autograd/derivatives.yaml
Lines changed: 1 addition & 1 deletion b/‎tools/autograd/derivatives.yaml
Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,7 @@
 #include <ATen/core/Tensor.h>
 #include <c10/util/SmallBuffer.h>
 #include <c10/core/impl/COW.h>
+#include <c10/core/DispatchKey.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -91,14 +92,21 @@ bool _has_same_storage_numel(const at::Tensor& base, const at::Tensor& other) {
   return base.storage().sym_nbytes() / base.itemsize() == other.storage().sym_nbytes() / other.itemsize();
 }
 
-Tensor _lazy_clone(Tensor const& self) {
+Tensor _lazy_clone(Tensor const& self, optional<c10::Device> device_opt) {
   c10::StorageImpl* self_storage = self.storage().unsafeGetStorageImpl();
   c10::intrusive_ptr<c10::StorageImpl> storage =
-    c10::impl::cow::lazy_clone_storage(*self_storage);
+    c10::impl::cow::lazy_clone_storage(*self_storage, device_opt);
   TORCH_CHECK(storage != nullptr);
+  c10::DispatchKeySet key_set = self.key_set();
+  // If the target device differs, then we must change the key set
+  if (device_opt.has_value() && device_opt.value().type() != self.device().type()) {
+    c10::BackendComponent old_backend = c10::toBackendComponent(self.device().type());
+    c10::BackendComponent new_backend = c10::toBackendComponent(device_opt.value().type());
+    key_set = key_set.remove_backend(old_backend) | c10::DispatchKeySet(new_backend);
+  }
   auto tensor = c10::make_intrusive<c10::TensorImpl>(
       c10::Storage(std::move(storage)),
-      self.key_set(),
+      key_set,
       self.dtype());
   tensor->set_sizes_and_strides(self.sym_sizes(),
                                 self.sym_strides(),
 
@@ -17,6 +17,7 @@
 #include <ATen/ops/_convert_indices_from_coo_to_csr_native.h>
 #include <ATen/ops/_convert_indices_from_csr_to_coo.h>
 #include <ATen/ops/_convert_indices_from_csr_to_coo_native.h>
+#include <ATen/ops/_lazy_clone.h>
 #include <ATen/ops/_sparse_bsc_tensor_unsafe_native.h>
 #include <ATen/ops/_sparse_bsr_tensor_unsafe_native.h>
 #include <ATen/ops/_sparse_compressed_tensor_unsafe_native.h>
@@ -422,6 +423,25 @@ bool to_will_alias(
        self.suggest_memory_format() == memory_format);
 }
 
+// Returns true when a conversion request changes the device and nothing else.
+//
+// `device` must be set and differ from `self.device()`. Each of `dtype`,
+// `layout` and `pin_memory` must be either unset or already equal to the
+// corresponding property of `self`. The memory format must be unset,
+// `MemoryFormat::Preserve`, or equal to `self.suggest_memory_format()`.
+bool _only_device_differs(
+    const Tensor& self,
+    std::optional<ScalarType> dtype,
+    std::optional<Layout> layout,
+    std::optional<Device> device,
+    std::optional<bool> pin_memory,
+    std::optional<c10::MemoryFormat> optional_memory_format) {
+  bool device_differs = device.has_value() && device.value() != self.device();
+  bool dtype_differs = dtype.has_value() && dtype.value() != self.scalar_type();
+  bool layout_differs = layout.has_value() && layout.value() != self.layout();
+  bool pin_memory_differs =
+      pin_memory.has_value() && pin_memory.value() != self.is_pinned();
+  // An unset memory format is treated the same as an explicit `Preserve`.
+  auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
+  bool memory_format_differs = memory_format != MemoryFormat::Preserve &&
+      memory_format != self.suggest_memory_format();
+  return device_differs && !dtype_differs && !layout_differs &&
+      !pin_memory_differs && !memory_format_differs;
+}
+
 static inline Tensor to_impl(
     const Tensor& self,
     std::optional<ScalarType> dtype,
@@ -436,6 +456,12 @@ static inline Tensor to_impl(
           self, dtype, layout, device, copy, optional_memory_format)) {
     return self;
   }
+  // TODO: after I prove that this works, I should only allow it for CPU-MPS,
+  // and we can enable others later if needed.
+  // if (_only_device_differs(self, dtype, layout, device, pin_memory,
+  // optional_memory_format)) {
+  //  return at::_lazy_clone(self, device);
+  //}
   return at::_to_copy(
       self,
       dtype,
 
@@ -1250,9 +1250,10 @@
     CompositeExplicitAutograd: copysign_out
   tags: pointwise
 
-- func: _lazy_clone(Tensor self) -> Tensor
+- func: _lazy_clone(Tensor self, *, Device? device=None) -> Tensor
   # Like clone, but the copy takes place lazily, only if either the
-  # input or the output are written.
+  # input or the output are written. If `device` is given, the output
+  # will be copied to the specified device when the write occurs.
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: _lazy_clone
 
@@ -679,7 +679,7 @@ constexpr DispatchKey toFunctionalityKey(DispatchKey k) {
   }
 }
 
-BackendComponent toBackendComponent(DeviceType device_type);
+C10_API BackendComponent toBackendComponent(DeviceType device_type);
 
 // Given (DispatchKey::Dense, BackendComponent::CUDABit), returns
 // DispatchKey::CUDA.
 
@@ -4,6 +4,8 @@
 #include <c10/core/StorageImpl.h>
 #include <c10/core/alignment.h>
 #include <c10/core/impl/COWDeleter.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAFunctions.h>
 #include <c10/util/Exception.h>
 #include <c10/util/ParallelGuard.h>
 #include <c10/util/UniqueVoidPtr.h>
@@ -48,7 +50,9 @@ bool is_cow_data_ptr(const c10::DataPtr& data_ptr) {
   return (void*)data_ptr.get_deleter() == (void*)&cow::cow_deleter;
 }
 
-c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {
+c10::intrusive_ptr<StorageImpl> lazy_clone_storage(
+    StorageImpl& storage,
+    c10::optional<c10::Device> device_opt) {
   const at::DataPtr& data_ptr = storage.data_ptr();
 
   // There are three possible circumstances:
@@ -76,38 +80,66 @@ c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {
   //
   //    No locking is required in this case.
 
-  std::optional<DataPtr> new_data_ptr; // must be set below
+  std::optional<DataPtr> new_data_ptr_opt; // must be set below
 
   if (has_simple_data_ptr(storage)) {
     // Case 1) We have a simple data pointer: wrap it.
     std::unique_ptr<void, DeleterFnPtr> original_ctx =
         storage._mutable_data_ptr_no_checks().move_context();
 
     // Save this for the result.
-    new_data_ptr = make_data_ptr(
-        data_ptr, *new cow::COWDeleterContext(std::move(original_ctx)));
+    new_data_ptr_opt = make_data_ptr(
+        data_ptr,
+        *new cow::COWDeleterContext(std::move(original_ctx), storage.device()));
 
     // Update this storage to the new copy on write context.
-    storage.set_data_ptr_noswap(copy_data_ptr(*new_data_ptr));
+    storage.set_data_ptr_noswap(copy_data_ptr(*new_data_ptr_opt));
   } else if (is_cow_data_ptr(data_ptr)) {
     // Case 2): there is already a copy on write context. Just return a
     // new storage impl.
-    new_data_ptr = copy_data_ptr(data_ptr);
+    new_data_ptr_opt = copy_data_ptr(data_ptr);
   } else {
     // Case 3) There is a context and it's not copy-on-write. Nothing
     // we can do here.
     return nullptr;
   }
 
-  TORCH_INTERNAL_ASSERT(new_data_ptr.has_value());
+  TORCH_INTERNAL_ASSERT(new_data_ptr_opt.has_value());
+
+  c10::Allocator* allocator = storage.allocator();
+  c10::DeviceType device_type = storage.device_type();
+
+  if (device_opt.has_value()) {
+    DeviceGuard device_guard(device_opt.value());
+    Device device = device_guard.current_device();
+
+    // If a different target device was given, then convert the data pointer to
+    // that device.
+    if (device != storage.device()) {
+      DataPtr& new_data_ptr = new_data_ptr_opt.value();
+      auto* ctx = new_data_ptr.cast_context<c10::impl::cow::COWDeleterContext>(
+          c10::impl::cow::cow_deleter);
+      device_type = device.type();
+
+      if (device_type == c10::kCUDA) {
+        allocator = c10::cuda::CUDACachingAllocator::get();
+      } else {
+        allocator = c10::GetAllocator(device.type());
+      }
+
+      new_data_ptr.release_context();
+      new_data_ptr_opt = c10::DataPtr(
+          new_data_ptr.get(), ctx, c10::impl::cow::cow_deleter, device);
+    }
+  }
 
   return make_storage_impl(
       StorageImpl::use_byte_size_t(),
       storage.sym_nbytes(),
-      *std::move(new_data_ptr),
-      storage.allocator(),
+      *std::move(new_data_ptr_opt),
+      allocator,
       storage.resizable(),
-      storage.device_type());
+      device_type);
 }
 
 C10_API void materialize_cow_storage(StorageImpl& storage) {
@@ -118,13 +150,14 @@ C10_API void materialize_cow_storage(StorageImpl& storage) {
 
   auto* ctx = data_ptr.cast_context<cow::COWDeleterContext>(cow::cow_deleter);
   TORCH_INTERNAL_ASSERT(ctx != nullptr);
-
+  bool devices_match = storage.device() == ctx->original_device();
   auto result = ctx->decrement_refcount();
 
   // This must be set by each branch below.
   std::optional<DataPtr> new_data_ptr;
 
-  if (std::holds_alternative<cow::COWDeleterContext::LastReference>(result)) {
+  if (devices_match &&
+      std::holds_alternative<cow::COWDeleterContext::LastReference>(result)) {
     // This is the only reference to the data. If there were any racing writes,
     // the context ensured they finished before giving us the result.
     std::unique_ptr<void, DeleterFnPtr> data =
@@ -133,12 +166,14 @@ C10_API void materialize_cow_storage(StorageImpl& storage) {
     new_data_ptr = DataPtr(
         data.release(), data_ptr.get(), data.get_deleter(), data_ptr.device());
   } else {
-    TORCH_INTERNAL_ASSERT(
-        std::holds_alternative<cow::COWDeleterContext::NotLastReference>(
-            result));
     // We don't need to consume the result, it's just a shared lock ensuring
     // that the data will remain while we copy it.
     new_data_ptr = storage.allocator()->clone(data_ptr.get(), storage.nbytes());
+    if (!devices_match) {
+      if (storage.device().type() == c10::kCUDA) {
+        c10::cuda::device_synchronize();
+      }
+    }
   }
 
   TORCH_INTERNAL_ASSERT(new_data_ptr.has_value());
 
@@ -1,6 +1,8 @@
 #pragma once
 
+#include <c10/core/Device.h>
 #include <c10/macros/Macros.h>
+#include <c10/util/Optional.h>
 #include <c10/util/intrusive_ptr.h>
 
 namespace c10 {
@@ -17,8 +19,12 @@ namespace c10::impl::cow {
 // storage's DataPtr has some context (`DataPtr::get_context()`) which is not
 // equal to the data pointer (`DataPtr::get()`). In this case, a nullptr is
 // returned.
+//
+// If `device_opt` is given, the output will be copied to the specified device
+// when materialization occurs.
 C10_API c10::intrusive_ptr<StorageImpl> lazy_clone_storage(
-    StorageImpl& storage);
+    StorageImpl& storage,
+    optional<Device> device_opt = nullopt);
 
 // Check if a storage has a simple DataPtr with no abnormal context
 C10_API bool has_simple_data_ptr(const c10::StorageImpl& storage);
 
@@ -9,8 +9,9 @@ void cow::cow_deleter(void* ctx) {
 }
 
 cow::COWDeleterContext::COWDeleterContext(
-    std::unique_ptr<void, DeleterFnPtr> data)
-    : data_(std::move(data)) {
+    std::unique_ptr<void, DeleterFnPtr> data,
+    c10::Device original_device)
+    : data_(std::move(data)), original_device_(original_device) {
   // We never wrap a COWDeleterContext.
   TORCH_INTERNAL_ASSERT(data_.get_deleter() != cow::cow_deleter);
 }
@@ -39,4 +40,12 @@ cow::COWDeleterContext::~COWDeleterContext() {
   TORCH_INTERNAL_ASSERT(refcount_ == 0);
 }
 
+// Returns the device that was passed to the constructor and stored in
+// `original_device_`.
+c10::Device cow::COWDeleterContext::original_device() {
+  return this->original_device_;
+}
+
+// Returns the value of the atomic reference counter at the time of the call.
+std::int64_t cow::COWDeleterContext::refcount() {
+  return refcount_.load(std::memory_order_seq_cst);
+}
+
 } // namespace c10::impl
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/core/Device.h>
 #include <c10/macros/Export.h>
 #include <c10/util/UniqueVoidPtr.h>
 
@@ -21,7 +22,9 @@ class C10_API COWDeleterContext {
   // Note that the deleter will only be called in our destructor if
   // the last reference to this goes away without getting
   // materialized.
-  explicit COWDeleterContext(std::unique_ptr<void, DeleterFnPtr> data);
+  explicit COWDeleterContext(
+      std::unique_ptr<void, DeleterFnPtr> data,
+      c10::Device original_device);
 
   // Increments the current refcount.
   void increment_refcount();
@@ -45,6 +48,10 @@ class C10_API COWDeleterContext {
   // do with it.
   std::variant<NotLastReference, LastReference> decrement_refcount();
 
+  c10::Device original_device();
+
+  std::int64_t refcount();
+
  private:
   // The destructor is hidden, this should only ever be used within
   // UniqueVoidPtr using cow::delete_context as the deleter.
@@ -53,6 +60,7 @@ class C10_API COWDeleterContext {
   std::shared_mutex mutex_;
   std::unique_ptr<void, DeleterFnPtr> data_;
   std::atomic<std::int64_t> refcount_ = 1;
+  c10::Device original_device_;
 };
 
 // `cow_deleter` is used as the `ctx_deleter` for DataPtr to implement a COW
 
@@ -5336,6 +5336,53 @@ def run(num_threads, num_parallel, skip_first, should_error):
         run(10, 2, False, True)
         run(10, 2, True, True)
 
+    @onlyCUDA
+    def test_lazy_clone_to_device(self, device):
+        """Check `_lazy_clone(device=...)` for several (source, target) pairs.
+
+        For each pair the test asserts that the clone initially reports the
+        target device while both tensors are COW and share the original data
+        address, and that writing to either tensor materializes only that
+        tensor.
+        """
+        # Pairs that need only a single CUDA device.
+        device_pairs = [
+            ('cpu', 'cuda'),
+            ('cpu', 'cuda:0'),
+            # TODO: Figure out why CUDA to CPU segfaults
+            # ('cuda', 'cpu'),
+        ]
+        # Pairs that reference `cuda:1` would raise on a single-GPU machine,
+        # so only exercise them when a second device is actually present.
+        if torch.cuda.device_count() >= 2:
+            device_pairs += [
+                ('cpu', 'cuda:1'),
+                ('cuda:1', 'cuda:0'),
+                ('cuda:0', 'cuda:1'),
+            ]
+        for from_device, to_device in device_pairs:
+            # Normalize the device strings (e.g. 'cuda' -> device with index)
+            # by asking torch which device a tensor actually lands on.
+            from_device_check = torch.empty(0, device=from_device).device
+            to_device_check = torch.empty(0, device=to_device).device
+
+            a = torch.randn(10, device=from_device)
+            orig_data_ptr = a.data_ptr()
+            b = a._lazy_clone(device=to_device)
+
+            # Before any write: both tensors are COW and share the same data.
+            self.assertEqual(a.device, from_device_check)
+            self.assertEqual(b.device, to_device_check)
+            self.assertTrue(torch._C._is_cow_tensor(a))
+            self.assertEqual(torch._C._data_address(a), orig_data_ptr)
+            self.assertTrue(torch._C._is_cow_tensor(b))
+            self.assertEqual(torch._C._data_address(b), orig_data_ptr)
+
+            a[0] = 1
+
+            # Writing to `a` materializes `a` only; `b` still holds the
+            # original data.
+            self.assertEqual(a.device, from_device_check)
+            self.assertEqual(b.device, to_device_check)
+            self.assertFalse(torch._C._is_cow_tensor(a))
+            self.assertNotEqual(torch._C._data_address(a), orig_data_ptr)
+            self.assertTrue(torch._C._is_cow_tensor(b))
+            self.assertEqual(torch._C._data_address(b), orig_data_ptr)
+
+            b[0] = 2
+
+            # Writing to `b` materializes it as well.
+            self.assertEqual(a.device, from_device_check)
+            self.assertEqual(b.device, to_device_check)
+            self.assertFalse(torch._C._is_cow_tensor(a))
+            self.assertNotEqual(torch._C._data_address(a), orig_data_ptr)
+            self.assertFalse(torch._C._is_cow_tensor(b))
+            self.assertNotEqual(torch._C._data_address(b), orig_data_ptr)
+
+            self.assertEqual(a[0], 1)
+            self.assertEqual(b[0], 2)
+
     # FIXME: move to test distributions
     @skipIfMPS
     @dtypesIfCUDA(torch.float, torch.double, torch.half)
 
@@ -451,7 +451,7 @@
   self: grad
   result: auto_linear
 
-- name: _lazy_clone(Tensor self) -> Tensor
+- name: _lazy_clone(Tensor self, *, Device? device=None) -> Tensor
   self: grad
   result: auto_linear
Original file line number	Diff line number	Diff line change
`@@ -679,7 +679,7 @@ constexpr DispatchKey toFunctionalityKey(DispatchKey k) {`
`679`	`679`	`}`
`680`	`680`	`}`
`681`	`681`
`682`		`-BackendComponent toBackendComponent(DeviceType device_type);`
	`682`	`+C10_API BackendComponent toBackendComponent(DeviceType device_type);`
`683`	`683`
`684`	`684`	`// Given (DispatchKey::Dense, BackendComponent::CUDABit), returns`
`685`	`685`	`// DispatchKey::CUDA.`