8000 [WIP] Add `device` arg to `_lazy_clone` · pytorch/pytorch@48dbbca · GitHub
[go: up one dir, main page]

Skip to content

Commit 48dbbca

Browse files
[WIP] Add device arg to _lazy_clone
ghstack-source-id: b84d27a Pull Request resolved: #148408
1 parent e51615c commit 48dbbca

29 files changed

+430
-59
lines changed

aten/src/ATen/EmptyTensor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ struct MetaAllocator final : public at::Allocator {
348348
DeleterFnPtr raw_deleter() const override {
349349
return deleter;
350350
}
351-
void copy_data(void* dest, const void* src, std::size_t count) const final {}
351+
void copy_data(void* dest, const void* src, std::size_t count, bool sync=false) const final {}
352352
};
353353

354354
static MetaAllocator g_meta_alloc;

aten/src/ATen/core/CachingHostAllocator.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ struct CachingHostAllocatorImpl {
339339
return false;
340340
}
341341

342-
virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const {
342+
virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]], bool sync [[maybe_unused]] = false) const {
343343
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data");
344344
}
345345

@@ -641,9 +641,9 @@ struct CachingHostAllocatorInterface : public at::Allocator {
641641
impl_->empty_cache();
642642
}
643643

644-
void copy_data(void* dest, const void* src, std::size_t count)
644+
void copy_data(void* dest, const void* src, std::size_t count, bool sync=false)
645645
const override {
646-
impl_->copy_data(dest, src, count);
646+
impl_->copy_data(dest, src, count, sync);
647647
}
648648

649649
HostStats getStats() {

aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
2323
DeleterFnPtr raw_deleter() const override {
2424
return allocator_->raw_deleter();
2525
}
26-
void copy_data(void* dest, const void* src, std::size_t count) const final {
27-
allocator_->copy_data(dest, src, count);
26+
void copy_data(void* dest, const void* src, std::size_t count, bool sync=false) const final {
27+
allocator_->copy_data(dest, src, count, sync);
2828
}
2929
};
3030

aten/src/ATen/mps/MPSAllocator.mm

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <ATen/mps/MPSAllocator.h>
66
#include <c10/core/Allocator.h>
77
#include <c10/core/Storage.h>
8+
#include <ATen/detail/MPSHooksInterface.h>
89

910
#include <iostream>
1011

@@ -820,8 +821,11 @@ bool waitForEvents(c10::ArrayRef<const void*> buffers) const override {
820821
return _getAllocImpl().format_size(size);
821822
}
822823

823-
void copy_data(void* dest, const void* src, std::size_t count) const final {
824+
void copy_data(void* dest, const void* src, std::size_t count, bool sync = false) const final {
824825
default_copy_data(dest, src, count);
826+
if (sync) {
827+
at::detail::getMPSHooks().deviceSynchronize();
828+
}
825829
}
826830

827831
private:

aten/src/ATen/native/AutogradComposite.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <ATen/core/Tensor.h>
33
#include <c10/util/SmallBuffer.h>
44
#include <c10/core/impl/COW.h>
5+
#include <c10/core/DispatchKey.h>
56

67
#ifndef AT_PER_OPERATOR_HEADERS
78
#include <ATen/Functions.h>
@@ -13,6 +14,7 @@
1314
#include <ATen/ops/_unpack_dual_native.h>
1415
#include <ATen/ops/_lazy_clone_native.h>
1516
#include <ATen/ops/alias.h>
17+
#include <ATen/ops/empty.h>
1618
#include <ATen/ops/zeros.h>
1719
#endif
1820

@@ -91,14 +93,25 @@ bool _has_same_storage_numel(const at::Tensor& base, const at::Tensor& other) {
9193
return base.storage().sym_nbytes() / base.itemsize() == other.storage().sym_nbytes() / other.itemsize();
9294
}
9395

94-
Tensor _lazy_clone(Tensor const& self) {
96+
Tensor _lazy_clone(Tensor const& self, optional<c10::Device> device_opt) {
97+
optional<c10::Allocator*> allocator_opt = nullopt;
98+
if (device_opt.has_value()) {
99+
allocator_opt = at::empty({}, at::TensorOptions().device(device_opt.value())).storage().allocator();
100+
}
95101
c10::StorageImpl* self_storage = self.storage().unsafeGetStorageImpl();
96102
c10::intrusive_ptr<c10::StorageImpl> storage =
97-
c10::impl::cow::lazy_clone_storage(*self_storage);
103+
c10::impl::cow::lazy_clone_storage(*self_storage, device_opt, allocator_opt);
98104
TORCH_CHECK(storage != nullptr);
105+
c10::DispatchKeySet key_set = self.key_set();
106+
// If the target device differs, then we must change the key set
107+
if (device_opt.has_value() && device_opt.value().type() != self.device().type()) {
108+
c10::BackendComponent old_backend = c10::toBackendComponent(self.device().type());
109+
c10::BackendComponent new_backend = c10::toBackendComponent(device_opt.value().type());
110+
key_set = key_set.remove_backend(old_backend) | c10::DispatchKeySet(new_backend);
111+
}
99112
auto tensor = c10::make_intrusive<c10::TensorImpl>(
100113
c10::Storage(std::move(storage)),
101-
self.key_set(),
114+
key_set,
102115
self.dtype());
103116
tensor->set_sizes_and_strides(self.sym_sizes(),
104117
self.sym_strides(),

aten/src/ATen/native/TensorConversions.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <ATen/ops/_convert_indices_from_coo_to_csr_native.h>
1818
#include <ATen/ops/_convert_indices_from_csr_to_coo.h>
1919
#include <ATen/ops/_convert_indices_from_csr_to_coo_native.h>
20+
#include <ATen/ops/_lazy_clone.h>
2021
#include <ATen/ops/_sparse_bsc_tensor_unsafe_native.h>
2122
#include <ATen/ops/_sparse_bsr_tensor_unsafe_native.h>
2223
#include <ATen/ops/_sparse_compressed_tensor_unsafe_native.h>
@@ -422,6 +423,26 @@ bool to_will_alias(
422423
self.suggest_memory_format() == memory_format);
423424
}
424425

426+
// static bool _only_device_differs(
427+
// const Tensor& self,
428+
// std::optional<ScalarType> dtype,
429+
// std::optional<Layout> layout,
430+
// std::optional<Device> device,
431+
// std::optional<bool> pin_memory,
432+
// std::optional<c10::MemoryFormat> optional_memory_format) {
433+
// bool device_differs = device.has_value() && device.value() !=
434+
// self.device(); bool dtype_differs = dtype.has_value() && dtype.value() !=
435+
// self.scalar_type(); bool layout_differs = layout.has_value() &&
436+
// layout.value() != self.layout(); bool pin_memory_differs =
437+
// pin_memory.has_value() && pin_memory.value() != self.is_pinned();
438+
// auto memory_format =
439+
// optional_memory_format.value_or(MemoryFormat::Preserve); bool
440+
// memory_format_differs = memory_format != MemoryFormat::Preserve &&
441+
// memory_format != self.suggest_memory_format();
442+
// return device_differs && !dtype_differs && !layout_differs &&
443+
// !pin_memory_differs && !memory_format_differs;
444+
// }
445+
425446
static inline Tensor to_impl(
426447
const Tensor& self,
427448
std::optional<ScalarType> dtype,
@@ -436,6 +457,12 @@ static inline Tensor to_impl(
436457
self, dtype, layout, device, copy, optional_memory_format)) {
437458
return self;
438459
}
460+
// TODO: after I prove that this works, I should only allow it for CPU-MPS,
461+
// and we can enable others later if needed.
462+
// if (_only_device_differs(self, dtype, layout, device, pin_memory,
463+
// optional_memory_format)) {
464+
// return at::_lazy_clone(self, device);
465+
//}
439466
return at::_to_copy(
440467
self,
441468
dtype,

aten/src/ATen/native/TensorFactories.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,8 @@ struct ZeroTensorAllocator final : public at::Allocator {
157157
void copy_data(
158158
void* dest [[maybe_unused]],
159159
const void* src [[maybe_unused]],
160-
std::size_t count [[maybe_unused]]) const final {}
160+
std::size_t count [[maybe_unused]],
161+
bool sync [[maybe_unused]] = false) const final {}
161162
at::Device device_;
162163
};
163164

aten/src/ATen/native/native_functions.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1250,9 +1250,10 @@
12501250
CompositeExplicitAutograd: copysign_out
12511251
tags: pointwise
12521252

1253-
- func: _lazy_clone(Tensor self) -> Tensor
1253+
- func: _lazy_clone(Tensor self, *, Device? device=None) -> Tensor
12541254
# Like clone, but the copy takes place lazily, only if either the
1255-
# input or the output are written.
1255+
# input or the output are written. If `device` is given, the output
1256+
# will be copied to the specified device when the write occurs.
12561257
variants: function, method
12571258
dispatch:
12581259
CompositeExplicitAutograd: _lazy_clone

aten/src/ATen/test/xla_tensor_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ struct XLAAllocator final : public at::Allocator {
2424
at::DeleterFnPtr raw_deleter() const override {
2525
return &XLAFree;
2626
}
27-
void copy_data(void* dest, const void* src, std::size_t count) const final {
27+
void copy_data(void* dest, const void* src, std::size_t count, bool sync=false) const final {
2828
default_copy_data(dest, src, count);
2929
}
3030
};

c10/core/Allocator.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
namespace c10 {
99

10-
DataPtr Allocator::clone(const void* data, std::size_t n) {
10+
DataPtr Allocator::clone(const void* data, std::size_t n, bool sync) {
1111
DataPtr new_data = allocate(n);
12-
copy_data(new_data.mutable_get(), data, n);
12+
copy_data(new_data.mutable_get(), data, n, sync);
1313
return new_data;
1414
}
1515

c10/core/Allocator.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,13 @@ struct C10_API Allocator {
173173
// Note that this explicitly ignores any context that may have been
174174
// attached to the input data.
175175
//
176-
// Requires: input data was allocated by the same allocator.
177-
DataPtr clone(const void* data, std::size_t n);
176+
// If `sync=true` is given, then the device will synchronize after the clone
177+
// happens, if the device is normally asynchronous.
178+
//
179+
// Requires: Depending on the details of the allocator, input data may need to
180+
// have been allocated by the same allocator. Some allocators do support
181+
// cloning from a different device.
182+
DataPtr clone(const void* data, std::size_t n, bool sync = false);
178183

179184
// Checks if DataPtr has a simple context, not wrapped with any out of the
180185
// ordinary contexts.
@@ -205,8 +210,11 @@ struct C10_API Allocator {
205210
//
206211
// Requires: src and dest were allocated by this allocator
207212
// Requires: src and dest both have length >= count
208-
virtual void copy_data(void* dest, const void* src, std::size_t count)
209-
const = 0;
213+
virtual void copy_data(
214+
void* dest,
215+
const void* src,
216+
std::size_t count,
217+
bool sync = false) const = 0;
210218

211219
protected:
212220
// Uses `std::memcpy` to copy data.

c10/core/CPUAllocator.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,11 @@ struct C10_API DefaultCPUAllocator final : at::Allocator {
4141
return &ReportAndDelete;
4242
}
4343

44-
void copy_data(void* dest, const void* src, std::size_t count) const final {
44+
void copy_data(
45+
void* dest,
46+
const void* src,
47+
std::size_t count,
48+
bool sync = false) const final {
4549
default_copy_data(dest, src, count);
4650
}
4751
};
@@ -149,7 +153,11 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
149153
PreGuardBytes;
150154
}
151155

152-
void copy_data(void* dest, const void* src, std::size_t count) const final {
156+
void copy_data(
157+
void* dest,
158+
const void* src,
159+
std::size_t count,
160+
bool sync = false) const final {
153161
default_copy_data(dest, src, count);
154162
}
155163
};

c10/core/DispatchKey.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,7 @@ constexpr DispatchKey toFunctionalityKey(DispatchKey k) {
679679
}
680680
}
681681

682-
BackendComponent toBackendComponent(DeviceType device_type);
682+
C10_API BackendComponent toBackendComponent(DeviceType device_type);
683683

684684
// Given (DispatchKey::Dense, BackendComponent::CUDABit), returns
685685
// DispatchKey::CUDA.

c10/core/impl/COW.cpp

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <c10/core/impl/COW.h>
22

33
#include <c10/core/Allocator.h>
4+
#include <c10/core/DeviceGuard.h>
45
#include <c10/core/StorageImpl.h>
56
#include <c10/core/alignment.h>
67
#include <c10/core/impl/COWDeleter.h>
@@ -48,7 +49,12 @@ bool is_cow_data_ptr(const c10::DataPtr& data_ptr) {
4849
return (void*)data_ptr.get_deleter() == (void*)&cow::cow_deleter;
4950
}
5051

51-
c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {
52+
c10::intrusive_ptr<StorageImpl> lazy_clone_storage(
53+
StorageImpl& storage,
54+
c10::optional<c10::Device> device_opt,
55+
c10::optional<c10::Allocator*> allocator_opt) {
56+
TORCH_INTERNAL_ASSERT(device_opt.has_value() == allocator_opt.has_value());
57+
5258
const at::DataPtr& data_ptr = storage.data_ptr();
5359

5460
// There are three possible circumstances:
@@ -76,38 +82,61 @@ c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {
7682
//
7783
// No locking is required in this case.
7884

79-
std::optional<DataPtr> new_data_ptr; // must be set below
85+
std::optional<DataPtr> new_data_ptr_opt; // must be set below
8086

8187
if (has_simple_data_ptr(storage)) {
8288
// Case 1) We have a simple data pointer: wrap it.
8389
std::unique_ptr<void, DeleterFnPtr> original_ctx =
8490
storage._mutable_data_ptr_no_checks().move_context();
8591

8692
// Save this for the result.
87-
new_data_ptr = make_data_ptr(
88-
data_ptr, *new cow::COWDeleterContext(std::move(original_ctx)));
93+
new_data_ptr_opt = make_data_ptr(
94+
data_ptr,
95+
*new cow::COWDeleterContext(std::move(original_ctx), storage.device()));
8996

9097
// Update this storage to the new copy on write context.
91-
storage.set_data_ptr_noswap(copy_data_ptr(*new_data_ptr));
98+
storage.set_data_ptr_noswap(copy_data_ptr(*new_data_ptr_opt));
9299
} else if (is_cow_data_ptr(data_ptr)) {
93100
// Case 2): there is already a copy on write context. Just return a
94101
// new storage impl.
95-
new_data_ptr = copy_data_ptr(data_ptr);
102+
new_data_ptr_opt = copy_data_ptr(data_ptr);
96103
} else {
97104
// Case 3) There is a context and it's not copy-on-write. Nothing
98105
// we can do here.
99106
return nullptr;
100107
}
101108

102-
TORCH_INTERNAL_ASSERT(new_data_ptr.has_value());
109+
TORCH_INTERNAL_ASSERT(new_data_ptr_opt.has_value());
110+
111+
c10::Allocator* allocator = storage.allocator();
112+
c10::DeviceType device_type = storage.device_type();
113+
114+
if (device_opt.has_value()) {
115+
allocator = allocator_opt.value();
116+
117+
DeviceGuard device_guard(device_opt.value());
118+
Device device = device_guard.current_device();
119+
120+
// If a different target device was given, then convert the data pointer to
121+
// that device.
122+
if (device != storage.device()) {
123+
DataPtr& new_data_ptr = new_data_ptr_opt.value();
124+
auto* ctx = new_data_ptr.cast_context<c10::impl::cow::COWDeleterContext>(
125+
c10::impl::cow::cow_deleter);
126+
device_type = device.type();
127+
new_data_ptr.release_context();
128+
new_data_ptr_opt = c10::DataPtr(
129+
new_data_ptr.get(), ctx, c10::impl::cow::cow_deleter, device);
130+
}
131+
}
103132

104133
return make_storage_impl(
105134
StorageImpl::use_byte_size_t(),
106135
storage.sym_nbytes(),
107-
*std::move(new_data_ptr),
108-
storage.allocator(),
136+
*std::move(new_data_ptr_opt),
137+
allocator,
109138
storage.resizable(),
110-
storage.device_type());
139+
device_type);
111140
}
112141

113142
C10_API void materialize_cow_storage(StorageImpl& storage) {
@@ -118,13 +147,14 @@ C10_API void materialize_cow_storage(StorageImpl& storage) {
118147

119148
auto* ctx = data_ptr.cast_context<cow::COWDeleterContext>(cow::cow_deleter);
120149
TORCH_INTERNAL_ASSERT(ctx != nullptr);
121-
150+
bool devices_match = storage.device() == ctx->original_device();
122151
auto result = ctx->decrement_refcount();
123152

124153
// This must be set by each branch below.
125154
std::optional<DataPtr> new_data_ptr;
126155

127-
if (std::holds_alternative<cow::COWDeleterContext::LastReference>(result)) {
156+
if (devices_match &&
157+
std::holds_alternative<cow::COWDeleterContext::LastReference>(result)) {
128158
// This is the only reference to the data. If there were any racing writes,
129159
// the context ensured they finished before giving us the result.
130160
std::unique_ptr<void, DeleterFnPtr> data =
@@ -133,12 +163,10 @@ C10_API void materialize_cow_storage(StorageImpl& storage) {
133163
new_data_ptr = DataPtr(
134164
data.release(), data_ptr.get(), data.get_deleter(), data_ptr.device());
135165
} else {
136-
TORCH_INTERNAL_ASSERT(
137-
std::holds_alternative<cow::COWDeleterContext::NotLastReference>(
138-
result));
139166
// We don't need to consume the result, it's just a shared lock ensuring
140167
// that the data will remain while we copy it.
141-
new_data_ptr = storage.allocator()->clone(data_ptr.get(), storage.nbytes());
168+
new_data_ptr = storage.allocator()->clone(
169+
data_ptr.get(), storage.nbytes(), /*sync=*/!devices_match);
142170
}
143171

144172
TORCH_INTERNAL_ASSERT(new_data_ptr.has_value());

0 commit comments

Comments
 (0)
0