Commit 1ed2ce4

Update base for Update on "[FSDP][dtensor] use _StridedShard to represent nested sharding for correct full_tensor() result"
Fixes #129229 and #129206.

**Summary**

1. Have `FSDP` choose the `_StridedShard` placement for FSDP+TP sharding.
2. Added a parity test to FSDP to ensure that FSDP+TP sharding (i.e. strided) and plain TP sharding (i.e. non-strided) produce the same `full_tensor()` result.
3. Re-enabled the tests that were disabled in #129519.

**Test**

`pytest test/distributed/_composable/fsdp/`
`pytest test/distributed/_composable/test_composability/test_2d_composability.py`
`pytest test/distributed/checkpoint/fsdp/test_fsdp_dsd.py`

cc H-Huang awgu kwen2501 wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o zhaojuanmao mrshenli rohan-varma chauhang LucasLLC MeetVadakkanchery mhorowitz

Differential Revision: [D60606114](https://our.internmc.facebook.com/intern/diff/D60606114)

[ghstack-poisoned]
2 parents 5da798e + da32021 · commit 1ed2ce4
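The parity property described in the summary, that a weight sharded by TP alone and one sharded by FSDP on top of TP must reconstruct to the same values, can be sketched as below. This is a minimal, hypothetical illustration and not the test added by this commit; it assumes 4 ranks launched via `torchrun`, a CUDA backend, and the per-parameter `fully_shard` from `torch.distributed._composable.fsdp`.

```python
# Hypothetical parity sketch: TP-only vs. FSDP+TP sharding of the same
# weights should produce identical full_tensor() results.
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
from torch.distributed._composable.fsdp import fully_shard

# 2D mesh: data-parallel (FSDP) x tensor-parallel, 4 ranks total.
mesh_2d = init_device_mesh("cuda", (2, 2), mesh_dim_names=("dp", "tp"))

model = nn.Sequential(nn.Linear(16, 16, bias=False)).cuda()
ref = nn.Sequential(nn.Linear(16, 16, bias=False)).cuda()
ref.load_state_dict(model.state_dict())  # identical starting weights

# TP-only sharding (non-strided placements).
parallelize_module(ref, mesh_2d["tp"], {"0": ColwiseParallel()})

# FSDP over TP (with this commit: _StridedShard placements on the dp dim).
parallelize_module(model, mesh_2d["tp"], {"0": ColwiseParallel()})
fully_shard(model, mesh=mesh_2d["dp"])

# full_tensor() must agree between the 1D and 2D shardings.
for (_, p2d), (_, p1d) in zip(model.named_parameters(), ref.named_parameters()):
    torch.testing.assert_close(p2d.full_tensor(), p1d.full_tensor())
```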


45 files changed · +1,321 −387 lines

aten/src/ATen/native/Copy.cpp

Lines changed: 4 additions & 1 deletion
@@ -130,7 +130,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
 // (e.g. XLA) may be supported by overriding copy_ and _copy_from.
 bool is_supported_device(Device device) {
   DeviceType device_type = device.type();
-  return device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal || device_type == kMPS;
+  return device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal || device_type == kMPS || device_type == kXPU;
 }

 } // namespace
@@ -221,6 +221,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   // cpu_tensor.copy_(xla_tensor) => xla_tensor._copy_from(cpu_tensor)
   // xla_tensor.copy_(cpu_tensor) => cpu_tensor._copy_from(xla_tensor)
   // Both the _copy_from calls above will be dispatched to XLA's _copy_from kernels.
+
   if (!is_supported_device(src.device()) || !is_supported_device(self.device())) {
     at::_copy_from(src, self, non_blocking);
     return self;
@@ -287,6 +288,8 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
     device_type = kHIP;
   } else if (iter.device_type(1) == kMPS) {
     device_type = kMPS;
+  } else if (iter.device_type(1) == kXPU) {
+    device_type = kXPU;
   }

   // TODO: if we need to, we can also enable this path for quantized tensor
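Adding `kXPU` to `is_supported_device` keeps XPU copies on the native `copy_impl` path instead of falling back to `at::_copy_from`. A hedged usage sketch, assuming a PyTorch build with XPU support and a visible XPU device:

```python
# Cross-device copy_ between CPU and XPU now takes the native copy path.
import torch

if torch.xpu.is_available():
    src = torch.randn(4)
    dst = torch.empty(4, device="xpu")
    dst.copy_(src)   # CPU -> XPU
    src.copy_(dst)   # XPU -> CPU
    print(torch.equal(src, dst.cpu()))  # True
```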

aten/src/ATen/native/Distance.cpp

Lines changed: 2 additions & 2 deletions
@@ -102,8 +102,8 @@ static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, std
   // See Note [cdist relies on cdist_impl redispatching]
   // Keep this condition in sync with the condition at the Note
   if (!(p == 2 && (mode == 1 || (mode == 0 && (r1 > 25 || r2 > 25))))) {
-    TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "cdist only supports CPU and CUDA devices, X1 got: ", device1);
-    TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "cdist only supports CPU and CUDA devices, X2 got: ", device2);
+    TORCH_CHECK(device1 == kCPU || device1 == kCUDA || device1 == kXPU, "cdist only supports CPU, XPU and CUDA devices, X1 got: ", device1);
+    TORCH_CHECK(device2 == kCPU || device2 == kCUDA || device2 == kXPU, "cdist only supports CPU, XPU and CUDA devices, X2 got: ", device2);
   }

   auto dim1 = x1.dim();
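The relaxed `TORCH_CHECK` only matters on the branch that does not redispatch (see the Note referenced in the context), i.e. `p != 2` or small inputs. A hedged example of what this enables, assuming an XPU device is available:

```python
# cdist on XPU tensors; p=1 takes the branch guarded by the device check.
import torch

if torch.xpu.is_available():
    x1 = torch.randn(8, 3, device="xpu")
    x2 = torch.randn(5, 3, device="xpu")
    d = torch.cdist(x1, x2, p=1.0)
    print(d.shape)  # torch.Size([8, 5])
```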

aten/src/ATen/native/ReduceOps.cpp

Lines changed: 4 additions & 4 deletions
@@ -1814,8 +1814,8 @@ static Tensor& std_var_out(
     const char* fname, Tensor& result, const Tensor& self,
     at::OptionalIntArrayRef dim, const std::optional<Scalar>& correction_opt,
     bool keepdim, bool take_sqrt) {
-  TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda(),
-              "std and var only supports tensors on a CPU or CUDA device, but got: ",
+  TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda() || self.device().is_xpu(),
+              "std and var supports tensors on a CPU, CUDA, or XPU device only, but got: ",
               self.device().type());
   TORCH_CHECK(self.layout() == Layout::Strided,
               "std and var only supports strided layout, got: ", self.layout());
@@ -1887,8 +1887,8 @@ static std::tuple<Tensor&, Tensor&> std_var_mean_out(
     at::OptionalIntArrayRef dim, const std::optional<Scalar>& correction_opt,
     bool keepdim, bool take_sqrt) {
   AT_ASSERT(result1.defined() && result2.defined());
-  TORCH_CHECK(self.device().is_cpu() || self.is_cuda(),
-              fname, " only supports tensors on a CPU or CUDA device, got: ",
+  TORCH_CHECK(self.device().is_cpu() || self.is_cuda() || self.is_xpu(),
+              fname, " supports tensors on a CPU, CUDA, or XPU device only, got: ",
               self.device().type());
   TORCH_CHECK(self.layout() == Layout::Strided,
               fname, " only supports strided layout, got: ", self.layout());

aten/src/ATen/native/TensorAdvancedIndexing.cpp

Lines changed: 1 addition & 1 deletion
@@ -811,7 +811,7 @@ Tensor & _index_put_impl_(Tensor & self, const torch::List<std::optional<Tensor>
       at::assert_no_overlap(self, *index);
     }
   }
-  if (self.device().type() == DeviceType::CUDA && (accumulate || globalContext().deterministicAlgorithms())) {
+  if ((self.device().type() == DeviceType::CUDA || self.device().type() == DeviceType::XPU) && (accumulate || globalContext().deterministicAlgorithms())) {
     TORCH_CHECK(value_.device() == self.device(), "expected device ", self.device(), " but got device ",
                 value_.device(), " for value tensor");
     index_put_with_sort_stub(self.device().type(), self, indices, value_, accumulate, unsafe);
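Routing XPU through the same branch as CUDA means accumulating (or deterministic) `index_put_` calls go through `index_put_with_sort_stub`; this hunk presupposes an XPU kernel is registered for that stub. A hedged example:

```python
# Accumulating index_put_ on XPU takes the sort-based path after this change.
import torch

if torch.xpu.is_available():
    t = torch.zeros(5, device="xpu")
    idx = torch.tensor([0, 1, 1, 3], device="xpu")
    vals = torch.ones(4, device="xpu")
    t.index_put_((idx,), vals, accumulate=True)
    print(t)  # tensor([1., 2., 0., 1., 0.], device='xpu:0')
```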

aten/src/ATen/native/cuda/PointwiseOpsKernel.cu

Lines changed: 4 additions & 0 deletions
@@ -11,7 +11,9 @@

 namespace at::native {

+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
 CONSTEXPR_EXCEPT_WIN_CUDA char addcmul_name[] = "addcmul";
+#endif
 void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
@@ -55,8 +57,10 @@ void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
   }
 }

+#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
 // return a + alpha * (b / static_cast<accscalar_t>(c));
 CONSTEXPR_EXCEPT_WIN_CUDA char addcdiv_name[] = "addcdiv";
+#endif
 void addcdiv_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
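These guards only compile the jiterator kernel-name constants when the jiterator path itself is compiled in (`AT_USE_JITERATOR()` and CUDA >= 11.5), presumably to avoid unused-variable diagnostics otherwise; op behavior is unchanged. For reference, the semantics these kernels implement, with complex inputs being what takes the jiterator path on CUDA:

```python
# addcmul: a + value * b * c ; addcdiv: a + value * (b / c)
import torch

dev = "cuda" if torch.cuda.is_available() else "cpu"
a, b, c = (torch.randn(3, dtype=torch.complex64, device=dev) for _ in range(3))

print(torch.addcmul(a, b, c, value=2))
print(torch.addcdiv(a, b, c, value=2))
```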
