Update on "[4/N] Test NaN checker against broadcast" · pytorch/pytorch@edeb15e · GitHub
Commit edeb15e

Update on "[4/N] Test NaN checker against broadcast"
cc XilunWu H-Huang awgu wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o [ghstack-poisoned]
2 parents: 7cfdb24 + 8f44404 · commit edeb15e


48 files changed: +967 −240 lines

aten/src/ATen/AccumulateType.cpp

Lines changed: 2 additions & 0 deletions
Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,8 @@ c10::ScalarType toAccumulateType(c10::ScalarType type, c10::DeviceType device) {
     switch (device) { \
       case DeviceType::CUDA: \
         return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::CUDA>>::value; \
+      case DeviceType::XPU: \
+        return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::XPU>>::value; \
       case DeviceType::MPS: \
         return CppTypeToScalarType<at::acc_type_device<scalar_t, c10::DeviceType::MPS>>::value; \
       default: \

aten/src/ATen/native/cuda/Nonzero.cu

Lines changed: 11 additions & 1 deletion
Lines changed: 11 additions & 1 deletion

@@ -1,6 +1,7 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
+#include <ATen/EmptyTensor.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <ATen/cuda/EmptyTensor.h>
@@ -70,7 +71,16 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){
   auto temp_storage = allocator.allocate(temp_storage_bytes);
   cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream);
   int num_nonzeros_h;
-  at::cuda::memcpy_and_sync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream);
+  auto pinned_num_nonzeros_h = at::detail::empty_cpu(
+      {1}, /* size */
+      c10::CppTypeToScalarType<int>(), /* dtype */
+      std::nullopt, /* layout */
+      std::nullopt, /* device */
+      true, /* pin_memory */
+      std::nullopt /* memory format */
+  );
+  at::cuda::memcpy_and_sync((void *)pinned_num_nonzeros_h.const_data_ptr<int>(), num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream);
+  num_nonzeros_h = (int)*(pinned_num_nonzeros_h.const_data_ptr<int>());
   //expected output size is num_nonzeros x ndim
   //we are producing output with size {num_nonzeros, ndim} and strides {1, num_nonzeros} (that is, transposed ndim x num_nonzeros output)
   //we are able to directly use passed output with this size and strides, and we can also (per contract)

aten/src/ATen/native/transformers/attention.cpp

Lines changed: 1 addition & 2 deletions
@@ -870,7 +870,6 @@ _scaled_dot_product_flash_attention_cpu(
   int64_t batchSize = query.size(0);
   int64_t qSize = query.size(2);
   int64_t num_head = query.size(1);
-  int64_t headSize = query.size(3);

   TORCH_CHECK(c10::isFloatingType(dtype),
     "scaled_dot_product_attention_flash_attention: Expected data type in FP32, FP64, BF16, FP16, but got ", dtype, " instead.");
@@ -888,7 +887,7 @@ _scaled_dot_product_flash_attention_cpu(
     (attn_mask.value().dim() == 2 || attn_mask.value().dim() == 4),
     "scaled_dot_product_attention_flash_attention: Attention mask dim in {2, 4}");

-  at::Tensor output = at::empty({batchSize, qSize, num_head, headSize}, query.options());
+  at::Tensor output = at::empty_like(query, query.options()).transpose(1, 2);
   const auto accumulate_dtype = toOpMathType(dtype);
   at::Tensor logsumexp = at::empty({batchSize, qSize, num_head},
     query.options().dtype(accumulate_dtype));
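The change above allocates the CPU flash-attention output as a transposed view of an empty_like(query) buffer rather than a fresh contiguous {batchSize, qSize, num_head, headSize} tensor, so the output keeps query's memory layout while exposing a (batch, seq, heads, head_dim) shape. A small Python sketch of the shape/stride difference, with made-up dimensions:

import torch

B, H, S, D = 2, 4, 8, 16                    # batch, heads, seq len, head dim (illustrative sizes)
query = torch.randn(B, H, S, D)

out_old = torch.empty(B, S, H, D)                   # old: fresh contiguous (B, S, H, D) buffer
out_new = torch.empty_like(query).transpose(1, 2)   # new: query's layout, viewed as (B, S, H, D)

print(out_old.shape, out_old.stride())      # torch.Size([2, 8, 4, 16]) (512, 64, 16, 1)
print(out_new.shape, out_new.stride())      # torch.Size([2, 8, 4, 16]) (512, 16, 128, 1)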

build_variables.bzl

Lines changed: 1 addition & 0 deletions
@@ -835,6 +835,7 @@ libtorch_python_core_sources = [
     "torch/csrc/dynamo/extra_state.cpp",
     "torch/csrc/dynamo/framelocals_mapping.cpp",
     "torch/csrc/dynamo/guards.cpp",
+    "torch/csrc/dynamo/utils.cpp",
     "torch/csrc/dynamo/init.cpp",
     "torch/csrc/functorch/init.cpp",
     "torch/csrc/fx/node.cpp",

docs/source/notes/serialization.rst

Lines changed: 1 addition & 0 deletions
@@ -398,3 +398,4 @@ The following utility functions are related to serialization:
 .. autofunction:: clear_safe_globals
 .. autofunction:: get_safe_globals
 .. autoclass:: safe_globals
+.. autoclass:: skip_data
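The new entry documents the skip_data class in the serialization notes. A minimal usage sketch, assuming torch.serialization.skip_data is the context manager this docs entry refers to (it writes checkpoint structure without the tensor storage bytes); treat it as illustrative rather than authoritative:

import torch

# Hedged sketch: save a checkpoint "skeleton" without writing storage bytes,
# assuming the torch.serialization.skip_data context manager behaves as documented.
state = {"weight": torch.randn(4, 4)}
with torch.serialization.skip_data():
    torch.save(state, "skeleton.pt")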

test/distributed/_tensor/test_dtensor.py

Lines changed: 36 additions & 2 deletions
@@ -15,6 +15,7 @@
     init_device_mesh,
 )
 from torch.distributed._tensor.debug import CommDebugMode
+from torch.distributed._tensor.experimental import implicit_replication
 from torch.distributed._tensor.placement_types import (
     DTensorSpec,
     Partial,
@@ -778,8 +779,6 @@ def test_implicit_replication(self):
         local_tensor1 = torch.ones(4, 3)
         sharded_dtensor = DTensor.from_local(local_tensor1, mesh, [Shard(0)])

-        from torch.distributed._tensor.experimental import implicit_replication
-
         with implicit_replication():
             # We put the scalar tensor as the left operand so we can test out
             # when a non-dtensor is the arg in the args list.
@@ -816,6 +815,41 @@ def add_scalar_tensor_with_dtensor():
             (numel_1_tensor + sharded_dtensor).to_local(), numel_1_tensor + local_tensor
         )

+    @with_comms
+    def test_implicit_replication_for_foreach_ops(self):
+        mesh = init_device_mesh(
+            self.device_type, (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
+        )
+        global_tensor1 = torch.randn(4, 2)
+        dtensor_2d = distribute_tensor(global_tensor1, mesh, [Shard(0), Shard(1)])
+        self.assertEqual(dtensor_2d.full_tensor(), global_tensor1)
+        global_tensor2 = torch.randn(4)
+        dtensor_1d = distribute_tensor(global_tensor2, mesh["dp"], [Shard(0)])
+        dtensor_list = [dtensor_2d, dtensor_1d]
+
+        # Check that without implicit replication, a cross-mesh error is raised.
+        with self.assertRaisesRegex(
+            RuntimeError, "DTensor does not support cross-mesh operation yet!"
+        ):
+            torch._foreach_mul(dtensor_list, 2.0)
+
+        # Check that the DTensor results match the plain-tensor results.
+        with implicit_replication():
+            torch._foreach_mul_(dtensor_list, 2.0)
+            self.assertEqual(dtensor_list[0].full_tensor(), global_tensor1 * 2.0)
+            self.assertEqual(dtensor_list[1].full_tensor(), global_tensor2 * 2.0)
+
+        mesh_1d = DeviceMesh.from_group(mesh["tp"].get_group(), self.device_type)
+        dtensor_1d = distribute_tensor(global_tensor2, mesh_1d, [Shard(0)])
+        dtensor_list = [dtensor_2d, dtensor_1d]
+
+        # Check that even with implicit replication, a cross-mesh error is raised
+        # when the device meshes do not belong to the same root mesh.
+        with self.assertRaisesRegex(
+            RuntimeError, "DTensor does not support cross-mesh operation yet!"
+        ):
+            torch._foreach_mul_(dtensor_list, 2.0)
+
     @with_comms
     def test_metadata_consistency_check(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
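Outside the test harness, the implicit_replication pattern that the new test extends to foreach ops looks roughly like the sketch below. It uses a single-rank gloo group so it runs without a launcher; the mesh shape, tensor sizes, and port are illustrative only.

import torch
import torch.distributed as dist
from torch.distributed._tensor import Shard, distribute_tensor, init_device_mesh
from torch.distributed._tensor.experimental import implicit_replication

# Single-process stand-in for the multi-rank setup used in the test.
dist.init_process_group(
    "gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
)
mesh = init_device_mesh("cpu", (1,))
dtensor = distribute_tensor(torch.randn(4, 2), mesh, [Shard(0)])

with implicit_replication():
    # A plain numel-1 tensor is treated as replicated across the mesh,
    # so it can be mixed with DTensors, including in foreach-style ops.
    out = torch.tensor(1.0) + dtensor
    torch._foreach_mul_([dtensor], 2.0)

dist.destroy_process_group()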

test/distributed/_tensor/test_dtensor_ops.py

Lines changed: 0 additions & 3 deletions
@@ -191,9 +191,6 @@ def wrapped(fn):
     xfail("index_reduce", "amin"),
     xfail("index_select"),
     xfail("isin"),
-    xfail("isinf"),
-    xfail("isneginf"),
-    xfail("isposinf"),
     xfail("kthvalue"),
     xfail("linalg.cholesky"),
     xfail("linalg.cholesky_ex"),
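Removing these xfail entries means isinf, isneginf, and isposinf are now expected to pass under DTensor. A quick hedged sanity check on a single-rank gloo mesh (the setup mirrors the sketch above; sizes and port are illustrative):

import torch
import torch.distributed as dist
from torch.distributed._tensor import Shard, distribute_tensor, init_device_mesh

dist.init_process_group(
    "gloo", init_method="tcp://127.0.0.1:29501", rank=0, world_size=1
)
mesh = init_device_mesh("cpu", (1,))
dtensor = distribute_tensor(
    torch.tensor([1.0, float("inf"), -float("inf")]), mesh, [Shard(0)]
)

# Each op should return a DTensor whose full tensor matches the plain-tensor result.
for op in (torch.isinf, torch.isposinf, torch.isneginf):
    print(op.__name__, op(dtensor).full_tensor())

dist.destroy_process_group()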

0 commit comments