Tests Generalization for multiple accelerator devices (#139749) · pytorch/pytorch@95b41d2 · GitHub

Commit 95b41d2

rahulsingh-intel authored and pytorchmergebot committed
Tests Generalization for multiple accelerator devices (#139749)
Motivation: Generalize unit tests so that they can be executed on CUDA and non-CUDA devices. Changes: General changes in the common_dtensor module for device type generalization so that tests can also be executed on non-CUDA devices. Pull Request resolved: #139749. Approved by: https://github.com/kwen2501
1 parent 1800f5f commit 95b41d2
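At a glance, the pattern applied across the four files below: availability flags from torch.testing._internal.common_utils (TEST_CUDA, TEST_HPU) select the device string and process-group backend once, and the tests reference those instead of hard-coding "cuda"/"nccl". A condensed sketch of that selection logic, distilled from the diffs that follow:

# Device/backend selection pattern used throughout this PR (condensed sketch).
from torch.testing._internal.common_utils import TEST_CUDA, TEST_HPU

if TEST_CUDA:
    DEVICE_TYPE, PG_BACKEND = "cuda", "nccl"
elif TEST_HPU:
    DEVICE_TYPE, PG_BACKEND = "hpu", "hccl"
else:
    DEVICE_TYPE, PG_BACKEND = "cpu", "gloo"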

File tree: 4 files changed, +45 -25 lines changed


test/distributed/tensor/test_dtensor_compile.py

Lines changed: 9 additions & 3 deletions
@@ -35,11 +35,14 @@
     RowwiseParallel,
 )
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import get_devtype
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
     skipIfTorchDynamo,
+    TEST_CUDA,
+    TEST_HPU,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
@@ -52,6 +55,9 @@
 from torch.utils.checkpoint import checkpoint
 
 
+dev_type = torch.device(get_devtype())
+
+
 class SimpleModel(nn.Module):
     def __init__(self, device):
         super().__init__()
@@ -102,7 +108,7 @@ def tearDown(self):
 
     @property
     def device_type(self) -> str:
-        return "cuda" if torch.cuda.is_available() else "cpu"
+        return "cuda" if TEST_CUDA else "hpu" if TEST_HPU else "cpu"
 
     @property
     def world_size(self) -> int:
@@ -907,7 +913,7 @@ def test_2d_fsdp_tp_compile(self):
         tp_model = parallelize_module(model, twod_mesh["tp"], parallelize_plan)
         eager_2d = FSDP(
             tp_model,
-            device_id=self.rank,
+            device_id=dev_type.type,
             use_orig_params=True,
             device_mesh=twod_mesh["dp"],
         )
@@ -919,7 +925,7 @@ def test_2d_fsdp_tp_compile(self):
         )
         fsdp_2d = FSDP(
             tp_model2,
-            device_id=self.rank,
+            device_id=dev_type.type,
             use_orig_params=True,
             device_mesh=twod_mesh["dp"],
         )
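For reference, this file now resolves the device once at module scope via get_devtype() and hands its type string to FSDP in place of the CUDA rank ordinal. A condensed, hedged restatement of the change (tp_model and dp_mesh are placeholders, not names from the test):

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.testing._internal.common_fsdp import get_devtype

dev_type = torch.device(get_devtype())  # e.g. torch.device("cuda") or torch.device("hpu")

# dev_type.type is the device-type string ("cuda", "hpu", ...), so the wrapper is
# no longer pinned to a CUDA device index via self.rank.
fsdp_model = FSDP(
    tp_model,                 # placeholder: tensor-parallel model built by the test
    device_id=dev_type.type,
    use_orig_params=True,
    device_mesh=dp_mesh,      # placeholder: the "dp" slice of the 2D device mesh
)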

test/distributed/tensor/test_random_ops.py

Lines changed: 11 additions & 8 deletions
@@ -19,7 +19,7 @@
 )
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import run_tests, TEST_HPU
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     skip_if_lt_x_gpu,
@@ -28,6 +28,9 @@
 )
 
 
+TYPE_DEVICE = "hpu" if TEST_HPU else "cuda"
+
+
 class DistTensorRandomInitTest(DTensorTestBase):
     def _run_init_op(self, init_op, *args, **kwargs):
         device_mesh = self.build_device_mesh()
@@ -47,7 +50,7 @@ def _run_init_op(self, init_op, *args, **kwargs):
            self.assertEqual(local_tensor_clone, dtensor.to_local())
        else:
            # create DTensor from Tensor
-            _tensor = torch.empty(*input_size, device="cuda")
+            _tensor = torch.empty(*input_size, device=TYPE_DEVICE)
            dtensor = distribute_tensor(_tensor, device_mesh, [Shard(1)])
 
            # DTensor random init
@@ -242,15 +245,15 @@ class DistTensorRandomOpTest(DTensorTestBase):
     @with_comms
     @skip_unless_torch_gpu
     def test_rng_tracker_init(self):
-        torch.cuda.manual_seed(self.rank)
-        object_list = [torch.cuda.initial_seed()]
+        torch.manual_seed(self.rank)
+        object_list = [torch.initial_seed()]
         broadcast_object_list(object_list)
         seed_from_rank_0 = int(object_list[0])
 
         device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         # seed synchronization happens after the first `distribute_tensor` call
         distribute_tensor(
-            torch.empty([self.world_size], device="cuda"), device_mesh, [Shard(0)]
+            torch.empty([self.world_size], device=TYPE_DEVICE), device_mesh, [Shard(0)]
         )
         self.assertEqual(seed_from_rank_0, random._rng_tracker.get_seed("parallel-rng"))
 
@@ -340,13 +343,13 @@ def test_deterministic_dropout_1d(self):
         # execution the default random seed will be different (a random value).
         # The DTensor random ops will use the same random seed even though the
         # torch random generator keeps different seeds on ranks.
-        torch.cuda.manual_seed(self.rank)
+        torch.manual_seed(self.rank)
         # TODO: add test before/after enabling distribute region
         device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         size = [4, 4]
 
         dtensor = distribute_tensor(
-            torch.empty(*size, device="cuda"), device_mesh, [Shard(1)]
+            torch.empty(*size, device=TYPE_DEVICE), device_mesh, [Shard(1)]
         )
 
         # a random op call shifts the offset
@@ -400,7 +403,7 @@ def test_deterministic_rand_1d(self):
             local_tensor[other_slice, :],
         )
 
-        torch.cuda.manual_seed(self.rank)
+        torch.manual_seed(self.rank)
         dtensor = fn(size, device_mesh=device_mesh, placements=[Replicate()])
         local_tensor = funcol.all_gather_tensor(
             dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
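The seeding change leans on torch.manual_seed seeding the default generators for all device backends, not only the CPU one, so the torch.cuda.* variants are no longer needed. A minimal stand-alone sketch of the resulting pattern, assuming one of the accelerators is present and with TYPE_DEVICE mirroring the constant added above:

import torch
from torch.testing._internal.common_utils import TEST_HPU

TYPE_DEVICE = "hpu" if TEST_HPU else "cuda"  # as defined in the diff above

torch.manual_seed(0)                       # seeds CPU and accelerator generators
seed = torch.initial_seed()                # initial seed of the default generator
t = torch.empty(4, 4, device=TYPE_DEVICE)  # no hard-coded "cuda" string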

test/distributed/tensor/test_redistribute.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 from torch.distributed.device_mesh import init_device_mesh
 from torch.distributed.tensor._collective_utils import shard_dim_alltoall
 from torch.distributed.tensor.debug import CommDebugMode
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
@@ -366,7 +366,7 @@ def test_redistribute_shard_dim_change(self):
            local_out_dt = out_dt.to_local()
            local_expected_dt = expected_dt.to_local()
            self.assertEqual(out_dt.to_local(), expected_dt.to_local())
-            if self.device_type == "cuda":
+            if TEST_HPU or TEST_CUDA:
                self.assertEqual(
                    comm_mode.get_comm_counts()[
                        torch.ops._dtensor.shard_dim_alltoall
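The guard in isolation: the shard_dim_alltoall count recorded by CommDebugMode is only asserted when an accelerator backend is in use. A minimal sketch; comm_mode is assumed to be the CommDebugMode context from the surrounding test, and expected_count is a placeholder for the value elided in the diff above:

import torch
from torch.testing._internal.common_utils import TEST_CUDA, TEST_HPU

if TEST_HPU or TEST_CUDA:
    # CommDebugMode tallies collectives per op; the count is checked on accelerators only.
    alltoall_calls = comm_mode.get_comm_counts()[torch.ops._dtensor.shard_dim_alltoall]
    assert alltoall_calls == expected_count  # expected_count: placeholder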

torch/testing/_internal/distributed/_tensor/common_dtensor.py

Lines changed: 23 additions & 12 deletions
@@ -32,6 +32,10 @@
     RowwiseParallel,
     SequenceParallel,
 )
+from torch.testing._internal.common_utils import (
+    TEST_HPU,
+    TEST_CUDA,
+)
 from torch.testing._internal.common_distributed import (
     MultiProcessTestCase,
     MultiThreadedTestCase,
@@ -41,17 +45,26 @@
 )
 
 from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec
-
-DEVICE_TYPE = (
-    "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else "cpu"
-)
+from torch._utils import _get_device_module
+
+if TEST_CUDA:
+    DEVICE_TYPE = "cuda"
+    PG_BACKEND = "nccl"
+    DEVICE_COUNT = _get_device_module("cuda").device_count()
+elif TEST_HPU:
+    DEVICE_TYPE = "hpu"
+    PG_BACKEND = "hccl"
+    DEVICE_COUNT = _get_device_module("hpu").device_count()
+else:
+    DEVICE_TYPE = "cpu"
+    PG_BACKEND = "gloo"
 
 NUM_DEVICES = 4
 
 # We use this as a proxy for "multiple GPUs exist"
-if torch.cuda.is_available() and torch.cuda.device_count() > 1:
+if TEST_CUDA and DEVICE_COUNT > 1:
     # when we actually have multiple GPUs, relax the requirement to smaller counts.
-    NUM_DEVICES = min(NUM_DEVICES, torch.cuda.device_count())
+    NUM_DEVICES = min(NUM_DEVICES, DEVICE_COUNT)
 
 T = TypeVar("T")
 
@@ -311,7 +324,7 @@ def world_size(self) -> int:
 
     @property
     def backend(self) -> str:
-        backend = "nccl" if self.device_type == "cuda" else "gloo"
+        backend = "nccl" if TEST_CUDA else "hccl" if TEST_HPU else "gloo"
         return backend
 
     def build_device_mesh(self) -> DeviceMesh:
@@ -321,7 +334,7 @@ def init_pg(self, eager_init) -> None:
         if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
             sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
 
-        if self.backend not in ["nccl", "gloo", "mpi", "cpu:gloo,cuda:nccl"]:
+        if self.backend not in ["nccl", "gloo", "mpi", "cpu:gloo,cuda:nccl", "hccl"]:
             raise RuntimeError(f"Backend {self.backend} not supported!")
 
         device_id = None
@@ -330,7 +343,6 @@ def init_pg(self, eager_init) -> None:
            torch.cuda.set_device(self.rank)
        # we only need to set device_id for nccl backend with eager init
        device_id = torch.device(f"{self.device_type}:{self.rank}") if eager_init else None
-
        # For nccl backend, bind the device to the process if device_id is not None
        # so the nccl communicator is immediately formed and we can use `ncclCommSplit`
        # for form subgroup to avoid unnecesssary overhead.
@@ -342,11 +354,10 @@ def init_pg(self, eager_init) -> None:
            device_id=device_id,
        )
 
-
    def destroy_pg(self) -> None:
        # Wait for all ranks to reach here before starting shutdown.
        # FIXME dist.barrier deadlocks with multiple threads and NCCL: https://github.com/pytorch/pytorch/issues/95895
-        # dist.all_reduce(torch.zeros((1,), device="cuda" if torch.cuda.is_available() else "cpu"))
+        # dist.all_reduce(torch.zeros((1,), device="cuda" if TEST_CUDA else "cpu"))
        # FIXME can't use the above all_reduce as it causes hangs on bionic and focal. It hangs:
        # test_dtensor.py -- DTensorMeshTest.test_dtensor_device_mesh_device_conversion
        dist.barrier()
@@ -383,7 +394,7 @@ def wrapper(
        self, *args: tuple[object], **kwargs: Dict[str, Any]  # type: ignore[misc]
    ) -> None:
        # if enough GPU we can use GPU, otherwise we fallback to CPU
-        if not torch.cuda.is_available() or torch.cuda.device_count() < self.world_size:
+        if not TEST_CUDA or torch.cuda.device_count() < self.world_size:
            self.device_type = "cpu"
        else:
            self.device_type = DEVICE_TYPE
0 commit comments