[DTensor][random] defer DTensor RNG state sync until first random op … · pytorch/pytorch@13eadba · GitHub

Commit 13eadba

XilunWu authored and aditew01 committed
[DTensor][random] defer DTensor RNG state sync until first random op call or manual_seed call; support more flexible OffsetBasedRNGTracker init (#147025)
Resolves #146767. May also resolve #147584.

### Summary
This PR removes the RNG tracker init from the `distribute_tensor` call for the following reasons:
1. If the user does not use random ops on DTensor, there is no need to init DTensor RNG, which currently requires a CUDA device to be present.
2. This complies with the 0-communication semantic of the `src_data_rank=None` shard distribution.

In addition, `OffsetBasedRNGTracker` now only accepts a `DeviceMesh` argument to its constructor.

### Consequence
DTensor RNG initialization is deferred until the first DTensor random op call or `torch.distributed.tensor.random.manual_seed`.

### Test
`pytest test/distributed/tensor/test_random_ops.py`
`pytest test/distributed/tensor/parallel/test_tp_random_state.py`
`pytest test/distributed/tensor/parallel/test_tp_style.py`

Differential Revision: [D70201856](https://our.internmc.facebook.com/intern/diff/D70201856)

Pull Request resolved: #147025
Approved by: https://github.com/kwen2501
1 parent a119ca3 commit 13eadba
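A minimal sketch of the deferred initialization this commit introduces, mirroring the updated `test_rng_tracker_init` test below. The mesh shape, tensor size, and the `dtensor_random` import alias are illustrative assumptions, and a CUDA-backed process group is assumed to be initialized already:

```python
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Shard
from torch.distributed.tensor import _random as dtensor_random  # same private module the tests use

# Assumption: torch.distributed is already initialized with a CUDA-capable backend.
mesh = init_device_mesh("cuda", (dist.get_world_size(),))

# distribute_tensor alone no longer constructs the RNG tracker ...
dt = distribute_tensor(torch.empty(1024, device="cuda"), mesh, [Shard(0)])
assert dtensor_random._rng_tracker is None  # assuming no DTensor random op ran earlier in this process

# ... the tracker is created (and rank 0's RNG state broadcast) only on the first
# DTensor random op, or when torch.distributed.tensor.random.manual_seed is called.
dt.uniform_(0, 1)
assert dtensor_random._rng_tracker is not None
```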

File tree

6 files changed: +75 -48 lines changed

test/distributed/tensor/parallel/test_tp_random_state.py

Lines changed: 4 additions & 2 deletions

@@ -49,7 +49,7 @@ def test_model_init(self):
         self.assertEqual(dp_rank, self.rank // tp_size)
         self.assertEqual(tp_rank, self.rank % tp_size)

-        for enable_distribute_flag in [False, True]:
+        for enable_distribute_flag in [True, False]:
             # a local model on meta device
             model = MLPModule(device="meta")
             # the col-wise parallel style shards the weight over tensor dim 0
@@ -68,7 +68,9 @@ def test_model_init(self):
             torch.cuda.manual_seed(dp_rank)

             # disable/enable parallel RNG feature
-            random._rng_tracker.distribute_region_enabled = enable_distribute_flag
+            if random._rng_tracker:
+                random._rng_tracker.distribute_region_enabled = enable_distribute_flag
+
             self.assertTrue(model_tp.net1.weight.is_meta)
             # initialize the model's local shard
             model_tp.to_empty(device=self.device_type)

test/distributed/tensor/parallel/test_tp_style.py

Lines changed: 2 additions & 0 deletions

@@ -346,6 +346,8 @@ def test_prepare_module_output(self):
     @with_comms
     def test_sequence_parallel_style(self):
         mesh = init_device_mesh(self.device_type, (self.world_size,))
+        # early init RNG tracker
+        torch.distributed.tensor._random.manual_seed(0, mesh)

         comm_mode = CommDebugMode()
         batch, N, embedding_dim = 20, 8, 12

test/distributed/tensor/test_random_ops.py

Lines changed: 29 additions & 17 deletions

@@ -100,34 +100,41 @@ def test_meta_tensor_init(self):
         meta_dtensor = distribute_tensor(
             torch.empty(*size, device="meta"), device_mesh, [Replicate()]
         )
+
+        # the tensor slice on the current rank
+        self_slice = slice(1024 * self.rank, 1024 * self.rank + 1024)
+
+        # Test 1: enable the distribute region for RNG (by default)
         self.assertTrue(meta_dtensor.is_meta)
+        # Tensor meta init
         dtensor = torch.empty_like(meta_dtensor, device=self.device_type)
-
-        # disable the distribute region for RNG
-        random._rng_tracker.distribute_region_enabled = False
         dtensor.uniform_()
+        # check `distribute_region_enabled` is set to True by default
+        self.assertTrue(random._rng_tracker.distribute_region_enabled)

         # allgather the local tensors
         local_tensor = funcol.all_gather_tensor(
             dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
         )

         # compare with local tensors from other ranks
-        self_slice = slice(1024 * self.rank, 1024 * self.rank + 1024)
         for other_rank in range(self.world_size):
-            # the RNG result on each rank differs even they're supposed
-            # to be replicated
+            # the RNG result on each rank are the same because they're replicated
             if self.rank != other_rank:
+                # other rank should have an identical local tensor
                 other_slice = slice(1024 * other_rank, 1024 * other_rank + 1024)
-                self.assertNotEqual(
+                self.assertEqual(
                     local_tensor[self_slice, :], local_tensor[other_slice, :]
                 )

-        # enable the distribute region for RNG
-        random._rng_tracker.distribute_region_enabled = True
+        # Test 2: disable the distribute region for RNG
         self.assertTrue(meta_dtensor.is_meta)
+        # Tensor meta init
         dtensor = torch.empty_like(meta_dtensor, device=self.device_type)
+        random._rng_tracker.distribute_region_enabled = False
         dtensor.uniform_()
+        # check `distribute_region_enabled` is set to False
+        self.assertTrue(not random._rng_tracker.distribute_region_enabled)

         # allgather the local tensors
         local_tensor = funcol.all_gather_tensor(
@@ -136,11 +143,11 @@ def test_meta_tensor_init(self):

         # compare with local tensors from other ranks
         for other_rank in range(self.world_size):
-            # the RNG result on each rank are the same because they're replicated
+            # the RNG result on each rank differs even they're supposed
+            # to be replicated
             if self.rank != other_rank:
-                # other rank should have an identical local tensor
                 other_slice = slice(1024 * other_rank, 1024 * other_rank + 1024)
-                self.assertEqual(
+                self.assertNotEqual(
                     local_tensor[self_slice, :], local_tensor[other_slice, :]
                 )

@@ -251,10 +258,15 @@ def test_rng_tracker_init(self):
         seed_from_rank_0 = int(object_list[0])

         device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
-        # seed synchronization happens after the first `distribute_tensor` call
-        distribute_tensor(
+        # seed synchronization now does NOT happen after the first `distribute_tensor`
+        # call
+        dt = distribute_tensor(
             torch.empty([self.world_size], device=TYPE_DEVICE), device_mesh, [Shard(0)]
         )
+        self.assertTrue(random._rng_tracker is None)
+        # seed synchronization only happens after `manual_seed` or the first DTensor
+        # random op call
+        dt.uniform_(0, 1)
         self.assertEqual(seed_from_rank_0, random._rng_tracker.get_seed("parallel-rng"))

     @with_comms
@@ -459,6 +471,9 @@ def test_deterministic_uniform_2d(self):
         for placements, shard_index in zip(placements_list, shard_index_list):
             dtensor = dtensor.redistribute(device_mesh, placements)

+            # random op call
+            dtensor.uniform_(0, 1)
+
             # check shard information is correct
             shard_coord = [
                 coordinate[mesh_dim] if mesh_dim >= 0 else 0
@@ -503,9 +518,6 @@ def test_deterministic_uniform_2d(self):

             local_shard_comb = itertools.product(*local_shard_list_on_dim)

-            # random op call
-            dtensor.uniform_(0, 1)
-
             # the local shard
             local_tensor = dtensor.to_local()
             # allgather the local tensors

torch/distributed/tensor/_api.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@
1414
from torch.distributed.device_mesh import _mesh_resources, DeviceMesh
1515
from torch.distributed.tensor._collective_utils import check_tensor_meta, mesh_broadcast
1616
from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
17-
from torch.distributed.tensor._random import (
18-
is_rng_supported_mesh,
19-
OffsetBasedRNGTracker,
20-
)
2117
from torch.distributed.tensor._redistribute import (
2218
Redistribute,
2319
redistribute_local_tensor,
@@ -705,13 +701,6 @@ def distribute_tensor(
705701
msg = "To use DTensor API with xla, you must install the torch_xla package!"
706702
raise ImportError(msg) from e
707703

708-
# instantiate a RNG tracker if haven't. By default DTensor uses an
709-
# OffsetBasedRNGTracker to perform random operators.
710-
# TODO: the value assignment to global variable is not the ideal solution
711-
# we can replace it in future.
712-
if not random._rng_tracker and is_rng_supported_mesh(device_mesh):
713-
random._rng_tracker = OffsetBasedRNGTracker(device_type)
714-
715704
if not tensor.is_leaf:
716705
raise RuntimeError(
717706
"`distribute_tensor` should be used to distribute leaf tensors! but found non-leaf tensor!"
@@ -1025,7 +1014,7 @@ def _dtensor_init_helper( # type: ignore[no-untyped-def]
10251014
spec = DTensorSpec(device_mesh, tuple(placements), tensor_meta=tensor_meta)
10261015

10271016
if random.is_rng_supported_mesh(device_mesh) and not random._rng_tracker:
1028-
random._rng_tracker = random.OffsetBasedRNGTracker()
1017+
random._rng_tracker = random.OffsetBasedRNGTracker(device_mesh)
10291018

10301019
assert random._rng_tracker is not None
10311020
with random._rng_tracker._distribute_region(spec):

torch/distributed/tensor/_dispatch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def dispatch(
198198
if not random._rng_tracker and is_rng_supported_mesh(mesh):
199199
# Default to `OffsetBasedRNGTracker` if the parallelism API
200200
# did not already construct one
201-
random._rng_tracker = random.OffsetBasedRNGTracker(mesh.device_type)
201+
random._rng_tracker = random.OffsetBasedRNGTracker(mesh)
202202

203203
first_arg, first_local_arg = cast(dtensor.DTensor, args[0]), cast(
204204
torch.Tensor, local_tensor_args[0]

torch/distributed/tensor/_random.py

Lines changed: 38 additions & 16 deletions

@@ -68,19 +68,18 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None:
     ``manual_seed`` will throw an error.
     Current implementation only supports a GPU device mesh.
     """
-    device_handle = _get_device_handle(device_mesh.device_type)
-    if not device_handle:
-        raise NotImplementedError(
-            f"DTensor randomness only supports cuda/cuda-like device type, but got {device_mesh.device_type}"
+    if not is_rng_supported_mesh(device_mesh):
+        warnings.warn(
+            "DTensor manual_seed() may not have complete support "
+            f"on {device_mesh.device_type} device mesh"
         )
+        return

     # instantiate a RNG tracker if haven't. By default DTensor uses an
     # OffsetBasedRNGTracker to perform random operators.
     global _rng_tracker
     if not _rng_tracker:
-        _rng_tracker = OffsetBasedRNGTracker(
-            device_mesh.device_type, run_state_sync=False
-        )
+        _rng_tracker = OffsetBasedRNGTracker(device_mesh, run_state_sync=False)

     # the current rank is in mesh
     if device_mesh.get_coordinate() is not None:
@@ -102,16 +101,16 @@ class _RNGStateTracker:
     a random op (an operator that calls RNG).
     """

-    def __init__(self, device_type: str = "cuda"):
-        self._device_type = device_type
-        self._device_handle = _get_device_handle(device_type)
+    def __init__(self, device: torch.device):
+        self._device = device
+        self._device_handle = _get_device_handle(self._device.type)
         if not (self._device_handle and self._device_handle.is_available()):
             raise RuntimeError(
-                f"{self.__class__.__name__} instantiation requires the presence of CUDA/CUDA-like device"
+                f"{self.__class__.__name__} instantiation requires the presence of "
+                f"{device.type} device but couldn't find."
             )

         self._states: dict[str, Tensor] = {}
-        self._devices = [self._device_handle.current_device()]
         self._use_distribute_region = True

     @property
@@ -159,11 +158,25 @@ class OffsetBasedRNGTracker(_RNGStateTracker):
     This subclass of ``_RNGStateTracker`` defines the default policy of how RNG states
     should be shared and synchronized among all ranks to respect the semantics of DTensor
     random operators.
+
+    note: _RNGStateTracker only supports cuda/cuda-like device
     """

-    def __init__(self, device_type: str = "cuda", run_state_sync: bool = True):
-        super().__init__(device_type)
-        rng_state = self._device_handle.get_rng_state().to(device_type)
+    def __init__(
+        self,
+        device_mesh: DeviceMesh,
+        run_state_sync: bool = True,
+    ):
+        super().__init__(_resolve_device(device_mesh=device_mesh))
+        assert self._device_handle is not None
+        # DTensor RNG tracker so far only supports CUDA/CUDA-like devices
+        if self._device.type != "cuda":
+            raise RuntimeError(
+                f"{self.__class__.__name__} instantiation requires the presence of "
+                f"CUDA/CUDA-like device. Got {self._device.type} instead."
+            )
+
+        rng_state = self._device_handle.get_rng_state().to(self._device)
         if run_state_sync:
             # synchronize RNG state using rank 0's current one
             dist.broadcast(rng_state, 0)
@@ -185,7 +198,8 @@ def _distribute_region(self, spec: DTensorSpec):
         if self.distribute_region_enabled:
             old_offset = self.get_offset("parallel-rng")
             self._set_pre_op_offset(spec)
-            with torch.random.fork_rng(self._devices, device_type=self._device_type):
+            with torch.random.fork_rng(devices=[self._device]):
+                assert self._device_handle is not None
                 self._device_handle.set_rng_state(self.rng_states["parallel-rng"])
                 try:
                     yield  # execute the region code
@@ -366,3 +380,11 @@ def _calc_shard_linear_idx(
         shard_coord_stride *= size

     return shard_linear_idx
+
+
+def _resolve_device(device_mesh: DeviceMesh) -> torch.device:
+    device_type = device_mesh.device_type
+    device_handle = _get_device_handle(device_type)
+    assert device_handle is not None
+    device_idx = device_mesh.get_rank() % device_handle.device_count()
+    return torch.device(f"{device_type}:{device_idx:d}")
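Putting the `_random.py` changes together: a user who wants the tracker up front, for example to seed it deterministically before any random op as the updated `test_sequence_parallel_style` does, can call `manual_seed` with the mesh. The snippet below is a sketch under the assumption that a CUDA device mesh is available; the mesh shape and tensor sizes are illustrative:

```python
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Shard
import torch.distributed.tensor.random as dtensor_random  # public path named in the commit message

# Assumption: a CUDA-backed process group is already initialized.
mesh = init_device_mesh("cuda", (dist.get_world_size(),))

# Explicitly seeding constructs an OffsetBasedRNGTracker from the mesh
# (run_state_sync=False, since all ranks agree on the seed) instead of
# waiting for the first DTensor random op.
dtensor_random.manual_seed(1234, mesh)

dt = distribute_tensor(torch.empty(8, 8, device="cuda"), mesh, [Shard(0)])
dt.uniform_(0, 1)  # uses the already-initialized parallel RNG
```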
