Update base for Update on "[inductor][cpp] GEMM template (infra and fp32)" · pytorch/pytorch@8035bb4 · GitHub

Commit 8035bb4

Author: Jiong Gong (committed)
Update base for Update on "[inductor][cpp] GEMM template (infra and fp32)"
This PR adds the Cpp template infrastructure and the initial FP32 GEMM template. See RFC #125683 for more background info.

1. Cpp template infrastructure
   Similar template abstractions to the CUTLASS template, i.e., `CppTemplate`, `CppTemplateKernel`, and `CppTemplateBuffer`, plus the `CppMicroGemm` micro-kernel abstraction that Cpp GEMM templates can use.

2. Initial FP32 GEMM template
   This adds a GEMM template implementation, `CppPackedGemmTemplate`, that supports GEMM with a constant weight (`B`), requiring `N` to be a multiple of the register blocking size while allowing static or dynamic sizes for `M` (the batch dim) of `A`. The `B` matrix is prepacked. This is a typical setting for inference workloads. The template handles the thread decomposition (via `thread_blocking`) and cache blocking (via `cache_blocking`), then invokes `CppMicroGemm`, which handles register blocking, instruction selection, and other CPU architecture-specific optimizations. A `CppMicroGemmFP32Vec` micro-kernel implementation is provided for fp32 matmuls, implemented with the ATen Vec abstraction.

3. Correctness and performance
   The changes have been validated with fp32 inference on the three benchmark suites (torchbench, huggingface and timm_models) with both static and dynamic shapes. Since this is an initial implementation, we are still working on further performance improvements in follow-up PRs, including kernel optimizations as well as fusions. Perf gains over the ATen kernels (implemented with MKL) are only observed on a selective set of models, and are more obvious with dynamic shapes since MKL only supports packed GEMM for static shapes. Details below.

Static shapes

| Benchmark | torchbench | huggingface | timm_models |
| --- | --- | --- | --- |
| Multi-threaded (baseline) | 1.47x | 1.36x | 1.91x |
| Multi-threaded (max-autotune) | 1.47x | 1.36x | 1.92x |
| Single-threaded (baseline) | 1.56x | 1.19x | 1.51x |
| Single-threaded (max-autotune) | 1.56x | 1.19x | 1.52x |

Key models being sped up: drq: 1.14x, soft_act: 1.12x, cait_m36_384: 1.18x

Dynamic shapes

| Benchmark | torchbench | huggingface | timm_models |
| --- | --- | --- | --- |
| Multi-threaded (baseline) | 1.43x | 1.28x | 1.85x |
| Multi-threaded (max-autotune) | 1.47x | 1.28x | 1.85x |
| Single-threaded (baseline) | 1.55x | 1.20x | 1.51x |
| Single-threaded (max-autotune) | 1.56x | 1.19x | 1.53x |

Key models being sped up: BERT_pytorch: 1.22x, pyhpc_turbulent: 1.13x, soft_actor_critic: 1.77x, BlenderbotForCausalLM: 1.09x, cait_m36_384: 1.17x

cc voznesenskym penguinwu EikanWang Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
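To make the division of labor described in the commit message concrete, here is a minimal standalone sketch of the two-level decomposition. The tile sizes and the round-robin thread partitioning are illustrative placeholders, not the heuristics used by `CppPackedGemmTemplate`; in the template itself the inner tile work is done by the generated `CppMicroGemm` kernel rather than Python loops.

```python
# Illustrative sketch only: made-up tile sizes and a naive round-robin split,
# standing in for thread_blocking/cache_blocking + CppMicroGemm in the template.
from typing import List, Tuple

def cache_blocking(M: int, N: int, Mc: int = 64, Nc: int = 96) -> List[Tuple[int, int]]:
    # Enumerate (m0, n0) tile origins small enough to keep A/B/C slices cache-resident.
    return [(m0, n0) for m0 in range(0, M, Mc) for n0 in range(0, N, Nc)]

def thread_blocking(tiles: List[Tuple[int, int]], num_threads: int) -> List[List[Tuple[int, int]]]:
    # Partition the output tiles across threads; each thread owns disjoint C tiles.
    return [tiles[t::num_threads] for t in range(num_threads)]

def micro_gemm(m0: int, n0: int, K: int, Kc: int = 256) -> None:
    # Placeholder for the register-blocked inner kernel (CppMicroGemm's role):
    # loop over K in Kc chunks and accumulate into the C tile held in registers.
    for k0 in range(0, K, Kc):
        pass  # vectorized FMAs over the (Mc x Kc) * (Kc x Nc) slice would go here

if __name__ == "__main__":
    M, N, K, num_threads = 512, 768, 1024, 8
    tiles = cache_blocking(M, N)
    for per_thread in thread_blocking(tiles, num_threads):  # one entry per thread
        for m0, n0 in per_thread:
            micro_gemm(m0, n0, K)
```

On the usage side, the template is exercised through Inductor's max-autotune path (e.g. `torch.compile(model, mode="max-autotune")`), which is the path the max-autotune rows in the tables above refer to.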
2 parents b4f772b + 5007312 commit 8035bb4

File tree: 4 files changed, +42 -7 lines changed


test/distributed/tensor/parallel/test_tp_style.py

Lines changed: 23 additions & 0 deletions
@@ -168,6 +168,29 @@ def test_rowwise_parallel_embedding(self):
             # no comm in bwd
             self.assertEqual(comm_mode.get_total_counts(), 1)
 
+        sharded_row_parallel = RowwiseParallel(
+            input_layouts=Replicate(), output_layouts=Shard(1)
+        )
+
+        rowwise_mod = parallelize_module(deepcopy(model), mesh, sharded_row_parallel)
+
+        inp_indices = torch.arange(8, device=self.device_type)
+        with comm_mode:
+            out = rowwise_mod(inp_indices)
+            # ensure output shard on the last dim
+            self.assertEqual(out.shape, (8, 16 // self.world_size))
+            # reduce scatter in fwd
+            self.assertEqual(comm_mode.get_total_counts(), 1)
+            self.assertEqual(
+                comm_mode.get_comm_counts()[c10d_functional.reduce_scatter_tensor], 1
+            )
+            out.sum().backward()
+            # allgather comm in bwd
+            self.assertEqual(comm_mode.get_total_counts(), 2)
+            self.assertEqual(
+                comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 1
+            )
+
     @with_comms
     def test_prepare_module_input(self):
         mesh = init_device_mesh(self.device_type, (self.world_size,))
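For context, the new assertions exercise the rowwise-sharded embedding path end to end: with `input_layouts=Replicate()` and `output_layouts=Shard(1)`, the forward pass reduce-scatters the partial embedding lookups and the backward pass all-gathers the gradient. Below is a minimal standalone sketch of the same plan outside the test harness; it assumes a distributed launcher such as `torchrun` has set up the usual environment variables, and everything apart from the names taken from the test above is illustrative.

```python
# Minimal sketch mirroring the added test; run under e.g. `torchrun --nproc-per-node=2`.
import os
from copy import deepcopy

import torch
from torch.distributed._tensor import Replicate, Shard
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import RowwiseParallel, parallelize_module

world_size = int(os.environ["WORLD_SIZE"])     # set by the launcher
mesh = init_device_mesh("cpu", (world_size,))  # 1-D mesh over all ranks

model = torch.nn.Embedding(16, 16)
plan = RowwiseParallel(input_layouts=Replicate(), output_layouts=Shard(1))
rowwise_mod = parallelize_module(deepcopy(model), mesh, plan)

inp_indices = torch.arange(8)    # replicated token ids
out = rowwise_mod(inp_indices)   # local shard of shape (8, 16 // world_size)
out.sum().backward()             # triggers the all-gather in backward
```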

torch/_inductor/runtime/runtime_utils.py

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,6 @@
 import time
 
 import torch
-from torch._inductor.utils import is_cpu_device
 
 
 def conditional_product(*args):
@@ -72,6 +71,8 @@ def get_max_y_grid():
 
 
 def do_bench(fn, fn_args, fn_kwargs, **kwargs):
+    from torch._inductor.utils import is_cpu_device
+
     args = list(fn_args)
     args.extend(fn_kwargs.values())
     if is_cpu_device(args):
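The change above moves the `is_cpu_device` import from module scope into `do_bench`. This is the standard deferred-import pattern: the dependency is only resolved the first time the function runs, which keeps importing `runtime_utils` cheap and avoids import-order problems between `torch._inductor` modules (the precise motivation isn't stated in the diff, so take that as a reading, not a quote). A generic, non-PyTorch sketch of the pattern:

```python
# Generic illustration of a deferred (function-local) import; names are made up.
def encode(payload: dict) -> str:
    # Imported on first call and then cached in sys.modules, not at module import time.
    import json
    return json.dumps(payload, sort_keys=True)

print(encode({"ok": True}))
```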

torch/distributed/_tensor/dispatch.py

Lines changed: 11 additions & 3 deletions
@@ -187,10 +187,18 @@ def default_tensor(spec: DTensorSpec) -> torch.Tensor:
                 # Default to `OffsetBasedRNGTracker` if the parallelism API
                 # did not already construct one
                 random._rng_tracker = random.OffsetBasedRNGTracker(mesh.device_type)
+
+            first_arg, first_local_arg = cast(dtensor.DTensor, args[0]), cast(
+                torch.Tensor, local_tensor_args[0]
+            )
+            rng_context = (
+                random._rng_tracker._distribute_region(first_arg._spec)
+                if random._rng_tracker and not first_local_arg.is_meta
+                else contextlib.nullcontext()
+            )
+
             # For DTensor random operator, run it within a distribute region
-            with random._rng_tracker._distribute_region(
-                cast(dtensor.DTensor, args[0])._spec
-            ) if random._rng_tracker else contextlib.nullcontext():
+            with rng_context:
                 local_results = op_call(*local_tensor_args, **op_info.local_kwargs)
         else:
             local_results = op_call(*local_tensor_args, **op_info.local_kwargs)
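The rewrite above precomputes `rng_context` once and keeps a single unconditional `with`, additionally skipping the RNG distribute region when the first local argument is a meta tensor (presumably because meta tensors carry no data, so there is no RNG state worth advancing). A small generic sketch of this select-then-`with` pattern, with made-up names:

```python
# Generic sketch of choosing between a real context manager and a no-op one,
# mirroring the rng_context refactor above; rng_region/run are made-up names.
import contextlib

@contextlib.contextmanager
def rng_region(name):
    print(f"enter {name}")  # stands in for _distribute_region's bookkeeping
    yield
    print(f"exit {name}")

def run(op, region=None, is_meta=False):
    # Precompute the context once, then use a single unconditional `with`.
    ctx = region if region is not None and not is_meta else contextlib.nullcontext()
    with ctx:
        return op()

print(run(lambda: 1 + 1))                                            # no region -> nullcontext
print(run(lambda: 2 + 2, region=rng_region("rand")))                 # enters the region
print(run(lambda: 3 + 3, region=rng_region("rand"), is_meta=True))   # meta input skips it
```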

torch/distributed/_tensor/op_schema.py

Lines changed: 6 additions & 3 deletions
@@ -104,9 +104,12 @@ def input_spec(self, index: int = 0) -> DTensorSpec:
         return self.input_specs[index]
 
     def __str__(self) -> str:
-        input_specs_str = _pretty_print_spec(self.input_specs)
+        if self.input_specs is not None:
+            input_specs_str = f"{_pretty_print_spec(self.input_specs)} -> "
+        else:
+            input_specs_str = ""
         output_spec_str = _pretty_print_spec(self.output_specs)
-        return f"{input_specs_str} -> {output_spec_str}"
+        return f"{input_specs_str}{output_spec_str}"
 
 
 class StrategyType:
@@ -130,7 +133,7 @@ def __init__(self, strategies: List[PlacementStrategy]) -> None:
     def __str__(self) -> str:
         strategy_list_str = ", ".join([str(strategy) for strategy in self.strategies])
         mesh_shape = self.output_mesh_shape
-        return f"OpStrategy:[{strategy_list_str}] @ mesh: {mesh_shape}"
+        return f"[{strategy_list_str}] @ mesh: {mesh_shape}"
 
     def max_num_shards(self) -> int:
         """

0 commit comments
