pytorch/pytorch · Commit c2ce004

Update on "[FSDP][dtensor] use _StridedShard to represent nested sharding for correct full_tensor() result"
Fixes issues #129229, #129206

**Summary**
1. Have `FSDP` choose the `_StridedShard` placement for FSDP+TP sharding.
2. Added a parity test to FSDP to ensure that FSDP+TP sharding (i.e. strided) and plain TP sharding (i.e. non-strided) produce the same `full_tensor()` result.
3. Re-enabled the tests that were disabled in #129519.

**Test**
`pytest test/distributed/_composable/fsdp/`
`pytest test/distributed/_composable/test_composability/test_2d_composability.py`
`pytest test/distributed/checkpoint/fsdp/test_fsdp_dsd.py`

cc H-Huang awgu kwen2501 wanchaol fegin fduwjj wz337 wconstab d4l3k c-p-i-o zhaojuanmao mrshenli rohan-varma chauhang LucasLLC MeetVadakkanchery mhorowitz

Differential Revision: [D60606114](https://our.internmc.facebook.com/intern/diff/D60606114)

[ghstack-poisoned]
2 parents 40bd6ef + 566eb66 commit c2ce004
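
The layout problem that `_StridedShard` addresses can be sketched in a single process, without a process group. The mesh sizes (dp=2, tp=2) and the size-8 tensor below are illustrative assumptions, not values from this commit: when TP shards dim 0 first and FSDP then shards each TP-local piece on dim 0 again, the row chunks are interleaved relative to what a plain [Shard(0), Shard(0)] spec describes, so a naive full_tensor()-style reconstruction comes back in the wrong order. This is a sketch of the mechanism, not the DTensor implementation.

# Single-process sketch (assumed dp_size=2, tp_size=2, dim-0 size 8).
import torch

dp_size, tp_size = 2, 2
full = torch.arange(8)

# TP shards dim 0 first; FSDP then shards each TP-local piece on dim 0 again.
tp_chunks = full.chunk(tp_size, dim=0)  # [0..3], [4..7]
local = {
    (dp, tp): tp_chunks[tp].chunk(dp_size, dim=0)[dp]
    for dp in range(dp_size)
    for tp in range(tp_size)
}

# Reconstruction implied by a plain [Shard(0) on dp, Shard(0) on tp] spec:
# dp is treated as the outer split and tp as the inner one.
naive = torch.cat(
    [torch.cat([local[dp, tp] for tp in range(tp_size)]) for dp in range(dp_size)]
)
# -> tensor([0, 1, 4, 5, 2, 3, 6, 7]): rows come back interleaved.

# Reconstruction that respects the actual nesting (tp outer, dp inner), which
# is what a strided placement on the dp mesh dim lets full_tensor() recover.
strided = torch.cat(
    [torch.cat([local[dp, tp] for dp in range(dp_size)]) for tp in range(tp_size)]
)
assert not torch.equal(naive, full)
assert torch.equal(strided, full)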

File tree: 5 files changed, +21 -29 lines
test/distributed/_composable/fsdp/test_fully_shard_init.py (2 additions, 4 deletions)

@@ -387,7 +387,7 @@ def test_shard_dtensor_parameters(self):
         )
         dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"]
         # Use odd dim sizes to test uneven shards
-        # TODO change "mlp_dim" back to 8 when uneven sharding
+        # TODO: change "mlp_dim" back to 9 when uneven sharding
         # is supported for FSDP+TP
         model = MLP(8, dim_multiplier=3)
         orig_params = [param.detach().clone() for param in model.parameters()]
@@ -583,9 +583,7 @@ def test_meta_device_2d_init(self):
         dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"]

         # Test both even sharding (8) and uneven sharding (3)
-        # TODO change "mlp_dim" back to (8, 3) when uneven sharding
-        # is supported for FSDP+TP
-        for mlp_dim in (8, 4):
+        for mlp_dim in (8, 3):
             with torch.device("meta"):
                 model = MLP(mlp_dim, with_buffer=True)
                 for param in model.parameters():
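
As a reminder of what the re-enabled mlp_dim=3 case exercises: "uneven sharding" means a dim size that does not divide evenly by the mesh size, so ranks end up with different local shard lengths. A tiny single-process illustration with torch.chunk (the sizes here are made up; this only shows the unequal split, not DTensor's actual sharding code):

import torch

t = torch.arange(3)               # dim-0 size 3
shards = list(t.chunk(2, dim=0))  # split across an assumed 2-rank dp mesh
print([s.shape[0] for s in shards])  # [2, 1]: unequal local shard sizes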

test/distributed/_composable/fsdp/test_fully_shard_state_dict.py (4 additions, 13 deletions)

@@ -177,19 +177,10 @@ def _test_dp_tp_state_dict_save_load(self, global_mesh: DeviceMesh, mlp_dim: int
                 "2.out_proj": RowwiseParallel(),
             },
         )
-        # TODO: remove ``assertRaisesRegex`` once uneven sharding is supported
-        if mlp_dim % dp_mesh.size() != 0:
-            with self.assertRaisesRegex(
-                NotImplementedError, "does not support uneven sharding"
-            ):
-                for mlp in model:
-                    fully_shard(mlp, mesh=dp_mesh)
-                fully_shard(model, mesh=dp_mesh)
-        else:
-            for mlp in model:
-                fully_shard(mlp, mesh=dp_mesh)
-            fully_shard(model, mesh=dp_mesh)
-            self._test_state_dict_save_load(model)
+        for mlp in model:
+            fully_shard(mlp, mesh=dp_mesh)
+        fully_shard(model, mesh=dp_mesh)
+        self._test_state_dict_save_load(model)

     def _test_state_dict_save_load(self, model: nn.Module):
         for param_name, param in model.named_parameters():

test/distributed/_composable/fsdp/test_fully_shard_training.py (1 addition, 1 deletion)

@@ -993,7 +993,7 @@ def test_2d_mlp_with_nd_mesh(self):
             {
                 "reshard_after_forward": [False, True],
                 "use_activation_checkpointing": [False, True],
-                # TODO change "mlp_dim" back to [3, 16, 17] when uneven sharding
+                # TODO: change "mlp_dim" back to [3, 16, 17] when uneven sharding
                 # is supported for FSDP+TP
                 "mlp_dim": [4, 16, 20],
                 "foreach": [False],

test/distributed/_composable/test_composability/test_2d_composability.py (1 addition, 1 deletion)

@@ -60,7 +60,7 @@ def test_train_parity_2d_mlp(self):
             {
                 "reshard_after_forward": [False, True],
                 "use_activation_checkpointing": [False, True],
-                # TODO change "mlp_dim" back to [3, 16, 17] when uneven sharding
+                # TODO: change "mlp_dim" back to [3, 16, 17] when uneven sharding
                 # is supported for FSDP+TP
                 "mlp_dim": [4, 16, 20],
             },

torch/distributed/_composable/fsdp/_fsdp_param.py (13 additions, 10 deletions)

@@ -302,16 +302,19 @@ def _init_sharded_param(self, param: nn.Parameter, device: torch.device):
             )
             # NOTE: FSDP+TP does not support uneven sharding for now
             # TODO: enable uneven sharding for FSDP+TP
-            num_shards_map = self._sharding_spec.num_shards_map
-            tensor_shape = list(self._sharding_spec.shape)
-            assert len(num_shards_map) == len(tensor_shape)
-            for i, (size, num_shards) in enumerate(zip(tensor_shape, num_shards_map)):
-                if size % num_shards != 0:
-                    raise NotImplementedError(
-                        "FSDP+TP sharding does not support uneven sharding for now: "
-                        f"tensor dim {i} has size {size} which cannot be evenly sharded "
-                        f"into {num_shards} shards."
-                    )
+            if split_factor > 1:  # FSDP has strided sharding on tensor dim 0
+                num_shards_map = self._sharding_spec.num_shards_map
+                tensor_shape = list(self._sharding_spec.shape)
+                assert len(num_shards_map) == len(tensor_shape)
+                for i, (size, num_shards) in enumerate(
+                    zip(tensor_shape, num_shards_map)
+                ):
+                    if size % num_shards != 0:
+                        raise NotImplementedError(
+                            "FSDP+TP sharding does not support uneven sharding for now: "
+                            f"tensor dim {i} has size {size} which cannot be evenly "
+                            f"sharded into {num_shards} shards."
+                        )

             param_data = cast(DTensor, param)._local_tensor
         else:
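
To make the new gating explicit: the divisibility check now runs only when split_factor > 1, i.e. only when FSDP's dim-0 shard is strided on top of a TP shard of the same tensor dim, so plain (non-strided) FSDP sharding keeps supporting uneven shards. The standalone helper below restates that logic for illustration only; its name and the example shapes are assumptions, not part of the FSDP code.

# Hypothetical restatement of the check above (not FSDP API).
from typing import Sequence

def check_even_sharding_for_strided(
    tensor_shape: Sequence[int],
    num_shards_map: Sequence[int],
    split_factor: int,
) -> None:
    """Raise if strided (FSDP+TP) sharding would produce uneven shards."""
    if split_factor <= 1:  # no strided sharding on dim 0 -> no restriction
        return
    assert len(num_shards_map) == len(tensor_shape)
    for i, (size, num_shards) in enumerate(zip(tensor_shape, num_shards_map)):
        if size % num_shards != 0:
            raise NotImplementedError(
                "FSDP+TP sharding does not support uneven sharding for now: "
                f"tensor dim {i} has size {size} which cannot be evenly "
                f"sharded into {num_shards} shards."
            )

# Example (made-up shapes): a (9, 8) weight whose dim 0 is split into 4 shards
# passes when the sharding is plain, but is rejected when it is strided.
check_even_sharding_for_strided((9, 8), (4, 1), split_factor=1)   # ok
# check_even_sharding_for_strided((9, 8), (4, 1), split_factor=2)  # raises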
