[FSDP2] Add set_reshard_after_forward (#149103) · pytorch/pytorch@5b8cc47 · GitHub
Commit 5b8cc47

mori360 authored and pytorchmergebot committed
[FSDP2] Add set_reshard_after_forward (#149103)
Fixes #149029. Adds `set_reshard_after_forward` to set `post_forward_mesh_info`, which in turn decides `_reshard_after_forward`. Also adds a unit test similar to `test_fully_shard_communication_count`: after calling `set_reshard_after_forward(True)`, the FSDPModule behaves as if it had `_reshard_after_forward=True`, and likewise when setting it to `False`.

Pull Request resolved: #149103
Approved by: https://github.com/awgu
1 parent a8df5e5 commit 5b8cc47
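
Below is a usage sketch of the new setter (illustrative only, not part of this commit). It assumes a multi-GPU launch via torchrun with NCCL; the toy `nn.Sequential` model, tensor shapes, and process-group setup are placeholder choices:

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.fsdp import fully_shard

dist.init_process_group("nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

# Toy stand-in for a real model: shard each layer, then the root.
model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 16)).cuda()
for layer in model:
    fully_shard(layer)  # reshard_after_forward defaults to True
fully_shard(model)

inp = torch.randn(8, 16, device="cuda")

# Eval: keep parameters unsharded after forward so repeated forwards
# skip the re-all-gather (recurse=True applies the setting to every
# FSDP submodule).
model.set_reshard_after_forward(False)
with torch.no_grad():
    model(inp)

# Back to training: reshard after forward again to lower peak memory.
model.set_reshard_after_forward(True)
model(inp).sum().backward()

dist.destroy_process_group()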

File tree

2 files changed: +87 −0 lines changed

test/distributed/_composable/fsdp/test_fully_shard_comm.py

Lines changed: 58 additions & 0 deletions
@@ -422,6 +422,64 @@ def _test_set_reduce_scatter_divide_factor(self, divide_factor: float):
         self.assertEqual(ref_loss, loss)
         check_sharded_parity(self, ref_model, model)
 
+    @skip_if_lt_x_gpu(2)
+    def test_set_reshard_after_forward(self):
+        """
+        Tests that FSDP issues the expected number of all-gathers and
+        reduce-scatters during a train step when setting reshard_after_forward.
+        The communication counts should match test_fully_shard_communication_count.
+        """
+        self.run_subtests(
+            {"set_reshard_after_forward": [True, False], "recurse": [True, False]},
+            self._test_set_reshard_after_forward_by_communication_count,
+        )
+
+    def _test_set_reshard_after_forward_by_communication_count(
+        self,
+        set_reshard_after_forward: bool,
+        recurse: bool,
+    ):
+        torch.manual_seed(42)
+        model_args = ModelArgs()
+        model = Transformer(model_args)
+        fully_shard_fn = functools.partial(
+            fully_shard, reshard_after_forward=not set_reshard_after_forward
+        )
+        num_blocks = 0
+        for module in model.modules():
+            if isinstance(module, TransformerBlock):
+                fully_shard_fn(module)
+                num_blocks += 1
+        fully_shard_fn(model)
+        num_fsdp_modules = sum(
+            isinstance(module, FSDPModule) for module in model.modules()
+        )
+        model.set_reshard_after_forward(
+            reshard_after_forward=set_reshard_after_forward, recurse=recurse
+        )
+
+        torch.manual_seed(42 + self.rank)
+        inp = torch.randint(0, model_args.vocab_size, (2, 16), device="cuda")
+        with CommDebugMode() as fwd_comm_mode:
+            loss = model(inp)
+        fwd_comm_counts = fwd_comm_mode.get_comm_counts()
+        self.assertEqual(len(fwd_comm_counts), 1)
+        self.assertEqual(fwd_comm_counts[c10d_ops._allgather_base_], num_fsdp_modules)
+
+        with CommDebugMode() as bwd_comm_mode:
+            loss.sum().backward()
+        bwd_comm_counts = bwd_comm_mode.get_comm_counts()
+        # If recurse is False, set_reshard_after_forward only affects the root module,
+        # resulting in comm_counts identical to those without set_reshard_after_forward.
+        if recurse == set_reshard_after_forward:
+            self.assertEqual(len(bwd_comm_counts), 2)
+            self.assertEqual(bwd_comm_counts[c10d_ops._allgather_base_], num_blocks)
+        else:
+            self.assertEqual(len(bwd_comm_counts), 1)
+        self.assertEqual(
+            bwd_comm_counts[c10d_ops._reduce_scatter_base_], num_blocks + 1
+        )
+
 
 class TestFullyShardPrefetch(FSDPTest):
     @property
torch/distributed/fsdp/_fully_shard/_fully_shard.py

Lines changed: 29 additions & 0 deletions
@@ -351,6 +351,35 @@ def set_requires_all_reduce(
                 if fsdp_param_group := state._fsdp_param_group:
                     fsdp_param_group.all_reduce_grads = requires_all_reduce
 
+    def set_reshard_after_forward(
+        self, reshard_after_forward: bool, recurse: bool = True
+    ) -> None:
+        """
+        Sets if the module should reshard parameters after forward. This can be
+        used to change the ``reshard_after_forward`` FSDP arg at runtime. For
+        example, this can be used to set the FSDP root module's value to
+        ``True`` (since it is otherwise specially set to ``False``), or it can
+        set an FSDP module's value to ``False`` for running evals and set back
+        to ``True`` for training.
+
+        Args:
+            reshard_after_forward (bool): Whether to reshard parameters after
+                forward.
+            recurse (bool): Whether to set for all FSDP submodules or just the
+                passed-in module.
+        """
+        self_module = cast(nn.Module, self)
+        modules = list(self_module.modules()) if recurse else [self_module]
+        for module in modules:
+            if isinstance(module, FSDPModule):
+                state = module._get_fsdp_state()
+                if fsdp_param_group := state._fsdp_param_group:
+                    fsdp_param_group.post_forward_mesh_info = (
+                        _get_post_forward_mesh_info(
+                            reshard_after_forward, fsdp_param_group.mesh_info
+                        )
+                    )
+
     def set_reshard_after_backward(
         self, reshard_after_backward: bool, *, recurse: bool = True
     ) -> None: