+from functools import partial
 from typing import Optional, Tuple
 
 import torch
-from torch.distributed._tensor import DTensor as DistributedTensor
+from torch.distributed._tensor import DTensor
 from torch.distributed._tensor.placement_types import DTensorSpec
 
 
+def grad_layout_hook(param_placements, grad):
+    # A gradient hook to ensure the gradient layout is the same as the
+    # parameter layout. This is needed because our current FSDP has an
+    # implicit assumption that the param/grad sharding layout stays the
+    # same after backward. However, this is not always the case for
+    # DTensor: we might have a replicated param and a partial gradient,
+    # and DTensor relied on the optimizer that actually consumes the
+    # gradient to convert the layout.
+    if isinstance(grad, DTensor) and grad.placements != param_placements:
+        grad = grad.redistribute(grad.device_mesh, param_placements)
+    return grad
+
+
 def _flatten_tensor(
     tensor: torch.Tensor,
 ) -> Tuple[torch.Tensor, Optional[DTensorSpec]]:
-    if isinstance(tensor, DistributedTensor):
+    if isinstance(tensor, DTensor):
         tensor._local_tensor.requires_grad_()
         return tensor._local_tensor, tensor._spec
     return tensor, None
 
 
+@torch._dynamo.disable
 def _unflatten_tensor(tensor: torch.Tensor, spec: DTensorSpec) -> torch.Tensor:
-    result = DistributedTensor.from_local(
+    # Unflatten is mainly called every time FSDP all-gathers the parameters.
+    result = DTensor.from_local(
         tensor,
         spec.mesh,
         spec.placements,
         run_check=False,
     )
+    if result.requires_grad:
+        # Only register the hook if the rebuilt tensor requires grad.
+        result.register_hook(partial(grad_layout_hook, spec.placements))
     return result
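
For context, the sketch below shows how these helpers round-trip a DTensor and where the gradient-layout hook gets attached. It is only an illustration, not part of the change: it assumes a hypothetical single-rank "gloo" process group and a one-device CPU mesh, whereas in practice FSDP calls `_flatten_tensor` when it shards parameters and `_unflatten_tensor` after each all-gather.

```python
# Minimal round-trip sketch (illustrative only): a single-rank "gloo" group
# and a one-device CPU mesh stand in for a real FSDP setup.
import os

import torch
import torch.distributed as dist
from torch.distributed._tensor import DeviceMesh, DTensor
from torch.distributed._tensor.placement_types import Replicate

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

mesh = DeviceMesh("cpu", [0])
param = DTensor.from_local(
    torch.randn(4, 4, requires_grad=True), mesh, [Replicate()], run_check=False
)

# Flatten: keep the local shard plus the DTensorSpec needed to rebuild it.
local_tensor, spec = _flatten_tensor(param)
assert spec is not None and spec.placements == param.placements

# Unflatten: rebuild the DTensor (as after an all-gather); because the
# result requires grad, grad_layout_hook is registered on it here.
rebuilt = _unflatten_tensor(local_tensor, spec)
assert isinstance(rebuilt, DTensor)
assert rebuilt.placements == param.placements

dist.destroy_process_group()
```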