more dist ops in non strict (#147417) · pytorch/pytorch@2473876 · GitHub

Commit 2473876

avikchaudhuri authored and pytorchmergebot committed
more dist ops in non strict (#147417)

Summary: Previously we added support for `all_reduce` in non-strict export. This PR extends that support to the other non-functional collectives that are remapped in Dynamo: `all_gather`, `all_gather_into_tensor`, `all_to_all_single`, and `reduce_scatter_tensor`.

Test Plan: added unit tests

Differential Revision: D69813991

Pull Request resolved: #147417
Approved by: https://github.com/angelayi
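Concretely, a module that calls one of these legacy collectives can now be exported non-strictly. Below is a minimal sketch of the pattern the added unit tests exercise, assuming a build with torch.distributed and using the test-only fake backend (`FakeStore` from torch.testing._internal); the `Gather` module name is illustrative and not part of the PR.

import torch
from torch.export import export
from torch.testing._internal.distributed.fake_pg import FakeStore


class Gather(torch.nn.Module):
    def forward(self, x):
        ys = [torch.empty_like(x) for _ in range(2)]
        torch.distributed.all_gather(ys, x)  # legacy, in-place collective
        return ys


# The fake backend lets a single process pretend to be rank 0 of a 2-rank group.
torch.distributed.init_process_group(
    backend="fake", world_size=2, rank=0, store=FakeStore()
)
try:
    ep = export(Gather(), (torch.randn(2),), strict=False)  # non-strict export
    print(ep.graph)  # the legacy call should appear as a functional collective op
finally:
    torch.distributed.destroy_process_group()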
1 parent 3946767 commit 2473876

File tree: 4 files changed, +156 −22 lines

- test/export/test_export.py
- torch/_export/non_strict_utils.py
- torch/distributed/_functional_collectives.py
- torch/distributed/distributed_c10d.py

test/export/test_export.py

Lines changed: 78 additions & 10 deletions

@@ -11953,6 +11953,20 @@ def forward(self, x):
         ]
         self.assertEqual(len(shift_op), 1)
 
+    @contextmanager
+    def distributed_env(self, world_size):
+        try:
+            torch.distributed.init_process_group(
+                backend="fake",
+                world_size=world_size,
+                rank=0,
+                store=FakeStore(),
+            )
+            yield
+
+        finally:
+            torch.distributed.destroy_process_group()
+
     @unittest.skipIf(IS_MACOS, "Distributed not packaged in macos")
     def test_distributed_all_reduce(self):
         class Foo(torch.nn.Module):
@@ -11965,21 +11979,75 @@ def forward(self, x):
                 torch.distributed.all_reduce(y)
                 return y
 
-        try:
-            torch.distributed.init_process_group(
-                backend="fake",
-                world_size=2,
-                rank=0,
-                store=FakeStore(),
-            )
-
+        with self.distributed_env(world_size=2):
             m = Foo()
             ep = export(m, (torch.randn(4, 4),))
             inp = (torch.randn(4, 4),)
             self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
 
-        finally:
-            torch.distributed.destroy_process_group()
+    @unittest.skipIf(IS_MACOS, "Distributed not packaged in macos")
+    def test_distributed_all_gather(self):
+        class Foo(torch.nn.Module):
+            def forward(self, x):
+                ys = [torch.empty_like(x) for _ in range(2)]
+                torch.distributed.all_gather(ys, x)
+                return ys
+
+        with self.distributed_env(world_size=2):
+            m = Foo()
+            ep = export(m, (torch.randn(2),))
+            inp = (torch.randn(2),)
+            self.assertTrue(
+                torch.allclose(a, b) for a, b in zip(ep.module()(*inp), m(*inp))
+            )
+
+    @unittest.skipIf(IS_MACOS, "Distributed not packaged in macos")
+    def test_distributed_all_gather_into_tensor(self):
+        class Foo(torch.nn.Module):
+            def forward(self, x):
+                y = torch.empty(2 * 2)
+                torch.distributed.all_gather_into_tensor(y, x)
+                return y
+
+        with self.distributed_env(world_size=2):
+            m = Foo()
+            ep = export(m, (torch.randn(2),))
+            inp = (torch.randn(2),)
+            self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
+
+    @unittest.skipIf(IS_MACOS, "Distributed not packaged in macos")
+    def test_distributed_all_to_all_single(self):
+        class Foo(torch.nn.Module):
+            def forward(self, x):
+                y = torch.empty(4)
+                torch.distributed.all_to_all_single(y, x)
+                return y
+
+        with self.distributed_env(world_size=4):
+            m = Foo()
+            ep = export(m, (torch.randn(4),))
+            nodes = ep.graph.find_nodes(
+                op="call_function",
+                target=torch.ops._c10d_functional.all_to_all_single.default,
+            )
+            self.assertEqual(len(nodes), 1)
+
+    @unittest.skipIf(IS_MACOS, "Distributed not packaged in macos")
+    def test_distributed_reduce_scatter_tensor(self):
+        class Foo(torch.nn.Module):
+            def forward(self, x):
+                y = torch.empty(2)
+                torch.distributed.reduce_scatter_tensor(y, x)
+                return y
+
+        with self.distributed_env(world_size=2):
+            m = Foo()
+            ep = export(m, (torch.randn(2 * 2),))
+            nodes = ep.graph.find_nodes(
+                op="call_function",
+                target=torch.ops._c10d_functional.reduce_scatter_tensor.default,
+            )
+            self.assertEqual(len(nodes), 1)
 
 
     @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't support")

torch/_export/non_strict_utils.py

Lines changed: 18 additions & 11 deletions

@@ -619,21 +619,28 @@ class _NonStrictTorchFunctionHandler(torch.overrides.TorchFunctionMode):
     """
 
     def _override(self, func, args, kwargs):
-        if torch.distributed.is_available() and func is torch.distributed.all_reduce:
-            # Redirect to a corresponding functional collective, following Dynamo.
-            # See torch/distributed/_functional_collectives.py for details.
+        if torch.distributed.is_available():
             from torch.distributed._functional_collectives import (
-                all_reduce_inplace,
                 REDUCE_OP_TO_STR,
+                traceable_collective_remaps,
             )
 
-            # see CollectiveFunctionRewriteVariable for remapping logic
-            signature = inspect.signature(func)
-            kwargs = dict(signature.bind(*args, **kwargs).arguments)
-            args = ()
-            if "op" in kwargs:
-                kwargs["op"] = REDUCE_OP_TO_STR[kwargs["op"]]
-            return all_reduce_inplace, args, kwargs
+            if func in traceable_collective_remaps:
+                # Redirect to a corresponding functional collective, following Dynamo.
+                # See torch/distributed/_functional_collectives.py for details.
+                # The following is an adaptation of CollectiveFunctionRewriteVariable.
+                mapped_func = traceable_collective_remaps[func]
+                signature = inspect.signature(func)
+                kwargs = dict(signature.bind(*args, **kwargs).arguments)
+                args = ()
+                if func in (
+                    torch.distributed.all_reduce,
+                    torch.distributed.reduce_scatter_tensor,
+                    torch.distributed._reduce_scatter_base,
+                ):
+                    if "op" in kwargs:
+                        kwargs["op"] = REDUCE_OP_TO_STR[kwargs["op"]]
+                return mapped_func, args, kwargs
         if func is torch.tensor:
             # Redirect to Python implementation of torch.tensor for data with symints.
             # NOTE(avik): We don't unconditionally redirect to this implementation
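The remapping above mirrors Dynamo's CollectiveFunctionRewriteVariable: look up the functional replacement, rebind the legacy call's arguments, and stringify the reduce op where one exists. Below is a standalone sketch of that lookup-and-rebind step, assuming a build with torch.distributed available; the variable names are illustrative and this is not the handler itself.

import inspect

import torch
import torch.distributed as dist
from torch.distributed._functional_collectives import (
    REDUCE_OP_TO_STR,
    traceable_collective_remaps,
)

func = dist.all_reduce
mapped_func = traceable_collective_remaps[func]  # the functional replacement
# Bind the legacy call's positional args into kwargs so they can be forwarded as-is.
bound = inspect.signature(func).bind(torch.ones(2), op=dist.ReduceOp.SUM)
kwargs = dict(bound.arguments)
kwargs["op"] = REDUCE_OP_TO_STR[kwargs["op"]]  # ReduceOp.SUM -> "sum"
print(mapped_func.__name__, list(kwargs))  # all_reduce_inplace ['tensor', 'op']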

torch/distributed/_functional_collectives.py

Lines changed: 1 addition & 1 deletion

@@ -1052,7 +1052,7 @@ def _reduce_scatter_tensor_coalesced_native_meta(
 def all_gather_tensor_inplace(
     output_tensor: torch.Tensor,
     input_tensor: torch.Tensor,
-    group,  # TODO add a type,
+    group=None,  # TODO add a type,
     async_op: bool = False,
     tag: str = "",
     gather_dim: int = 0,
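The one-line change gives `all_gather_tensor_inplace` the same `group=None` default as the legacy `all_gather_into_tensor` it stands in for. That default matters because `signature.bind()` in the handler above forwards only the arguments that were actually passed; a small sketch of that behavior (the placeholder strings are illustrative stand-ins for real tensors).

import inspect

import torch.distributed as dist

# bind() records only the arguments actually passed, not defaults, so a legacy call
# like all_gather_into_tensor(out, inp) forwards no "group" at all and the functional
# remap target must supply its own group=None default.
sig = inspect.signature(dist.all_gather_into_tensor)
bound = sig.bind("out_placeholder", "in_placeholder")  # placeholders, not real tensors
print(dict(bound.arguments))
# {'output_tensor': 'out_placeholder', 'input_tensor': 'in_placeholder'}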

torch/distributed/distributed_c10d.py

Lines changed: 59 additions & 0 deletions

@@ -3703,6 +3703,20 @@ def all_gather(tensor_list, tensor, group=None, async_op=False):
         [tensor([1.+1.j, 2.+2.j], device='cuda:1'), tensor([3.+3.j, 4.+4.j], device='cuda:1')] # Rank 1
 
     """
+    # Dynamo has built-in logic to map legacy distributed ops to functional collectives.
+    # Let's redirect to a torch function mode that can mimic this logic outside Dynamo
+    # (e.g., non-strict export implements such a torch function mode).
+    relevant_args = (tensor,)
+    if has_torch_function(relevant_args):
+        return handle_torch_function(
+            all_gather,
+            relevant_args,
+            tensor_list,
+            tensor,
+            group=group,
+            async_op=async_op,
+        )
+
     _check_tensor_list(tensor_list, "tensor_list")
     _check_single_tensor(tensor, "tensor")
     _ensure_all_tensors_same_dtype(tensor_list, tensor)
@@ -3779,6 +3793,20 @@ def all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=False):
         The Gloo backend does not support this API.
 
     """
+    # Dynamo has built-in logic to map legacy distributed ops to functional collectives.
+    # Let's redirect to a torch function mode that can mimic this logic outside Dynamo
+    # (e.g., non-strict export implements such a torch function mode).
+    relevant_args = (input_tensor,)
+    if has_torch_function(relevant_args):
+        return handle_torch_function(
+            all_gather_into_tensor,
+            relevant_args,
+            output_tensor,
+            input_tensor,
+            group=group,
+            async_op=async_op,
+        )
+
     _check_single_tensor(input_tensor, "input_tensor")
     _check_single_tensor(output_tensor, "output_tensor")
     if _rank_not_in_group(group):
@@ -4224,6 +4252,21 @@ def reduce_scatter_tensor(output, input, op=ReduceOp.SUM, group=None, async_op=False):
         The Gloo backend does not support this API.
 
     """
+    # Dynamo has built-in logic to map legacy distributed ops to functional collectives.
+    # Let's redirect to a torch function mode that can mimic this logic outside Dynamo
+    # (e.g., non-strict export implements such a torch function mode).
+    relevant_args = (input,)
+    if has_torch_function(relevant_args):
+        return handle_torch_function(
+            reduce_scatter_tensor,
+            relevant_args,
+            output,
+            input,
+            op=op,
+            group=group,
+            async_op=async_op,
+        )
+
     _check_single_tensor(output, "output")
     _check_single_tensor(input, "input")
 
@@ -4382,6 +4425,22 @@ def all_to_all_single(
         tensor([3+3j, 7+7j, 11+11j, 15+15j]) # Rank 2
         tensor([4+4j, 8+8j, 12+12j, 16+16j]) # Rank 3
     """
+    # Dynamo has built-in logic to map legacy distributed ops to functional collectives.
+    # Let's redirect to a torch function mode that can mimic this logic outside Dynamo
+    # (e.g., non-strict export implements such a torch function mode).
+    relevant_args = (input,)
+    if has_torch_function(relevant_args):
+        return handle_torch_function(
+            all_to_all_single,
+            relevant_args,
+            output,
+            input,
+            output_split_sizes=output_split_sizes,
+            input_split_sizes=input_split_sizes,
+            group=group,
+            async_op=async_op,
+        )
+
     if _rank_not_in_group(group):
         _warn_not_in_group("all_to_all_single")
         return
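These preambles are what make the legacy collectives visible to the torch-function protocol in the first place. The minimal sketch below, not PyTorch's own handler, shows that once `has_torch_function`/`handle_torch_function` run first, any active `TorchFunctionMode` (such as non-strict export's `_NonStrictTorchFunctionHandler`) can observe or rewrite the call; `SpyCollectives` is a made-up name, and the interception here simply short-circuits the op so no process group is needed.

import torch
import torch.distributed as dist
from torch.overrides import TorchFunctionMode


class SpyCollectives(TorchFunctionMode):
    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if dist.is_available() and func is dist.all_gather_into_tensor:
            # Non-strict export remaps to a functional collective here;
            # this sketch just reports the interception and skips the op.
            print("intercepted", func.__name__)
            return None  # short-circuit: no process group is needed for this demo
        return func(*args, **kwargs)


out, inp = torch.empty(4), torch.randn(2)
with SpyCollectives():
    dist.all_gather_into_tensor(out, inp)  # now dispatches to __torch_function__ first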

0 commit comments
