[pytree][1/N] change pytree usages to implementation agnostic: torch.distributed · pytorch/pytorch@1f8026c · GitHub

Commit 1f8026c

[pytree][1/N] change pytree usages to implementation agnostic: torch.distributed
ghstack-source-id: 34a09e0 Pull Request resolved: #144332
1 parent d774a47 commit 1f8026c

31 files changed: +71 -96 lines
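For orientation, a minimal sketch of the migration pattern this commit applies throughout torch.distributed, assuming the implementation-agnostic torch.utils.pytree namespace introduced by this PR stack (older builds expose equivalent helpers under torch.utils._pytree):

# Before: code imported a specific implementation
#     import torch.utils.pytree.python as pytree
#     flat = pytree.tree_leaves(obj)

# After: import only the helpers that are needed from the dispatching namespace
from torch.utils.pytree import tree_leaves, tree_map

nested = {"a": 1, "b": (2, 3)}
print(tree_leaves(nested))                 # [1, 2, 3]
print(tree_map(lambda x: x * 10, nested))  # {'a': 10, 'b': (20, 30)}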

benchmarks/dynamo/distributed.py

Lines changed: 2 additions & 2 deletions
@@ -5,10 +5,10 @@
 
 import torch
 import torch._dynamo as dynamo
-import torch.utils.pytree.python as pytree
 from torch._dynamo.testing import reduce_to_scalar_loss
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.profiler import profile, ProfilerActivity, record_function
+from torch.utils.pytree import tree_map
 
 
 try:
@@ -62,7 +62,7 @@ def move_tensor(maybe_tensor):
             return maybe_tensor.to(dev_rank)
         return maybe_tensor
 
-    inputs = pytree.tree_map(move_tensor, inputs)
+    inputs = tree_map(move_tensor, inputs)
 
     if args.fsdp:
         model = apply_fsdp(
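An illustrative sketch of the device-move idiom above with the new top-level import; the device argument here is a stand-in for the benchmark's dev_rank:

import torch
from torch.utils.pytree import tree_map

def move_tensor(maybe_tensor, device="cpu"):
    if isinstance(maybe_tensor, torch.Tensor):
        return maybe_tensor.to(device)
    return maybe_tensor

inputs = {"x": torch.randn(2, 2), "lengths": [3, 5], "pair": (torch.zeros(3), "tag")}
inputs = tree_map(move_tensor, inputs)  # tensors are moved; other leaves pass through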

test/distributed/_composable/fsdp/test_fully_shard_extensions.py

Lines changed: 4 additions & 8 deletions
@@ -11,7 +11,6 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-import torch.utils.pytree.python as pytree
 from torch.autograd.grad_mode import _unsafe_preserve_version_counter
 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
 from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy
@@ -25,6 +24,7 @@
 )
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.two_tensor import TwoTensor
+from torch.utils.pytree import tree_map_only
 
 
 def two_tensor_fsdp_pre_all_gather_v1(
@@ -144,13 +144,9 @@ def unwrap(x: cls):
             assert pad_in_pre_all_gather == x._pad_in_pre_all_gather
             return x._data
 
-        out = func(
-            *pytree.tree_map_only(cls, unwrap, args),
-            **pytree.tree_map_only(cls, unwrap, kwargs),
-        )
-        return pytree.tree_map_only(
-            torch.Tensor, lambda x: cls(x, pad_in_pre_all_gather), out
-        )
+        args, kwargs = tree_map_only(cls, unwrap, (args, kwargs))
+        out = func(*args, **kwargs)
+        return tree_map_only(torch.Tensor, lambda x: cls(x, pad_in_pre_all_gather), out)
 
     def __tensor_flatten__(self):
         return ["_data"], None

test/distributed/checkpoint/fsdp/test_fsdp_dsd.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
 from torch.testing._internal.common_fsdp import FSDPTest, MLP
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
-from torch.utils.pytree.python import tree_all_only
+from torch.utils.pytree import tree_all_only
 
 
 class TestFullyShardWithDistributedStateDict(FSDPTest):

test/distributed/checkpoint/test_state_dict.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
     with_comms,
 )
 from torch.testing._internal.distributed.common_state_dict import VerifyStateDictMixin
-from torch.utils.pytree.python import tree_all, tree_all_only
+from torch.utils.pytree import tree_all, tree_all_only
 
 
 if not dist.is_available():
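A hedged example of the predicate helpers the two checkpoint tests now import from the top-level namespace; the state dict below is made up for illustration:

import torch
from torch.utils.pytree import tree_all, tree_all_only

sd = {"weight": torch.randn(4, 4), "bias": torch.zeros(4), "step": 10}
on_cpu = tree_all_only(torch.Tensor, lambda t: t.device.type == "cpu", sd)  # True
non_null = tree_all(lambda leaf: leaf is not None, sd)                      # True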

test/distributed/pipelining/test_stage.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
     parametrize,
     skip_but_pass_in_sandcastle_if,
 )
-from torch.utils.pytree.python import tree_map_only
+from torch.utils.pytree import tree_map_only
 
 
 d_hid = 512

test/distributed/tensor/test_dtensor_ops.py

Lines changed: 4 additions & 5 deletions
@@ -7,7 +7,6 @@
 import torch
 import torch.distributed as dist
 import torch.testing._internal.common_methods_invocations as common_ops
-import torch.utils.pytree.python as pytree
 from torch.distributed._tensor import DeviceMesh, DTensor
 from torch.overrides import resolve_name
 from torch.testing._internal.common_device_type import (
@@ -20,7 +19,7 @@
     DTensorConverter,
     DTensorOpTestBase,
 )
-from torch.utils.pytree.python import tree_map
+from torch.utils.pytree import tree_leaves, tree_map
 
 
 # rewrite common size variables to sth can be sharded evenly
@@ -535,8 +534,8 @@ def test():
         self.check_dtensor_func(test, op)
 
     def assert_ref_dtensor_equal(self, dtensor_rs, rs):
-        flat_dtensor_rs = pytree.tree_leaves(dtensor_rs)
-        flat_rs = pytree.tree_leaves(rs)
+        flat_dtensor_rs = tree_leaves(dtensor_rs)
+        flat_rs = tree_leaves(rs)
         self.assertEqual(len(flat_dtensor_rs), len(flat_rs))
         for dtensor_r, r in zip(flat_dtensor_rs, flat_rs):
             if not isinstance(r, torch.Tensor):
@@ -600,7 +599,7 @@ def to_replicate(e: object) -> object:
         # we need to skip tests containing tensors of zero elements for now.
        # see issue: https://github.com/pytorch/tau/issues/470
         # TODO remove this once issue above fixed.
-        flat_args = pytree.tree_leaves(dtensor_rs)
+        flat_args = tree_leaves(dtensor_rs)
         if any(
             isinstance(e, torch.Tensor) and e.numel() == 0
             for e in flat_args
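A sketch of the flatten-and-compare pattern used by assert_ref_dtensor_equal above, with plain tensors standing in for DTensors:

import torch
from torch.utils.pytree import tree_leaves

reference = (torch.ones(2), {"aux": torch.zeros(3)})
result = (torch.ones(2), {"aux": torch.zeros(3)})
flat_ref, flat_res = tree_leaves(reference), tree_leaves(result)
assert len(flat_ref) == len(flat_res)
for r, o in zip(flat_ref, flat_res):
    torch.testing.assert_close(o, r)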

test/distributed/tensor/test_pointwise_ops.py

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,6 @@
 from unittest import skip
 
 import torch
-import torch.utils.pytree.python as pytree
 from torch import Tensor
 from torch.distributed._tensor import DeviceMesh, distribute_tensor, DTensor
 from torch.distributed._tensor.placement_types import (
@@ -20,6 +19,7 @@
     DTensorOpTestBase,
     skip_unless_torch_gpu,
 )
+from torch.utils.pytree import tree_map
 
 
 def no_op():
@@ -48,7 +48,7 @@ def f(x):
         )
         return x
 
-    return pytree.tree_map(f, [val])[0]
+    return tree_map(f, val)
 
 
 def deepcopy_convert_from_dtensor(val: Any) -> Any:
@@ -64,7 +64,7 @@ def f(x):
             return x.full_tensor()
         return x
 
-    return pytree.tree_map(f, [val])[0]
+    return tree_map(f, val)
 
 
 class DistElementwiseOpsTest(DTensorOpTestBase):
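Why dropping the list wrapper is safe: tree_map already recurses into an arbitrarily nested val, so wrapping it in a one-element list and indexing the result back out was redundant. A small sketch with made-up values:

import torch
from torch.utils.pytree import tree_map

def f(x):
    return x + 1 if isinstance(x, torch.Tensor) else x

val = {"t": torch.zeros(2), "name": "relu"}
old_style = tree_map(f, [val])[0]  # previous pattern: wrap, map, unwrap
new_style = tree_map(f, val)       # equivalent direct call
torch.testing.assert_close(old_style["t"], new_style["t"])
assert old_style["name"] == new_style["name"]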

test/distributed/tensor/test_view_ops.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,6 @@
 
 import torch
 import torch.distributed as dist
-import torch.utils.pytree.python as pytree
 from torch import rand, randn, Tensor
 from torch.distributed._tensor import (
     DeviceMesh,
@@ -32,6 +31,7 @@
     DTensorTestBase,
     with_comms,
 )
+from torch.utils.pytree import tree_leaves
 
 
 class TestViewOps(DTensorTestBase):
@@ -139,7 +139,7 @@ def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh):
         dim_map = dim_maps[op]
         rules = dim_map(*args, **kwargs)
         outputs = op(*args, **kwargs)
-        flat_args = pytree.arg_tree_leaves(*args)
+        flat_args = tree_leaves(args)
         in_shape = flat_args[0].shape
 
         no_shard_dims = set()
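Why the swap from arg_tree_leaves(*args) to tree_leaves(args) is equivalent at this call site: the positional-args tuple is itself a pytree and no kwargs are involved, so flattening the tuple yields the same leaves. Example values below are illustrative:

import torch
from torch.utils.pytree import tree_leaves

args = (torch.randn(4, 8), (0, 1), -1)
flat_args = tree_leaves(args)
in_shape = flat_args[0].shape  # the first leaf is the input tensor, as in the test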

torch/distributed/_functional_collectives.py

Lines changed: 1 addition & 6 deletions
@@ -9,16 +9,11 @@
 import torch.distributed.distributed_c10d as c10d
 from torch.distributed.device_mesh import DeviceMesh
 from torch.fx.experimental.proxy_tensor import get_proxy_mode
+from torch.utils.pytree import tree_map_only
 
 from . import _functional_collectives_impl as fun_col_impl
 
 
-try:
-    from torch.utils._cxx_pytree import tree_map_only
-except ImportError:
-    from torch.utils.pytree.python import tree_map_only  # type: ignore[no-redef]
-
-
 if torch._running_with_deploy():
 
     def is_torchdynamo_compiling():
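A short sketch of what the deleted block did and what replaces it: the try/except previously preferred the C++ pytree and fell back to the Python one, while the torch.utils.pytree namespace is assumed to make that implementation choice internally, so callers import the helper once:

# Removed fallback:
#     try:
#         from torch.utils._cxx_pytree import tree_map_only
#     except ImportError:
#         from torch.utils.pytree.python import tree_map_only  # type: ignore[no-redef]

# Replacement, a single import of the dispatching helper:
from torch.utils.pytree import tree_map_only

converted = tree_map_only(int, float, {"a": 1, "b": "x"})  # {'a': 1.0, 'b': 'x'}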

torch/distributed/_shard/common_op_utils.py

Lines changed: 5 additions & 5 deletions
@@ -2,7 +2,7 @@
 from typing import Optional
 
 import torch
-import torch.utils.pytree.python as pytree
+from torch.utils.pytree import tree_map_
 
 
 def _basic_validation(op, args=(), kwargs=None):
@@ -22,8 +22,8 @@ def is_distributed_tensor(e):
         if isinstance(e, ShardedTensor):
             has_distributed_tensor = True
 
-    pytree.tree_map_(is_distributed_tensor, args)
-    pytree.tree_map_(is_distributed_tensor, kwargs)
+    tree_map_(is_distributed_tensor, args)
+    tree_map_(is_distributed_tensor, kwargs)
 
     if not has_distributed_tensor:
         raise TypeError(
@@ -44,8 +44,8 @@ def validate_pg(e):
             )
         cur_pg = e._process_group
 
-    pytree.tree_map_(validate_pg, args)
-    pytree.tree_map_(validate_pg, kwargs)
+    tree_map_(validate_pg, args)
+    tree_map_(validate_pg, kwargs)
 
 
 def _register_default_op(op, decorator):
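A sketch of the side-effect traversal idiom used above: tree_map_ visits every leaf purely for its side effects and returns the input tree unchanged. The helper below loosely mirrors the flag-setting pattern and is not the module's real code:

import torch
from torch.utils.pytree import tree_map_

def has_any_tensor(args, kwargs) -> bool:
    found = False

    def visit(leaf):
        nonlocal found
        if isinstance(leaf, torch.Tensor):
            found = True

    tree_map_(visit, args)    # traversal only; no new tree is built
    tree_map_(visit, kwargs)
    return found

assert has_any_tensor((torch.ones(1), {"k": 3}), {"flag": True})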

0 commit comments
