[shard] use gather_object in gather API (#71624) · pytorch/pytorch@d0f9556 · GitHub

Commit d0f9556

Wanchao Liang authored and pytorchmergebot committed
[shard] use gather_object in gather API (#71624)
Summary: Pull Request resolved: #71624

Now that we have gather available in the NCCL process group, we can switch `sharded_tensor.gather` to use gather_object instead of all_gather_object, which reduces the communication overhead.

TODO: To further reduce the communication overhead, we need to figure out a way to avoid `gather_object` altogether, as both `gather_object` and `all_gather_object` incur a pickling copy between devices.

ghstack-source-id: 151007578

Test Plan: wait for CI

Reviewed By: pritamdamania87

Differential Revision: D33688907

fbshipit-source-id: 2073c5a46c33a7a2640a9e3599dc795d9e4c0a1e

(cherry picked from commit dbc983a)
1 parent 5b805a6 commit d0f9556
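
To make the switch concrete, here is a minimal, hedged sketch of the collection pattern the commit adopts: `dist.gather_object` delivers each rank's picklable object only to the destination rank, whereas the previous `dist.all_gather_object` replicated every object to every rank. The helper name and arguments below are illustrative, not part of the PyTorch source.

```python
# Illustrative sketch only (not the PyTorch source): the gather_object pattern
# adopted by this commit, assuming torch.distributed is already initialized.
from typing import Any, List, Optional

import torch.distributed as dist


def gather_objects_to_dst(local_obj: Any, dst: int = 0, group=None) -> Optional[List[Any]]:
    """Collect one picklable object per rank onto the dst rank only."""
    rank = dist.get_rank(group)
    world_size = dist.get_world_size(group)

    # Only the destination rank needs a receive buffer; other ranks pass an
    # empty list, mirroring the updated gather() code in the diff below.
    gathered: List[Optional[Any]] = [None] * world_size if rank == dst else []

    dist.gather_object(
        obj=local_obj,                # any picklable payload (here: a list of shards)
        object_gather_list=gathered,  # populated only on `dst`
        dst=dst,
        group=group,
    )
    # dst receives one entry per rank; every other rank gets nothing back.
    return gathered if rank == dst else None
```

Compared with all_gather_object, only one rank materializes the full list of objects, which is where the communication (and memory) saving described in the summary comes from.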

File tree

  • torch/distributed/_shard/sharded_tensor

1 file changed: +9 −7 lines changed

torch/distributed/_shard/sharded_tensor/api.py

Lines changed: 9 additions & 7 deletions
@@ -280,17 +280,19 @@ def gather(

         world_size = dist.get_world_size(self._process_group)

-        gathered_shards = [None] * world_size
-        # will revise this part with CPU support and use dist.gather()
-        # once NCCL support for gather() is ready
-        # https://github.com/pytorch/pytorch/issues/66187
-        dist.all_gather_object(
+        gathered_shards: List[Optional[List[Shard]]] = [None] * world_size if rank == dst else []
+        # TODO: see how we could use dist.gather() instead of dist.gather_object
+        # as the latter one involves pickling on CPU, see more context
+        # https://github.com/pytorch/pytorch/issues/73935
+        dist.gather_object(
             obj=local_shards,
-            object_list=gathered_shards,
+            object_gather_list=gathered_shards,
+            dst=dst,
             group=self._process_group,
         )
-
         if rank == dst:
+            if out is None:
+                raise ValueError("`out` Tensor must be provided on dst rank!")
             dims = len(full_size)
             for shards in gathered_shards:
                 if shards is None:
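
For context, here is a hedged usage sketch of the gather() method this hunk modifies, assuming the `gather(dst=0, out=None)` signature implied by the diff: the pre-allocated `out` tensor is required only on the destination rank and may stay None elsewhere. The names `st`, `full_size`, and `gather_to_rank0` are illustrative, not from the commit.

```python
# Hedged usage sketch, not part of the commit: calling ShardedTensor.gather()
# after this change. Assumes the gather(dst=0, out=None) signature shown above.
import torch
import torch.distributed as dist


def gather_to_rank0(st, full_size, device="cuda"):
    rank = dist.get_rank()
    # Per the new check in the diff, `out` must be provided on the dst rank;
    # it can be None on every other rank. For an NCCL-backed process group the
    # buffer should live on the local GPU.
    out = torch.empty(full_size, device=device) if rank == 0 else None
    st.gather(dst=0, out=out)
    return out  # holds the re-assembled tensor on rank 0, None elsewhere
```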

0 commit comments