[WIP][ptd][nccl] use current-stream as nccl-stream under async=False mode by cenzhaometa · Pull Request #147820 · pytorch/pytorch

[WIP][ptd][nccl] use current-stream as nccl-stream under async=False mode #147820


Closed
wants to merge 1 commit into from
4 changes: 3 additions & 1 deletion torch/_C/_distributed_c10d.pyi
@@ -2,7 +2,7 @@
# mypy: disable-error-code="type-arg"
from datetime import timedelta
from enum import Enum
from typing import Any, overload
from typing import Any, overload, Optional

import torch
from torch import Tensor
@@ -134,6 +134,8 @@ class BroadcastOptions:
class AllreduceOptions:
reduceOp: ReduceOp
timeout: timedelta
asyncOp: bool
sparseIndices: Optional[Tensor]
Contributor
Is this line added by mistake?


class AllreduceCoalescedOptions(AllreduceOptions): ...

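For context, a minimal Python-side sketch of the fields the updated stub declares on AllreduceOptions; it assumes only the asyncOp binding added in init.cpp further down this diff, nothing beyond the stub.

# Sketch only: exercises the AllreduceOptions fields declared in the stub.
from datetime import timedelta

from torch._C._distributed_c10d import AllreduceOptions, ReduceOp

opts = AllreduceOptions()
opts.reduceOp = ReduceOp.SUM
opts.timeout = timedelta(seconds=30)
opts.asyncOp = False  # new field: request the synchronous (current-stream) path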
11 changes: 8 additions & 3 deletions torch/csrc/distributed/c10d/Ops.cpp
@@ -19,9 +19,9 @@ TORCH_LIBRARY(c10d, m) {
m.def(
"broadcast_(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, int root_rank, int root_tensor, bool asyncOp, int timeout) -> (Tensor[], __torch__.torch.classes.c10d.Work)");
m.def(
"allreduce_(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, __torch__.torch.classes.c10d.ReduceOp reduce_op, Tensor? sparse_indices, int timeout) -> (Tensor[], __torch__.torch.classes.c10d.Work)");
"allreduce_(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, __torch__.torch.classes.c10d.ReduceOp reduce_op, Tensor? sparse_indices, bool asyncOp, int timeout) -> (Tensor[], __torch__.torch.classes.c10d.Work)");
m.def(
"allreduce_coalesced_(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, __torch__.torch.classes.c10d.ReduceOp reduce_op, int timeout) -> __torch__.torch.classes.c10d.Work");
"allreduce_coalesced_(Tensor[] tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, __torch__.torch.classes.c10d.ReduceOp reduce_op, bool asyncOp, int timeout) -> __torch__.torch.classes.c10d.Work");
m.def(
"allgather_(Tensor[][] output_tensors, Tensor[] input_tensors, __torch__.torch.classes.c10d.ProcessGroup process_group, int timeout) -> (Tensor[][], __torch__.torch.classes.c10d.Work)");
m.def(
@@ -169,12 +169,13 @@ IMPL_BROADCAST(PrivateUse1)
const c10::intrusive_ptr<ProcessGroup>& process_group, \
const c10::intrusive_ptr<ReduceOp>& reduce_op, \
const std::optional<at::Tensor>& sparse_indices, \
bool asyncOp, \
int64_t timeout) { \
auto tensor_vec = tensors.vec(); \
auto work = process_group->getBackend(c10::DeviceType::DEV) -> allreduce( \
tensor_vec, \
AllreduceOptions{ \
*reduce_op.get(), std::chrono::milliseconds(timeout)}); \
*reduce_op.get(), std::chrono::milliseconds(timeout), asyncOp}); \
return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>( \
std::move(tensor_vec), work); \
}
@@ -188,11 +189,13 @@ IMPL_ALLREDUCE(PrivateUse1)
at::TensorList tensors, \
const c10::intrusive_ptr<ProcessGroup>& process_group, \
const c10::intrusive_ptr<ReduceOp>& reduce_op, \
bool asyncOp, \
int64_t timeout) { \
auto tensor_vec = tensors.vec(); \
AllreduceCoalescedOptions opts = AllreduceCoalescedOptions{}; \
opts.reduceOp = *reduce_op.get(); \
opts.timeout = std::chrono::milliseconds(timeout); \
opts.asyncOp = asyncOp; \
return process_group->getBackend(c10::DeviceType::DEV) \
->allreduce_coalesced(tensor_vec, opts); \
}
@@ -464,6 +467,7 @@ allreduce_sparse_cuda_(
const c10::intrusive_ptr<ProcessGroup>& process_group,
const c10::intrusive_ptr<ReduceOp>& reduce_op,
const std::optional<at::Tensor>& sparse_indices,
bool asyncOp,
int64_t timeout) {
auto tensor_vec = tensors.vec();
auto work = process_group->getBackend(c10::DeviceType::CUDA)
@@ -472,6 +476,7 @@
AllreduceOptions{
*reduce_op,
std::chrono::milliseconds(timeout),
asyncOp,
sparse_indices});

return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
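A quick way to check the widened dispatcher schema from Python; a sketch that assumes a distributed-enabled build with this patch applied.

import torch

# The c10d ops are registered when torch is built with distributed support.
print(torch.ops.c10d.allreduce_.default._schema)
# expected to list: ..., Tensor? sparse_indices, bool asyncOp, int timeout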
4 changes: 4 additions & 0 deletions torch/csrc/distributed/c10d/ProcessGroup.hpp
@@ -224,13 +224,15 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
const c10::intrusive_ptr<::c10d::ProcessGroup>&,
const c10::intrusive_ptr<::c10d::ReduceOp>&,
const std::optional<at::Tensor>& sparse_indices,
bool,
int64_t)>();

auto work = std::get<1>(op.call(
tensors,
c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
c10::make_intrusive<ReduceOp>(opts.reduceOp),
opts.sparseIndices,
opts.asyncOp,
opts.timeout.count()));

if (c10d::allow_inflight_collective_as_graph_input()) {
@@ -250,12 +252,14 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
at::TensorList,
const c10::intrusive_ptr<::c10d::ProcessGroup>&,
const c10::intrusive_ptr<::c10d::ReduceOp>&,
bool,
int64_t)>();

auto work = op.call(
tensors,
c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
c10::make_intrusive<ReduceOp>(opts.reduceOp),
opts.asyncOp,
opts.timeout.count());

if (c10d::allow_inflight_collective_as_graph_input()) {
29 changes: 19 additions & 10 deletions torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -3238,7 +3238,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
OpType opType,
const char* profilingTitle,
bool avoidRecordStreams,
bool nanCheck) {
bool nanCheck,
bool asyncOp) {
// Environment setting by the user may add onto collective call's option
avoidRecordStreams |= avoidRecordStreams_;
nanCheck &= enableNanCheck_;
@@ -3283,11 +3284,15 @@
}
}

// Used many times below, so we stash the unordered_map lookup
auto ncclStream = ncclStreams_.at(key);

// First let NCCL streams wait for input tensors allocation streams
syncStream(device, ncclEvents_[key], ncclStream);
// In asyncOp=false [default] mode, we use the current stream as ncclStream;
// otherwise, we use a dedicated ncclStream and let it sync on the current stream.
auto ncclStream = at::cuda::getCurrentCUDAStream(device.index());
if (asyncOp) {
// Used many times below, so we stash the unordered_map lookup
ncclStream = ncclStreams_.at(key);
// First let NCCL streams wait for input tensors allocation streams
syncStream(device, ncclEvents_[key], ncclStream);
}

bool enqueue =
!coalescing_state_ && capture_status == c10::cuda::CaptureStatus::None;
@@ -3883,7 +3888,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
OpType opType,
const char* profilingTitle,
bool avoidRecordStreams,
bool nanCheck) {
bool nanCheck,
bool asyncOp) {
auto inputs = std::vector<at::Tensor>{input};
auto outputs = std::vector<at::Tensor>{output};
return collective(
@@ -3895,7 +3901,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
opType,
profilingTitle,
avoidRecordStreams,
nanCheck);
nanCheck,
asyncOp);
}

template <typename Fn>
@@ -3906,7 +3913,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
OpType opType,
const char* profilingTitle,
bool avoidRecordStreams,
bool nanCheck) {
bool nanCheck,
bool asyncOp) {
auto inputs = std::vector<at::Tensor>{input};
auto outputs = std::vector<at::Tensor>{output};
return collective(
@@ -3920,7 +3928,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
opType,
profilingTitle,
avoidRecordStreams,
nanCheck);
nanCheck,
asyncOp);
}

template <typename Fn>
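The hunk above is the core behavioral change: with asyncOp=false the collective is launched on the caller's current CUDA stream instead of the dedicated NCCL stream. A hedged sketch of what that means for stream ordering, assuming an initialized NCCL process group on a CUDA build:

import torch
import torch.distributed as dist

side_stream = torch.cuda.Stream()
x = torch.randn(1 << 20, device="cuda")

# Make the side stream wait for work already queued on the default stream.
side_stream.wait_stream(torch.cuda.current_stream())

with torch.cuda.stream(side_stream):
    y = x * 2                           # kernel enqueued on side_stream
    dist.all_reduce(y, async_op=False)  # expected to be enqueued on side_stream too
    z = y + 1                           # ordered after the all_reduce on the same stream

# Join back to the default stream before consuming z there.
torch.cuda.current_stream().wait_stream(side_stream)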
9 changes: 6 additions & 3 deletions torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -876,7 +876,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
OpType opType,
const char* profilingTitle = nullptr,
bool avoidRecordStreams = false,
bool nanCheck = true);
bool nanCheck = true,
bool asyncOp = false);

template <typename Fn, typename PreProcess, typename PostProcess>
c10::intrusive_ptr<Work> collective(
@@ -888,7 +889,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
OpType opType,
const char* profilingTitle = nullptr,
bool avoidRecordStreams = false,
bool nanCheck = true);
bool nanCheck = true,
bool asyncOp = false);

template <typename Fn, typename PreProcess, typename PostProcess>
c10::intrusive_ptr<Work> collective(
@@ -900,7 +902,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
OpType opType,
const char* profilingTitle = nullptr,
bool avoidRecordStreams = false,
bool nanCheck = true);
bool nanCheck = true,
bool asyncOp = false);

template <typename Fn>
c10::intrusive_ptr<Work> collectiveCoalesced(
1 change: 1 addition & 0 deletions torch/csrc/distributed/c10d/Types.hpp
@@ -122,6 +122,7 @@ struct BroadcastOptions {
struct AllreduceOptions {
ReduceOp reduceOp = ReduceOp::SUM;
std::chrono::milliseconds timeout = kUnsetTimeout;
bool asyncOp = false;
std::optional<at::Tensor> sparseIndices = std::nullopt;
};

6 changes: 4 additions & 2 deletions torch/csrc/distributed/c10d/init.cpp
@@ -999,13 +999,15 @@ This class does not support ``__members__`` property.)");
py::class_<::c10d::AllreduceOptions>(module, "AllreduceOptions")
.def(py::init<>())
.def_readwrite("reduceOp", &::c10d::AllreduceOptions::reduceOp)
.def_readwrite("timeout", &::c10d::AllreduceOptions::timeout);
.def_readwrite("timeout", &::c10d::AllreduceOptions::timeout)
.def_readwrite("asyncOp", &::c10d::AllreduceOptions::asyncOp);

py::class_<::c10d::AllreduceCoalescedOptions>(
module, "AllreduceCoalescedOptions")
.def(py::init<>())
.def_readwrite("reduceOp", &::c10d::AllreduceCoalescedOptions::reduceOp)
.def_readwrite("timeout", &::c10d::AllreduceCoalescedOptions::timeout);
.def_readwrite("timeout", &::c10d::AllreduceCoalescedOptions::timeout)
.def_readwrite("asyncOp", &::c10d::AllreduceCoalescedOptions::asyncOp);

py::class_<::c10d::ReduceOptions>(module, "ReduceOptions")
.def(py::init<>())
2 changes: 2 additions & 0 deletions torch/distributed/distributed_c10d.py
@@ -2806,6 +2806,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):

opts = AllreduceOptions()
opts.reduceOp = op
opts.asyncOp = async_op
if group is None:
group = _get_default_group()

@@ -2882,6 +2883,7 @@ def all_reduce_coalesced(tensors, op=ReduceOp.SUM, group=None, async_op=False):

opts = AllreduceCoalescedOptions()
opts.reduceOp = op
opts.asyncOp = async_op
group = group or _get_default_group()
work = group.allreduce_coalesced(tensors, opts)

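User-facing effect of the two wrapper changes, as a hedged sketch: async_op is now forwarded into opts.asyncOp for both all_reduce and all_reduce_coalesced (assumes an initialized process group).

import torch
import torch.distributed as dist

t = torch.ones(8, device="cuda")

# async_op=False (default): opts.asyncOp=False; the collective is expected to
# run on the current stream and no Work handle is returned.
dist.all_reduce(t, op=dist.ReduceOp.SUM, async_op=False)

# async_op=True: opts.asyncOp=True; a Work handle is returned and must be waited on.
work = dist.all_reduce_coalesced([t, t.clone()], op=dist.ReduceOp.SUM, async_op=True)
work.wait()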