try to fix failure · pytorch/pytorch@dd99ef2 · GitHub

Commit dd99ef2

try to fix failure
1 parent 580a675 commit dd99ef2

File tree

torch/csrc/cuda/nccl.h
torch/csrc/cuda/python_nccl.cpp
torch/csrc/distributed/c10d/NCCLUtils.hpp
torch/csrc/distributed/c10d/quantization/quantization_gpu.cu

4 files changed: +27 -23 lines changed

torch/csrc/cuda/nccl.h

+2
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <nccl.h>
+
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>

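Note (not part of the commit): torch/csrc/cuda/nccl.h declares PyTorch's NCCL wrappers under torch::cuda::nccl, and the new #include <nccl.h> presumably makes the raw NCCL declarations (ncclUniqueId, ncclComm_t, NCCL_UNIQUE_ID_BYTES, ...) visible to every file that includes this header. A minimal standalone sketch of what <nccl.h> itself provides, assuming NCCL is installed and linked (illustrative only, not PyTorch code):

// Illustrative only: a tiny translation unit using the declarations
// that <nccl.h> provides. Build roughly with: g++ demo.cpp -lnccl
#include <nccl.h>
#include <cstdio>

int main() {
  ncclUniqueId id;                             // type defined by nccl.h
  if (ncclGetUniqueId(&id) == ncclSuccess) {   // real NCCL entry point
    std::printf("got a %d-byte NCCL unique id\n", NCCL_UNIQUE_ID_BYTES);
  }
  return 0;
}
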
torch/csrc/cuda/python_nccl.cpp

+23 -22

@@ -15,43 +15,44 @@
 
 using namespace at;
 using namespace torch;
-using namespace torch::cuda::nccl;
-using namespace torch::cuda::nccl::detail;
+using namespace torch::cuda;
+
+namespace pynccl = torch::cuda::nccl;
 
 static const char* COMM_CAPSULE_NAME = "torch.cuda.nccl.Communicator";
 
 PyObject* THCPModule_nccl_version(PyObject* self, PyObject* args) {
-  return PyLong_FromUnsignedLongLong(version());
+  return PyLong_FromUnsignedLongLong(pynccl::version());
 }
 
 PyObject* THCPModule_nccl_version_suffix(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  return PyBytes_FromString(version_suffix());
+  return PyBytes_FromString(pynccl::version_suffix());
   END_HANDLE_TH_ERRORS
 }
 
 PyObject* THCPModule_nccl_unique_id(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  ncclUniqueId id;
+  pynccl::ncclUniqueId id;
   get_unique_id(id);
   return PyBytes_FromStringAndSize((char*)&id, NCCL_UNIQUE_ID_BYTES);
   END_HANDLE_TH_ERRORS
 }
 
-static ncclComm_t unpack_nccl_comm(PyObject* capsule) {
-  ncclComm_t comm =
-      (ncclComm_t)PyCapsule_GetPointer(capsule, COMM_CAPSULE_NAME);
+static pynccl::ncclComm_t unpack_nccl_comm(PyObject* capsule) {
+  pynccl::ncclComm_t comm =
+      (pynccl::ncclComm_t)PyCapsule_GetPointer(capsule, COMM_CAPSULE_NAME);
   if (!comm)
     throw python_error();
   return comm;
 }
 
 static void destroy_nccl_comm(PyObject* capsule) {
   HANDLE_TH_ERRORS
-  ncclComm_t comm = unpack_nccl_comm(capsule);
+  pynccl::ncclComm_t comm = unpack_nccl_comm(capsule);
   {
     pybind11::gil_scoped_release no_gil;
-    comm_destroy(comm);
+    pynccl::comm_destroy(comm);
   }
   END_HANDLE_TH_ERRORS_RET()
 }

@@ -73,19 +74,19 @@ static std::vector<std::optional<at::cuda::CUDAStream>> unpack_streams(
 static at::Tensor extract_tensor(PyObject* obj);
 static std::vector<at::Tensor> extract_tensors(PyObject* obj);
 
-static std::vector<ncclComm_t> unpack_comms(PyObject* obj, size_t size) {
+static std::vector<pynccl::ncclComm_t> unpack_comms(PyObject* obj, size_t size) {
   if (obj == Py_None) {
-    return std::vector<ncclComm_t>();
+    return std::vector<pynccl::ncclComm_t>();
   }
-  std::vector<ncclComm_t> comms;
+  std::vector<pynccl::ncclComm_t> comms;
   if (PyCapsule_CheckExact(obj)) {
     comms = {unpack_nccl_comm(obj)};
   } else {
     auto seq = THPObjectPtr(PySequence_Fast(obj, "comm is not a sequence"));
     if (!seq)
       throw python_error();
     auto size = PySequence_Fast_GET_SIZE(seq.get());
-    comms = std::vector<ncclComm_t>(size);
+    comms = std::vector<pynccl::ncclComm_t>(size);
     for (const auto i : c10::irange(size)) {
       comms[i] = unpack_nccl_comm(PySequence_Fast_GET_ITEM(seq.get(), i));
     }

@@ -116,12 +117,12 @@ PyObject* THCPModule_nccl_init_rank(PyObject* self, PyObject* args) {
       id_len,
       ")");
 
-  ncclUniqueId commId;
+  pynccl::ncclUniqueId commId;
   memcpy(&commId, id, NCCL_UNIQUE_ID_BYTES);
-  ncclComm_t comm = nullptr;
+  pynccl::ncclComm_t comm = nullptr;
   {
     pybind11::gil_scoped_release no_gil;
-    comm = comm_init_rank(nranks, commId, rank);
+    comm = pynccl::comm_init_rank(nranks, commId, rank);
   }
   return PyCapsule_New(comm, COMM_CAPSULE_NAME, &destroy_nccl_comm);
   END_HANDLE_TH_ERRORS

@@ -153,7 +154,7 @@ PyObject* THCPModule_nccl_reduce(PyObject* self, PyObject* args) {
 
   {
     pybind11::gil_scoped_release no_gil;
-    torch::cuda::nccl::reduce(inputs, output, root, op, streams, user_comms);
+    pynccl::reduce(inputs, output, root, op, streams, user_comms);
   }
 
   Py_RETURN_NONE;

@@ -186,7 +187,7 @@ PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) {
 
   {
     pybind11::gil_scoped_release no_gil;
-    all_reduce(inputs, outputs, op, streams, user_comms);
+    pynccl::all_reduce(inputs, outputs, op, streams, user_comms);
   }
 
   Py_RETURN_NONE;

@@ -217,7 +218,7 @@ PyObject* THCPModule_nccl_broadcast(PyObject* self, PyObject* args) {
 
   {
     pybind11::gil_scoped_release no_gil;
-    torch::cuda::nccl::broadcast(inputs, streams, user_comms);
+    pynccl::broadcast(inputs, streams, user_comms);
   }
 
   Py_RETURN_NONE;

@@ -249,7 +250,7 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) {
 
   {
     pybind11::gil_scoped_release no_gil;
-    all_gather(inputs, outputs, streams, user_comms);
+    pynccl::all_gather(inputs, outputs, streams, user_comms);
   }
 
   Py_RETURN_NONE;

@@ -282,7 +283,7 @@ PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) {
 
   {
     pybind11::gil_scoped_release no_gil;
-    reduce_scatter(inputs, outputs, op, streams, user_comms);
+    pynccl::reduce_scatter(inputs, outputs, op, streams, user_comms);
  }
 
   Py_RETURN_NONE;

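Note (not part of the commit): the bulk of this file's change swaps the blanket using-directives for a namespace alias, namespace pynccl = torch::cuda::nccl;, plus explicit qualification. Presumably this avoids ambiguity now that <nccl.h> is pulled in through torch/csrc/cuda/nccl.h, since names such as ncclUniqueId and ncclComm_t then exist both in the global namespace and in torch::cuda::nccl. A self-contained sketch of the pattern, with hypothetical stand-in types and a hypothetical version() function (none of this is PyTorch code):

#include <cstdio>

// Stand-in for the global-namespace type that <nccl.h> would provide.
struct ncclUniqueId { char internal[128]; };

namespace torch { namespace cuda { namespace nccl {
// Stand-in for the wrapper's own type of the same name, plus a wrapper function.
struct ncclUniqueId { char internal[128]; };
inline unsigned long long version() { return 21800; }  // hypothetical value
}}} // namespace torch::cuda::nccl

// Instead of `using namespace torch::cuda::nccl;` (which would make an
// unqualified `ncclUniqueId` ambiguous with the global one), keep an alias
// and qualify each use.
namespace pynccl = torch::cuda::nccl;

int main() {
  pynccl::ncclUniqueId wrapped_id{};  // unambiguously the wrapper's type
  ::ncclUniqueId raw_id{};            // unambiguously the global type
  std::printf("wrapper version %llu, sizes %zu and %zu\n",
              pynccl::version(), sizeof(wrapped_id), sizeof(raw_id));
  return 0;
}
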
torch/csrc/distributed/c10d/NCCLUtils.hpp

+1 -1

@@ -12,7 +12,7 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAEvent.h>
 #include <c10/util/Exception.h>
-#include <nccl.h>
+#include <torch/csrc/cuda/nccl.h>
 #include <torch/csrc/distributed/c10d/TraceUtils.h>
 #include <optional>
 

torch/csrc/distributed/c10d/quantization/quantization_gpu.cu

+1
@@ -1,4 +1,5 @@
 #include <c10/cuda/CUDAGuard.h>
+#include <torch/csrc/cuda/nccl.h>
 #include <torch/csrc/distributed/c10d/Utils.hpp>
 #include <torch/csrc/distributed/c10d/quantization/quantization_gpu.h>
 #include <torch/csrc/distributed/c10d/quantization/quantization_utils.h>

0 commit comments
