more update · pytorch/pytorch@2df7332 · GitHub

Commit 2df7332

more update
1 parent cf174bf commit 2df7332

File tree

5 files changed: +17 -49 lines changed

torch/csrc/cuda/nccl.cpp

+1 -1

@@ -118,7 +118,7 @@ ncclDataType_t to_nccl_data_type(c10::ScalarType type) {
       return ncclDataType_t::ncclUint8;
 #endif

-#if HAS_NCCL_BF16_DATATYPE
+#ifdef HAS_NCCL_BF16_DATATYPE
     case at::kBFloat16:
       return ncclDataType_t::ncclBfloat16;
 #endif
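The switch from #if to #ifdef here tracks the header change below: HAS_NCCL_BF16_DATATYPE stops being defined to 0 or 1 and becomes a presence-only macro. A minimal standalone sketch of why the test form has to change (the macro names here are illustrative, not from the commit):

// Illustrative only (these macro names are not from the commit):
// a value-style macro is always defined, so it must be tested with #if ...
#define HAS_FEATURE_VALUE 0
#if HAS_FEATURE_VALUE
// compiled only when the value is non-zero
#endif
// ... while a presence-style macro, the form adopted here, carries no value.
// Testing it with #if would be a preprocessor error once it expands to
// nothing, so every check becomes #ifdef.
#define HAS_FEATURE_PRESENT
#ifdef HAS_FEATURE_PRESENT
// compiled whenever the macro is defined at all
#endif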

torch/csrc/cuda/nccl.h

+12 -25

@@ -9,52 +9,39 @@

 // NCCL BFloat16 is enabled only for CUDA 11+ and NCCL versions 2.10+, or for
 // HIP 3.1+
+#if defined(NCCL_MAJOR) && \
+    ((NCCL_MAJOR > 2) || (NCCL_MAJOR == 2) && (NCCL_MINOR >= 10))
 #if defined(__CUDA_BF16_TYPES_EXIST__)
-#define HAS_NCCL_BF16_DATATYPE \
-  ((NCCL_MAJOR > 2) || (NCCL_MAJOR == 2) && (NCCL_MINOR >= 10))
+#define HAS_NCCL_BF16_DATATYPE
+#endif // defined(__CUDA_BF16_TYPES_EXIST__)
+#define NCCL_HAS_AVG
 #elif defined(USE_ROCM) && (TORCH_HIP_VERSION >= 301)
-#define HAS_NCCL_BF16_DATATYPE 1
-#else
-#define HAS_NCCL_BF16_DATATYPE 0
-#endif
-
-// Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort()
-// and ncclCommGetAsyncError() are not supported in earlier versions.
-#if defined(NCCL_MAJOR) && defined(NCCL_MINOR) && \
-    (NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 4))
-#define ENABLE_NCCL_ERROR_CHECKING
-#endif
-
-// P2P is enabled only for NCCL versions 2.7+ since ncclSend()
-// and ncclRecv() are not supported in earlier versions.
-#if defined(NCCL_MAJOR) && defined(NCCL_MINOR) && \
-    (NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 7))
-#define ENABLE_NCCL_P2P_SUPPORT
-#endif
+#define HAS_NCCL_BF16_DATATYPE
+#endif // NCCL >= 2.10

-#if defined(NCCL_MAJOR) && defined(NCCL_MINOR) && \
+#if defined(NCCL_MAJOR) && \
     (NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 11))
 #define ENABLE_NCCL_PREMUL_SUM_SUPPORT
 #endif

-#if defined(NCCL_MAJOR) && defined(NCCL_MINOR) && \
+#if defined(NCCL_MAJOR) && \
     (NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 13))
 #define NCCL_HAS_REMOTE_ERROR
 #define ENABLE_NCCL_GET_LAST_ERROR
 #endif

-#if defined(NCCL_MAJOR) && defined(NCCL_MINOR) && \
+#if defined(NCCL_MAJOR) && \
     (NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 14))
 #define NCCL_HAS_COMM_NONBLOCKING
 #endif

-#if defined(NCCL_MAJOR) && defined(NCCL_MINOR) && \
+#if defined(NCCL_MAJOR) && \
     (NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 17))
 #define NCCL_HAS_COMM_CTA_CGA
 #define NCCL_HAS_COMM_SPLIT
 #endif

-#if defined(NCCL_MAJOR) && defined(NCCL_MINOR) && \
+#if defined(NCCL_MAJOR) && \
     (NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 19))
 #define NCCL_HAS_COMM_REGISTER
 #endif
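Net effect of this header change, as far as the diff shows: HAS_NCCL_BF16_DATATYPE and NCCL_HAS_AVG become presence-only macros gated on NCCL 2.10+, while the ENABLE_NCCL_ERROR_CHECKING and ENABLE_NCCL_P2P_SUPPORT gates are dropped entirely, presumably because every NCCL version PyTorch still builds against already satisfies the 2.4/2.7 minimums they guarded. A minimal sketch of how a consumer of this header would test the new presence-only macro (the helper function is illustrative, not from the commit):

#include <torch/csrc/cuda/nccl.h>

// Hypothetical helper: presence check only, no value comparison.
bool bf16NcclReductionsSupported() {
#ifdef HAS_NCCL_BF16_DATATYPE
  return true;
#else
  return false;
#endif
}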

torch/csrc/distributed/c10d/NCCLUtils.cpp

-10

@@ -245,7 +245,6 @@ void NCCLComm::destroy() {
 void NCCLComm::abort(std::optional<std::string> commFailureReason) {
   LockType lock(mutex_);
   at::cuda::OptionalCUDAGuard gpuGuard(deviceIndex_);
-#ifdef ENABLE_NCCL_ERROR_CHECKING
   if (aborted_ && !initialized_) {
     // Should not abort twice.
     return;
@@ -285,10 +284,6 @@ void NCCLComm::abort(std::optional<std::string> commFailureReason) {
   if (ncclAsyncErr_ == ncclSuccess) {
     ncclAsyncErr_ = ncclSystemError;
   }
-#else
-  // This is a NOOP, if error checks are disabled.
-  return;
-#endif
 }

 bool NCCLComm::isInitialized() const {
@@ -307,17 +302,12 @@ uint64_t NCCLComm::getCommSplitCounter() const {

 ncclResult_t NCCLComm::checkForNcclError() {
   LockType lock(mutex_);
-#ifdef ENABLE_NCCL_ERROR_CHECKING
   if (ncclAsyncErr_ != ncclSuccess) {
     return ncclAsyncErr_;
   }
   C10D_NCCL_CHECK(
       ncclCommGetAsyncError(ncclComm_, &ncclAsyncErr_), commFailureReason_);
   return ncclAsyncErr_;
-#else
-  // Always return success, if error checks are disabled.
-  return ncclSuccess;
-#endif
 }

 ncclResult_t NCCLComm::registerSegment(void* ptr, size_t size) {
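With the ENABLE_NCCL_ERROR_CHECKING guard gone, abort() and checkForNcclError() no longer have compiled-out fallbacks; the async-error path is always built. A minimal sketch of the underlying NCCL call the retained code relies on (the helper name and comm argument are illustrative, not from this file):

#include <nccl.h>

// Poll a communicator for an asynchronous error, as checkForNcclError() does.
// ncclCommGetAsyncError() has existed since NCCL 2.4, which is below the
// versions the updated header assumes, so no compile-time guard is needed.
ncclResult_t pollAsyncError(ncclComm_t comm) {
  ncclResult_t asyncErr = ncclSuccess;
  ncclResult_t rc = ncclCommGetAsyncError(comm, &asyncErr);
  return (rc != ncclSuccess) ? rc : asyncErr;
}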

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

+1 -10

@@ -38,11 +38,6 @@ constexpr const char* const kNCCLAbortedCommStoreKey = "NCCLABORTEDCOMM";

 namespace {

-#if defined(NCCL_MAJOR) && \
-    ((NCCL_MAJOR > 2) || (NCCL_MAJOR == 2) && (NCCL_MINOR >= 10))
-#define NCCL_HAS_AVG 1
-#endif // NCCL version >= 2.10
-
 // NCCL op mapping
 const std::map<ReduceOp::RedOpType, ncclRedOp_t> ncclOp = {
     {ReduceOp::MIN, ncclMin},
@@ -68,7 +63,7 @@ std::map<at::ScalarType, ncclDataType_t> ncclDataType = {
     {at::kFloat8_e4m3fn, ncclUint8},
     {at::kFloat8_e4m3fnuz, ncclUint8},
     {at::kFloat8_e5m2fnuz, ncclUint8},
-#if HAS_NCCL_BF16_DATATYPE
+#ifdef HAS_NCCL_BF16_DATATYPE
     {at::kBFloat16, ncclBfloat16},
 #endif // HAS_NCCL_BF16_DATATYPE
 };
@@ -928,10 +923,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   PrefixStore* prefixStore = dynamic_cast<PrefixStore*>(store_.get());
   globalStore_ =
       prefixStore ? prefixStore->getUnderlyingNonPrefixStore() : store_;
-#ifdef ENABLE_NCCL_ERROR_CHECKING
   enableTiming_.store(
       getCvarBool(TORCH_NCCL_ENABLE_TIMING, false) || desyncDebug_);
-#endif // ENABLE_NCCL_ERROR_CHECKING
   avoidRecordStreams_ = getCvarBool(TORCH_NCCL_AVOID_RECORD_STREAMS, false);
 #ifdef NCCL_HAS_COMM_REGISTER
   useTensorRegisterAllocatorHook_ =
@@ -960,15 +953,13 @@ ProcessGroupNCCL::ProcessGroupNCCL(
     }
   }

-#ifdef ENABLE_NCCL_ERROR_CHECKING
   // in blockingWait mode, we don't need to enable the watchdog thread to check
   // the timeout or nccl error because the main thread would throw an exception
   // and it is the user's responsibility to handle the exception.
   if (!blockingWait_) {
     ncclCommWatchdogThread_ =
         std::thread(&ProcessGroupNCCL::ncclCommWatchdog, this);
   }
-#endif // ENABLE_NCCL_ERROR_CHECKING

   init();
   const std::string OFF = "OFF";
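The local NCCL_HAS_AVG definition in the anonymous namespace is removed in favor of the presence-only definition that now lives in torch/csrc/cuda/nccl.h, so code that tests it with #ifdef keeps working unchanged. An illustrative sketch of that gating pattern (the helper is hypothetical, not from this file):

#include <nccl.h>

// Hypothetical helper: pick the reduction op for ReduceOp::AVG depending on
// whether the relocated presence macro is defined.
ncclRedOp_t avgOpOrFallback() {
#ifdef NCCL_HAS_AVG
  return ncclAvg;  // native average, available in NCCL 2.10+
#else
  return ncclSum;  // caller divides by world size afterwards
#endif
}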

torch/csrc/distributed/c10d/quantization/quantization_gpu.cu

+3 -3

@@ -66,7 +66,7 @@ at::Tensor _float_to_bfloat16_cuda(const at::Tensor& input) {

   auto output = at::empty(
       {nrows, ncols},
-#if HAS_NCCL_BF16_DATATYPE
+#ifdef HAS_NCCL_BF16_DATATYPE
       input.options().dtype(at::kBFloat16));
 #else
       input.options().dtype(at::kHalf));
@@ -92,7 +92,7 @@ at::Tensor _float_to_bfloat16_cuda(const at::Tensor& input) {
       input.const_data_ptr<float>(),
       nrows,
       ncols,
-#if HAS_NCCL_BF16_DATATYPE
+#ifdef HAS_NCCL_BF16_DATATYPE
       reinterpret_cast<uint16_t*>(output.mutable_data_ptr<at::BFloat16>())
 #else
       reinterpret_cast<uint16_t*>(output.mutable_data_ptr<at::Half>())
@@ -137,7 +137,7 @@ at::Tensor _bfloat16_to_float_cuda(const at::Tensor& input) {
       blockDim,
       0,
       at::cuda::getCurrentCUDAStream()>>>(
-#if HAS_NCCL_BF16_DATATYPE
+#ifdef HAS_NCCL_BF16_DATATYPE
       reinterpret_cast<const uint16_t*>(input.const_data_ptr<at::BFloat16>()),
 #else
       reinterpret_cast<const uint16_t*>(input.const_data_ptr<at::Half>()),
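These kernels keep their existing half-precision fallback when bfloat16 is unavailable; only the form of the check changes. A hypothetical round-trip usage sketch, where the header path and namespace are assumptions based on this file's location, and the packed dtype (at::kBFloat16 vs at::kHalf) follows the #ifdef blocks above:

#include <ATen/ATen.h>
#include <torch/csrc/distributed/c10d/quantization/quantization_gpu.h>

void roundTripExample() {
  at::Tensor src = at::rand({4, 8}, at::device(at::kCUDA).dtype(at::kFloat));
  // Packed dtype is at::kBFloat16 when HAS_NCCL_BF16_DATATYPE was defined at
  // build time, at::kHalf otherwise.
  at::Tensor packed =
      torch::distributed::c10d::quantization::_float_to_bfloat16_cuda(src);
  at::Tensor restored =
      torch::distributed::c10d::quantization::_bfloat16_to_float_cuda(packed);
}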

0 commit comments
