8000 [RFC][BE] assume error checking is on by default (#141914) · pytorch/pytorch@f874f37 · GitHub
[go: up one dir, main page]

Skip to content

Commit f874f37

Browse files
c-p-i-ofacebook-github-bot
authored andcommitted
[RFC][BE] assume error checking is on by default (#141914)
Summary: Remove conditional MACRO `ENABLE_NCCL_ERROR_CHECKING` and assume that error checking is always on. These checks were wrapped in a macro because older NCCL libraries didn't have the pre-requisite functions to do error checks. This check was put in several years ago. Pull request #142023 adds a static_assert that NCCL version should be 2.7 or above for PyTorch to work. 2.4 released about 2 years ago so it's relatively safe to assume that everyone has upgraded. Assume that the world has all upgraded to later version of NCCL library. Release note for PyTorch must specify that going forward, PyTorch will only work with NCCL version 2.7 and above. Test Plan: Unit tests. cc H-Huang awgu kwen2501 wanchaol fegin fduwjj wz337 wconstab d4l3k Reviewed By: wconstab, fduwjj, kwen2501 Differential Revision: D66672062 Pulled By: c-p-i-o
1 parent 1d091e4 commit f874f37

File tree

3 files changed

+2
-13
lines changed

3 files changed

+2
-13
lines changed

test/distributed/test_c10d_nccl.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,8 @@ def test_abort_pg(self):
335335
# Disable ASYNC_ERROR_HANDLING for this test to ensure we can programmatically
336336
# abort the process group.
337337
os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
338+
# set heartbeat timeout to low value
339+
os.environ["TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC"] = "5"
338340

339341
store = c10d.FileStore(self.file_name, self.world_size)
340342
self._create_process_group_nccl(store, self.opts())

torch/csrc/distributed/c10d/NCCLUtils.hpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,15 +43,6 @@ static_assert(
4343
(NCCL_MAJOR == 2 && NCCL_MINOR >= 7) || (NCCL_MAJOR > 2),
4444
"NCCL version must be 2.7 or later");
4545

46-
// Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort()
47-
// and ncclCommGetAsyncError() are not supported in earlier versions.
48-
#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \
49-
(NCCL_MINOR >= 4)
50-
#define ENABLE_NCCL_ERROR_CHECKING
51-
#elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3)
52-
#define ENABLE_NCCL_ERROR_CHECKING
53-
#endif
54-
5546
// P2P is enabled only for NCCL versions 2.7+ since ncclSend()
5647
// and ncclRecv() are not supported in earlier versions.
5748
#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -918,10 +918,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
918918
PrefixStore* prefixStore = dynamic_cast<PrefixStore*>(store_.get());
919919
globalStore_ =
920920
prefixStore ? prefixStore->getUnderlyingNonPrefixStore() : store_;
921-
#ifdef ENABLE_NCCL_ERROR_CHECKING
922921
enableTiming_.store(
923922
getCvarBool(TORCH_NCCL_ENABLE_TIMING, false) || desyncDebug_);
924-
#endif
925923
avoidRecordStreams_ = getCvarBool(TORCH_NCCL_AVOID_RECORD_STREAMS, false);
926924
#ifdef NCCL_HAS_COMM_REGISTER
927925
useTensorRegisterAllocatorHook_ =
@@ -950,15 +948,13 @@ ProcessGroupNCCL::ProcessGroupNCCL(
950948
}
951949
}
952950

953-
#ifdef ENABLE_NCCL_ERROR_CHECKING
954951
// in blockingWait mode, we don't need to enable the watchdog thread to check
955952
// the timeout or nccl error because the main thread would throw an exception
956953
// and it is the user's responsibility to handle the exception.
957954
if (!blockingWait_) {
958955
ncclCommWatchdogThread_ =
959956
std::thread(&ProcessGroupNCCL::ncclCommWatchdog, this);
960957
}
961-
#endif
962958

963959
init();
964960
const std::string OFF = "OFF";

0 commit comments

Comments
 (0)
0