[3/N] Set correct device to CUDA guards (#134357) · pytorch/pytorch@26aea27 · GitHub

Commit 26aea27

Authored by kwen2501 and committed by pytorchmergebot

[3/N] Set correct device to CUDA guards (#134357)

In `collective()`, `pointToPoint()` and `collectiveCoalesced()`, CUDA guards were created with an unset (default) CUDA device. This is the reason for the illegal memory access (IMA) hit by the NaN checker in issue #134062. With this fix, `torch.cuda.set_device(device)` is no longer needed to work around the IMA.

Also refactored a couple of places where the guard is created -- preferably we create the guard with a known device rather than setting the device later.

Pull Request resolved: #134357
Approved by: https://github.com/wconstab, https://github.com/shuqiangzhang
ghstack dependencies: #134345

1 parent d503217 · commit 26aea27
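For context on the change below, here is a minimal, hypothetical sketch (not part of this commit) of the guard behavior the fix relies on: a default-constructed at::cuda::OptionalCUDAGuard sets no device, so the thread's current device stays whatever it was before, whereas constructing the guard with the known device switches the current device for the scope and restores it afterwards. The function name guard_demo is invented for illustration; the guard and device-query calls are the c10 ones the diff itself uses.

// Hypothetical sketch; assumes a multi-GPU host with CUDA available.
#include <c10/core/Device.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAGuard.h>
#include <iostream>

void guard_demo(const c10::Device& device) {
  {
    // Default-constructed guard: no device is set, so kernels issued here
    // for `device` (e.g. a NaN-check kernel) may run in the wrong context.
    c10::cuda::OptionalCUDAGuard unsetGuard;
    std::cout << "current device: "
              << static_cast<int>(c10::cuda::current_device()) << std::endl;
  }
  {
    // Guard constructed with the known device: the current device becomes
    // device.index() for this scope and is restored on destruction.
    c10::cuda::OptionalCUDAGuard gpuGuard(device);
    std::cout << "current device: "
              << static_cast<int>(c10::cuda::current_device()) << std::endl;
  }
}

Because the guard now carries the device at construction time, callers no longer need the `torch.cuda.set_device(device)` workaround mentioned in the commit message.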

File tree

2 files changed: +29 −8 lines

test/distributed/test_c10d_nccl.py

Lines changed: 19 additions & 0 deletions

@@ -350,6 +350,7 @@ def test_close_pg(self):
     @parametrize("type", [torch.float16, torch.float32, torch.float64, torch.bfloat16])
     @skip_if_rocm
     def test_nan_assert(self, type):
+        # Expecting a device-side error when NaN is detected
         os.environ["TORCH_NCCL_NAN_CHECK"] = "1"
         store = c10d.FileStore(self.file_name, self.world_size)
         pg = self._create_process_group_nccl(store, self.opts())
@@ -388,6 +389,24 @@ def test_nan_p2p(self):
         # reset env
         os.environ["TORCH_NCCL_NAN_CHECK"] = "0"
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_nan_check(self):
+        # Not expecting an error, NaN check should not make legit code fail
+        os.environ["TORCH_NCCL_NAN_CHECK"] = "1"
+        store = c10d.FileStore(self.file_name, self.world_size)
+        device = torch.device("cuda:%d" % self.rank)
+        c10d.init_process_group(
+            backend="nccl", store=store, rank=self.rank, world_size=self.world_size
+        )
+        x = torch.ones((10,), dtype=torch.bfloat16, device=device) * self.rank
+        t = torch.ones(3, 4, dtype=torch.bfloat16, device=device)
+        c10d.broadcast(x, src=0)
+        c10d.all_reduce(t)
+        c10d.destroy_process_group()
+        # reset env
+        os.environ["TORCH_NCCL_NAN_CHECK"] = "0"
+
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
     def test_destruct_before_terminate_pg(self):

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 10 additions & 8 deletions

@@ -1149,6 +1149,10 @@ void ProcessGroupNCCL::abortCommsFromMap(
     at::cuda::OptionalCUDAGuard gpuGuard;
     at::DeviceIndex deviceIndex = getIndexFromDeviceKey(devName);
     if (deviceIndex >= 0) {
+      // For P2P comms, the deviceIndex could be -1 (invalid), as the keys in
+      // the map could be non deviceIndex, but rank to rank numbers. So we
+      // indeed need to check if deviceIndex >= 0
+      // TODO: fix `getIndexFromDeviceKey` or fix `DeviceKey`
       gpuGuard.set_index(deviceIndex);
     }
     LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ "
@@ -2142,7 +2146,9 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
   bool batchP2P = ncclActiveGroupCounter_ > 0;
   bool singleP2POp = isP2POp(opType, batchP2P);
 
-  at::cuda::OptionalCUDAGuard gpuGuard;
+  // Get the device index
+  auto deviceIndex = device.index();
+  at::cuda::OptionalCUDAGuard gpuGuard(device);
 
   // [Group Start/End Note] This is used to ensure that nccl communicator will
   // be created before communication primitives are called. Let's look at this
@@ -2182,10 +2188,6 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
     rank = p2pRank;
   }
 
-  // Get the device index
-  auto deviceIndex = device.index();
-  gpuGuard.set_index(deviceIndex);
-
 #ifdef NCCL_HAS_COMM_SPLIT
   if (options_->split_from) {
     TORCH_CHECK(
@@ -2695,7 +2697,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
       work->stashed_for_allocator_safety_->push_back(input);
     }
 
-  at::cuda::OptionalCUDAGuard gpuGuard;
+  at::cuda::OptionalCUDAGuard gpuGuard(device);
 
   if (nanCheck) {
     checkForNan(input, ncclStream);
@@ -2860,7 +2862,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
         std::make_shared<std::vector<at::Tensor>>(inputs);
   }
 
-  at::cuda::OptionalCUDAGuard gpuGuard;
+  at::cuda::OptionalCUDAGuard gpuGuard(device);
 
   // Start event should only be recorded before the ncclGroupStart() (which
   // happens inside AutoNcclGroup guard below)
@@ -3128,7 +3130,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
   }
 
   // is gpuGuard needed for the if block below, or can i swap them
-  at::cuda::OptionalCUDAGuard gpuGuard;
+  at::cuda::OptionalCUDAGuard gpuGuard(device);
 
   // Only check for NaN for send ops, for recv ops `tensor` can be a random
   // placeholder
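The NaN-checker path in `collective()` is where the unset guard hurt most: a kernel is launched on a stream owned by `device`, and if the thread's current device is still the default one, the launch targets the wrong context and can produce the illegal memory access reported in #134062. Below is a hypothetical CUDA sketch of that pattern; the kernel `zeroFill` and the wrapper `launchOn` are invented for illustration and are not the actual `checkForNan` implementation.

// Hypothetical sketch of launching work on another device's stream.
#include <c10/core/Device.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

__global__ void zeroFill(float* data, int64_t numel) {
  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  if (i < numel) {
    data[i] = 0.0f;
  }
}

// `data` is assumed to point to memory allocated on `device`.
void launchOn(const c10::Device& device, float* data, int64_t numel) {
  // Without this guard, the launch below is issued in the thread's current
  // CUDA context (often device 0) while `data` and the stream belong to
  // `device` -- the mismatch behind the reported IMA.
  c10::cuda::OptionalCUDAGuard gpuGuard(device);

  // Current stream for the target device.
  auto stream = c10::cuda::getCurrentCUDAStream(device.index());
  const int threads = 256;
  const int blocks = static_cast<int>((numel + threads - 1) / threads);
  zeroFill<<<blocks, threads, 0, stream.stream()>>>(data, numel);
}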
