@@ -1162,6 +1162,10 @@ void ProcessGroupNCCL::abortCommsFromMap(
1162
1162
at::cuda::OptionalCUDAGuard gpuGuard;
1163
1163
at::DeviceIndex deviceIndex = getIndexFromDeviceKey (devName);
1164
1164
if (deviceIndex >= 0 ) {
1165
+ // For P2P comms, the deviceIndex could be -1 (invalid), as the keys in
1166
+ // the map may be rank-to-rank identifiers rather than device indices. So we
1167
+ // indeed need to check if deviceIndex >= 0
1168
+ // TODO: fix `getIndexFromDeviceKey` or fix `DeviceKey`
1165
1169
gpuGuard.set_index (deviceIndex);
1166
1170
}
1167
1171
LOG (INFO) << logPrefix () << " ProcessGroupNCCL destroying ncclComm_ "
@@ -2162,7 +2166,9 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
2162
2166
bool batchP2P = ncclActiveGroupCounter_ > 0 ;
2163
2167
bool singleP2POp = isP2POp (opType, batchP2P);
2164
2168
2165
- at::cuda::OptionalCUDAGuard gpuGuard;
2169
+ // Get the device index
2170
+ auto deviceIndex = device.index ();
2171
+ at::cuda::OptionalCUDAGuard gpuGuard (device);
2166
2172
2167
2173
// [Group Start/End Note] This is used to ensure that nccl communicator will
2168
2174
// be created before communication primitives are called. Let's look at this
@@ -2202,10 +2208,6 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
2202
2208
rank = p2pRank;
2203
2209
}
2204
2210
2205
- // Get the device index
2206
- auto deviceIndex = device.index ();
2207
- gpuGuard.set_index (deviceIndex);
2208
-
2209
2211
#ifdef NCCL_HAS_COMM_SPLIT
2210
2212
if (options_->split_from ) {
2211
2213
TORCH_CHECK (
@@ -2715,7 +2717,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
2715
2717
work->stashed_for_allocator_safety_ ->push_back (input);
2716
2718
}
2717
2719
2718
- at::cuda::OptionalCUDAGuard gpuGuard;
2720
+ at::cuda::OptionalCUDAGuard gpuGuard (device) ;
2719
2721
2720
2722
if (nanCheck) {
2721
2723
checkForNan (input, ncclStream);
@@ -2880,7 +2882,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
2880
2882
std::make_shared<std::vector<at::Tensor>>(inputs);
2881
2883
}
2882
2884
2883
- at::cuda::OptionalCUDAGuard gpuGuard;
2885
+ at::cuda::OptionalCUDAGuard gpuGuard (device) ;
2884
2886
2885
2887
// Start event should only be recorded before the ncclGroupStart() (which
2886
2888
// happens inside AutoNcclGroup guard below)
@@ -3148,7 +3150,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
3148
3150
}
3149
3151
3150
3152
// Is gpuGuard needed for the if block below, or can I swap them?
3151
- at::cuda::OptionalCUDAGuard gpuGuard;
3153
+ at::cuda::OptionalCUDAGuard gpuGuard (device) ;
3152
3154
3153
3155
// Only check for NaN for send ops, for recv ops `tensor` can be a random
3154
3156
// placeholder
0 commit comments