@@ -364,11 +364,20 @@ static void attachAllocatorHooks() {
364
364
static std::
365
365
unordered_map<std::string, std::unordered_map<std::string, std::string>>
366
366
getNCCLCommDumpMap () {
367
- #if (defined(IS_NCCLX) || defined(USE_ROCM)) && defined(NCCL_COMM_DUMP)
367
+ #if (defined(IS_NCCLX) || defined(USE_ROCM)) && defined(NCCL_COMM_DUMP) && \
368
+ defined (NCCL_COMM_GET_UNIQUE_HASH)
368
369
std::unordered_map<
369
- std::string /* ncclUniqueID */ ,
370
+ std::string /* CommHash */ ,
370
371
std::unordered_map<std::string, std::string> /* dump from this comm */ >
371
372
ncclDumpMap;
373
+ #ifdef NCCL_COMM_DUMP_ALL
374
+ auto res = ncclCommDumpAll (ncclDumpMap);
375
+ if (res == ncclSuccess) {
376
+ return ncclDumpMap;
377
+ }
378
+ // Fall back to dumping from each comm individually if ncclCommDumpAll failed
379
+ #endif // NCCL_COMM_DUMP_ALL
380
+
372
381
// dump_nccl_trace is only called from the default PG (local_id_=0), but we
373
382
// want to dump from all comms so we need to iterate over ncclCommMemPoolMap,
374
383
// which is static
@@ -382,8 +391,11 @@ static std::
382
391
}
383
392
}
384
393
for (auto & ncclComm : allNCCLComms) {
385
- std::string ncclUniqueIDStr = buildNcclUniqueIdStr (ncclComm->getNcclId ());
386
- ncclDumpMap[ncclUniqueIDStr] = ncclComm->ncclCommDump ();
394
+ std::stringstream ss;
395
+ ss << std::hex << ncclComm->getNcclUniqueHash ();
396
+ std::string ncclUniqueHashStr = ss.str ();
397
+
398
+ ncclDumpMap[ncclUniqueHashStr] = ncclComm->ncclCommDump ();
387
399
}
388
400
return ncclDumpMap;
389
401
#else
0 commit comments