[c10d] Fix `new_subgroups(group=)` bug (#153798) · pytorch/pytorch@6487ea3 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6487ea3

Browse files
tsunghsienlee authored and pytorchmergebot committed
[c10d] Fix new_subgroups(group=) bug (#153798)
Summary: The bug, introduced in #152765, was caused by passing the `group` parameter to the `get_rank()` function, which caused the function to return the rank of the entire group instead of the rank of the current process. The fix involves removing the `group` parameter from the `get_rank()` function call.

Test Plan: contbuild & OSS CI

Differential Revision: D74964213

Pull Request resolved: #153798

Approved by: https://github.com/Skylion007
1 parent b0e5402 commit 6487ea3

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

torch/distributed/distributed_c10d.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5455,7 +5455,7 @@ def new_subgroups(
54555455
)
54565456
subgroups.append(subgroup)
54575457

5458-
if rank := get_rank(group=group) in ranks_in_subgroup:
5458+
if rank := get_rank() in ranks_in_subgroup:
54595459
cur_subgroup = subgroup
54605460
logger.info("Rank %s is assigned to subgroup %s", rank, ranks_in_subgroup)
54615461

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,40 @@ def test_new_subgroups(self):
915915
for subgroup in subgroups:
916916
dist.destroy_process_group(subgroup)
917917

918+
@skip_but_pass_in_sandcastle_if(
919+
BACKEND not in DistTestCases.backend_feature["subgroup"],
920+
f"The {BACKEND} backend does not support creating subgroups on CUDA devices",
921+
)
922+
@require_world_size(4)
923+
@skip_if_lt_x_gpu(4)
924+
def test_new_subgroups_with_group_param(self):
925+
# Initialize global test environment
926+
self._init_global_test()
927+
# Set up GPU devices for each rank
928+
init_multigpu_helper(dist.get_world_size(), BACKEND)
929+
# Create two subgroups: one with ranks [0,2] and another with ranks [1,3]
930+
cur_subgroup, subgroups = dist.new_subgroups_by_enumeration(
931+
ranks_per_subgroup_list=[[0, 2], [1, 3]]
932+
)
933+
934+
# Further divide the current subgroup into sub-subgroups of size 1
935+
cur_sub_subgroup, sub_subgroups = dist.new_subgroups(
936+
group_size=1, group=cur_subgroup
937+
)
938+
# Verify we have 2 sub-subgroups (one for each rank in the original subgroup)
939+
self.assertEqual(len(sub_subgroups), 2)
940+
# Verify the current process's sub-subgroup has size 1
941+
self.assertEqual(cur_sub_subgroup.size(), 1)
942+
# Verify the current process is in its assigned sub-subgroup
943+
self.assertFalse(dist._rank_not_in_group(group=cur_sub_subgroup))
944+
945+
# Clean up by destroying all created process groups
946+
for sub_subgroup in sub_subgroups:
947+
dist.destroy_process_group(sub_subgroup)
948+
949+
for subgroup in subgroups:
950+
dist.destroy_process_group(subgroup)
951+
918952
@skip_but_pass_in_sandcastle_if(
919953
BACKEND not in DistTestCases.backend_feature["subgroup"],
920954
f"The {BACKEND} backend does not support creating subgroups on CUDA devices",

0 commit comments

Comments (0)