-
Notifications
You must be signed in to change notification settings - Fork 24.8k
[c10d] Remove Option for ProcessGroup and Expose backend Options to reflect the correct code structure #132931
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
ba4354f
04a9472
fe2d504
7396198
144800c
06bd161
951c247
45e7d67
37ab0e9
13359b2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
…ptions to reflect the correct code structure" We introduced the dispatchable backend for a ProcessGroup and collective in #86225. This PR is a follow-up cleanup to clean up the option of a ProcessGroup and ask users to either set timeout or backend later on or directly create backend after creating a PG. Also PGNCCL is using option class from ProcessGroup but we actually should use Option from backend class. So this PR is to make the type or name to be aligned with what we are doing in cpp side. I don't change the signature for the public API, so they still use args named "pg_options" cc XilunWu H-Huang awgu kwen2501 wanchaol fegin wz337 wconstab d4l3k c-p-i-o ezyang gchanan [ghstack-poisoned]
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,6 +63,7 @@ | |
"BroadcastOptions", | ||
"BuiltinCommHookType", | ||
"Callable", | ||
"C10dBackend", | ||
"DebugLevel", | ||
"Dict", | ||
"Enum", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -145,7 +145,6 @@ def _export_c_types() -> None: | |
AllToAllOptions, | ||
BarrierOptions, | ||
BroadcastOptions, | ||
C10dBackend, | ||
GatherOptions, | ||
PrefixStore, | ||
ProcessGroup, | ||
|
@@ -269,6 +268,7 @@ class Backend(str): | |
GLOO: ProcessGroup.BackendType.GLOO, | ||
NCCL: ProcessGroup.BackendType.NCCL, | ||
UCC: ProcessGroup.BackendType.UCC, | ||
MPI: ProcessGroup.BackendType.MPI, | ||
} | ||
|
||
def __new__(cls, name: str): | ||
|
@@ -1714,6 +1714,8 @@ def _new_process_group_helper( | |
group_rank, | ||
group_size, | ||
) | ||
assert backend in Backend.backend_type_map, f"Unknown backend type {backend}" | ||
pg._set_default_backend(Backend.backend_type_map[backend]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if backend isn't provided (e.g. I'm not sure if we have tests that cover these cases, but could be worthwhile to check. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The backend is not optional for this function. Do we see the use case when |
||
if device_id: | ||
pg.bound_device_id = device_id | ||
backend_config = BackendConfig(backend) | ||
|
@@ -4451,7 +4453,9 @@ def split_group( | |
group_rank, | ||
len(my_group), | ||
) | ||
backend_type = ProcessGroup.BackendType.NCCL | ||
pg.bound_device_id = device_id | ||
pg._set_default_backend(backend_type) | ||
|
||
pg_options._timeout = timeout | ||
pg_options.split_from = parent_backend | ||
|
@@ -4461,7 +4465,6 @@ def split_group( | |
backend_class = ProcessGroupNCCL( | ||
prefix_store, group_rank, len(my_group), pg_options | ||
) | ||
backend_type = ProcessGroup.BackendType.NCCL | ||
backend_class._set_sequence_number_for_group() | ||
|
||
pg._register_backend(torch.device("cuda"), backend_type, backend_class) | ||
|
@@ -4608,6 +4611,7 @@ def _new_group_with_tag( | |
if not backend: | ||
backend = default_backend | ||
backend = Backend(backend) | ||
print(backend) | ||
fduwjj marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# this timeout defaulting/validation is used for all the new_groups/new_subgroups variants, | ||
# which may just pass their timeout value (or None) | ||
|
Uh oh!
There was an error while loading. Please reload this page.