elastic: do not shutdown rendezvous on leaving workers #152525
Pull Request by georgkaleido · pytorch/pytorch

Closed · wants to merge 2 commits
Changes from all commits
test/distributed/elastic/agent/server/test/api_test.py (4 changes: 1 addition & 3 deletions)

@@ -127,9 +127,7 @@ def __init__(self, spec):
         self.stop_workers_call_count = 0
         self.start_workers_call_count = 0

-    def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
-    ) -> None:
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
         # workers are fake, nothing to stop; just clear the rdzv info
         worker_group.group_rank = None
         worker_group.group_world_size = None
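To make the new test-hook shape concrete, here is a minimal, self-contained sketch (illustrative only, not code from this PR); `_FakeAgentHooks` and the `SimpleNamespace` stand-in for a `WorkerGroup` are invented for the example:

```python
from types import SimpleNamespace


class _FakeAgentHooks:
    """Invented stand-in mirroring the simplified test hook."""

    def _stop_workers(self, worker_group) -> None:
        # Workers are fake, so there is nothing to stop; only the
        # rendezvous-assigned bookkeeping on the group is cleared.
        worker_group.group_rank = None
        worker_group.group_world_size = None


wg = SimpleNamespace(group_rank=0, group_world_size=4)
_FakeAgentHooks()._stop_workers(wg)
assert wg.group_rank is None and wg.group_world_size is None
```

The only observable effect of stopping the fake workers is that the group's rendezvous-assigned fields are cleared; there is no longer a separate `is_restart` code path to exercise.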
torch/distributed/elastic/agent/server/api.py (10 changes: 3 additions & 7 deletions)

@@ -457,9 +457,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:
         raise NotImplementedError

     @abc.abstractmethod
-    def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
-    ) -> None:
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
         r"""Stop all workers in the given worker group.

         Implementors must deal with workers in all states defined by
@@ -477,9 +475,7 @@ def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
         raise NotImplementedError

     @abc.abstractmethod
-    def _shutdown(
-        self, death_sig: signal.Signals = signal.SIGTERM, is_restart: bool = False
-    ) -> None:
+    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
         """Clean up any resources that were allocated during the agent's work.

         Args:
@@ -698,7 +694,7 @@ def _restart_workers(self, worker_group: WorkerGroup) -> None:
         """Restart (stops, rendezvous, starts) all local workers in the group."""
         role = worker_group.spec.role
         logger.info("[%s] Stopping worker group", role)
-        self._stop_workers(worker_group, is_restart=True)
+        self._stop_workers(worker_group)
         worker_group.state = WorkerState.STOPPED
         self._initialize_workers(worker_group)

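Since the abstract signatures above are the override surface for custom agents, a minimal skeleton of a subclass under the new API might look like the following. This is a sketch only: it assumes these four hooks remain the primary abstract methods of `SimpleElasticAgent`, and a real agent must actually launch, monitor, and tear down worker processes.

```python
import signal
from typing import Any

from torch.distributed.elastic.agent.server.api import (
    RunResult,
    SimpleElasticAgent,
    WorkerGroup,
)


class SketchAgent(SimpleElasticAgent):
    """Skeleton only: shows the simplified hook signatures, not real process management."""

    def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:
        # Would launch processes and return a map of local_rank -> worker id.
        return {}

    def _stop_workers(self, worker_group: WorkerGroup) -> None:
        # No is_restart flag: restarts and final stops share one code path.
        pass

    def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
        # Would poll the launched processes and report their state.
        raise NotImplementedError

    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
        # Release local resources only; closing the rendezvous is no longer
        # decided inside this hook.
        pass
```

The design point is that restarts and final stops now flow through the same `_stop_workers` path; whether the rendezvous itself is closed is no longer a concern of these hooks.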
torch/distributed/elastic/agent/server/local_elastic_agent.py (12 changes: 3 additions & 9 deletions)

@@ -280,10 +280,8 @@ def _log_watchdog_event(
     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
     @prof
-    def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
-    ) -> None:
-        self._shutdown(is_restart=is_restart)
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
+        self._shutdown()

     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
@@ -359,9 +357,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:

         return self._pcontext.pids()

-    def _shutdown(
-        self, death_sig: signal.Signals = signal.SIGTERM, is_restart: bool = False
-    ) -> None:
+    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
         if self._worker_watchdog is not None:
             self._worker_watchdog.stop()
             self._worker_watchdog = None
@@ -370,8 +366,6 @@ def _shutdown(
             self._health_check_server = None
         if self._pcontext:
             self._pcontext.close(death_sig)
-        if not is_restart and self._rdzv_handler:
-            self._rdzv_handler.shutdown()

     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
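The behavioral intent is easiest to see in isolation. The following is a self-contained toy (not PyTorch code; `_ToyAgent` is invented) that mirrors the post-change `_shutdown` logic: local resources are released, while the rendezvous handler is left untouched, matching the title's intent that a leaving worker should not tear down the rendezvous for the remaining ones.

```python
from unittest import mock


class _ToyAgent:
    """Invented stand-in that mirrors the post-change _shutdown behavior."""

    def __init__(self, pcontext, rdzv_handler):
        self._pcontext = pcontext
        self._rdzv_handler = rdzv_handler

    def _shutdown(self, death_sig=None) -> None:
        if self._pcontext:
            self._pcontext.close(death_sig)
        # Note: no self._rdzv_handler.shutdown() call here anymore.


pcontext, rdzv = mock.Mock(), mock.Mock()
_ToyAgent(pcontext, rdzv)._shutdown()
assert pcontext.close.called      # local worker processes are closed
assert not rdzv.shutdown.called   # the rendezvous stays up for other nodes
```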