elastic: do not shutdown rendezvous on leaving workers · georgkaleido/pytorch@df77163 · GitHub

Commit df77163

georgkaleido authored and pytorchmergebot committed
elastic: do not shutdown rendezvous on leaving workers
In pytorch#117066, a shutdown of the rendezvous was added when a worker shuts down. This is incorrect, because the rendezvous is actually shut down in [this file](https://github.com/pytorch/pytorch/blob/fa6f9eb2be07f6289d2ab4e781077f7fc75dbe55/torch/distributed/launcher/api.py#L290), but it should not be shut down when a signal is received. See also [this pull request](pytorch#67749). pytorch#124819 then tried to remediate the situation by fixing the faulty shutdown for the restart case, but that fix only takes effect when the agent restarts the training, not when the rendezvous was already shut down beforehand. Removing both of these changes restores the original behavior: the rendezvous should only be shut down when a run completes or fails, not when a single worker leaves.
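To make the intended lifecycle explicit, here is a minimal sketch of the control flow. It is not the actual PyTorch implementation; the shapes of `launch_agent` and `_restart_workers` loosely follow torch/distributed/launcher/api.py and torch/distributed/elastic/agent/server/api.py, but the names and bodies below are simplified assumptions:

```python
# Minimal sketch (assumed names, simplified flow): the rendezvous is torn down
# only when the overall run completes or fails, never because a single worker
# stops or the agent restarts its worker group.

def launch_agent(agent, rdzv_handler):
    try:
        # The agent may restart its workers any number of times internally.
        return agent.run()
    finally:
        # Correct place for the shutdown: the elastic run as a whole is over.
        rdzv_handler.shutdown()


class SketchAgent:
    def _restart_workers(self, worker_group):
        # Stopping workers for a restart (or because one worker left) must NOT
        # shut down the rendezvous; the surviving/restarted workers still need
        # it to re-rendezvous.
        self._stop_workers(worker_group)
        self._initialize_workers(worker_group)
```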
1 parent 7243c69 · commit df77163

File tree

3 files changed: +7 -9 lines changed

test/distributed/elastic/agent/server/test/api_test.py

Lines changed: 1 addition & 1 deletion

@@ -128,7 +128,7 @@ def __init__(self, spec):
         self.start_workers_call_count = 0
 
     def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
+        self, worker_group: WorkerGroup
     ) -> None:
         # workers are fake, nothing to stop; just clear the rdzv info
         worker_group.group_rank = None

torch/distributed/elastic/agent/server/api.py

Lines changed: 3 additions & 3 deletions

@@ -458,7 +458,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:
 
     @abc.abstractmethod
     def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
+        self, worker_group: WorkerGroup
     ) -> None:
         r"""Stop all workers in the given worker group.
 
@@ -478,7 +478,7 @@ def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
 
     @abc.abstractmethod
     def _shutdown(
-        self, death_sig: signal.Signals = signal.SIGTERM, is_restart: bool = False
+        self, death_sig: signal.Signals = signal.SIGTERM
     ) -> None:
         """Clean up any resources that were allocated during the agent's work.
 
@@ -698,7 +698,7 @@ def _restart_workers(self, worker_group: WorkerGroup) -> None:
         """Restart (stops, rendezvous, starts) all local workers in the group."""
         role = worker_group.spec.role
         logger.info("[%s] Stopping worker group", role)
-        self._stop_workers(worker_group, is_restart=True)
+        self._stop_workers(worker_group)
         worker_group.state = WorkerState.STOPPED
         self._initialize_workers(worker_group)
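As a usage illustration of the simplified abstract interface, a custom agent subclass now overrides `_stop_workers` and `_shutdown` without the removed `is_restart` flag. This is a hypothetical, incomplete sketch (only the two affected methods are shown; a real subclass also needs `_start_workers` and `_monitor_workers`):

```python
import signal

# SimpleElasticAgent and WorkerGroup are the real classes touched by this diff;
# the subclass itself is hypothetical and only illustrates the new signatures.
from torch.distributed.elastic.agent.server.api import SimpleElasticAgent, WorkerGroup


class MyAgent(SimpleElasticAgent):
    def _stop_workers(self, worker_group: WorkerGroup) -> None:
        # No is_restart flag anymore: stopping workers is the same operation
        # whether the run is ending or the group is about to be restarted.
        self._shutdown()

    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
        # Tear down per-agent resources only; the rendezvous is owned by the
        # launcher and is shut down there once the whole run completes or fails.
        ...
```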

torch/distributed/elastic/agent/server/local_elastic_agent.py

Lines changed: 3 additions & 5 deletions

@@ -281,9 +281,9 @@ def _log_watchdog_event(
     # `torch.distributed.elastic.metrics.prof`.
     @prof
     def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
+        self, worker_group: WorkerGroup
     ) -> None:
-        self._shutdown(is_restart=is_restart)
+        self._shutdown()
 
     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
@@ -360,7 +360,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:
         return self._pcontext.pids()
 
     def _shutdown(
-        self, death_sig: signal.Signals = signal.SIGTERM, is_restart: bool = False
+        self, death_sig: signal.Signals = signal.SIGTERM
     ) -> None:
         if self._worker_watchdog is not None:
             self._worker_watchdog.stop()
@@ -370,8 +370,6 @@ def _shutdown(
             self._health_check_server = None
         if self._pcontext:
             self._pcontext.close(death_sig)
-        if not is_restart and self._rdzv_handler:
-            self._rdzv_handler.shutdown()
 
     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.

0 commit comments
