elastic: do not shutdown rendezvous on leaving workers #152525
Pull Request by georgkaleido · pytorch/pytorch

Closed · wants to merge 2 commits
Changes from all commits
test/distributed/elastic/agent/server/test/api_test.py (4 changes: 1 addition & 3 deletions)

@@ -127,9 +127,7 @@ def __init__(self, spec):
         self.stop_workers_call_count = 0
         self.start_workers_call_count = 0

-    def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
-    ) -> None:
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
         # workers are fake, nothing to stop; just clear the rdzv info
         worker_group.group_rank = None
         worker_group.group_world_size = None
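To make the new test-hook shape concrete, here is a minimal, self-contained sketch (illustrative only, not code from this PR); `_FakeAgentHooks` and the `SimpleNamespace` stand-in for a `WorkerGroup` are invented for the example:

```python
from types import SimpleNamespace


class _FakeAgentHooks:
    """Invented stand-in mirroring the simplified test hook."""

    def _stop_workers(self, worker_group) -> None:
        # Workers are fake, so there is nothing to stop; only the
        # rendezvous-assigned bookkeeping on the group is cleared.
        worker_group.group_rank = None
        worker_group.group_world_size = None


wg = SimpleNamespace(group_rank=0, group_world_size=4)
_FakeAgentHooks()._stop_workers(wg)
assert wg.group_rank is None and wg.group_world_size is None
```

The only observable effect of stopping the fake workers is that the group's rendezvous-assigned fields are cleared; there is no longer a separate `is_restart` code path to exercise.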
torch/distributed/elastic/agent/server/api.py (10 changes: 3 additions & 7 deletions)

@@ -457,9 +457,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:
         raise NotImplementedError

     @abc.abstractmethod
-    def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
-    ) -> None:
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
         r"""Stop all workers in the given worker group.

         Implementors must deal with workers in all states defined by
@@ -477,9 +475,7 @@ def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
         raise NotImplementedError

     @abc.abstractmethod
-    def _shutdown(
-        self, death_sig: signal.Signals = signal.SIGTERM, is_restart: bool = False
-    ) -> None:
+    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
         """Clean up any resources that were allocated during the agent's work.

         Args:
@@ -698,7 +694,7 @@ def _restart_workers(self, worker_group: WorkerGroup) -> None:
         """Restart (stops, rendezvous, starts) all local workers in the group."""
         role = worker_group.spec.role
         logger.info("[%s] Stopping worker group", role)
-        self._stop_workers(worker_group, is_restart=True)
+        self._stop_workers(worker_group)
         worker_group.state = WorkerState.STOPPED
         self._initialize_workers(worker_group)

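Since the abstract signatures above are the override surface for custom agents, a minimal skeleton of a subclass under the new API might look like the following. This is a sketch only: it assumes these four hooks remain the primary abstract methods of `SimpleElasticAgent`, and a real agent must actually launch, monitor, and tear down worker processes.

```python
import signal
from typing import Any

from torch.distributed.elastic.agent.server.api import (
    RunResult,
    SimpleElasticAgent,
    WorkerGroup,
)


class SketchAgent(SimpleElasticAgent):
    """Skeleton only: shows the simplified hook signatures, not real process management."""

    def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:
        # Would launch processes and return a map of local_rank -> worker id.
        return {}

    def _stop_workers(self, worker_group: WorkerGroup) -> None:
        # No is_restart flag: restarts and final stops share one code path.
        pass

    def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
        # Would poll the launched processes and report their state.
        raise NotImplementedError

    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
        # Release local resources only; closing the rendezvous is no longer
        # decided inside this hook.
        pass
```

The design point is that restarts and final stops now flow through the same `_stop_workers` path; whether the rendezvous itself is closed is no longer a concern of these hooks.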
torch/distributed/elastic/agent/server/local_elastic_agent.py (12 changes: 3 additions & 9 deletions)

@@ -280,10 +280,8 @@ def _log_watchdog_event(
     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
     @prof
-    def _stop_workers(
-        self, worker_group: WorkerGroup, is_restart: bool = False
-    ) -> None:
-        self._shutdown(is_restart=is_restart)
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
+        self._shutdown()

     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
@@ -359,9 +357,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> dict[int, Any]:

         return self._pcontext.pids()

-    def _shutdown(
-        self, death_sig: signal.Signals = signal.SIGTERM, is_restart: bool = False
-    ) -> None:
+    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
         if self._worker_watchdog is not None:
             self._worker_watchdog.stop()
             self._worker_watchdog = None
@@ -370,8 +366,6 @@ def _shutdown(
             self._health_check_server = None
         if self._pcontext:
             self._pcontext.close(death_sig)
-        if not is_restart and self._rdzv_handler:
-            self._rdzv_handler.shutdown()

     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
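The behavioral intent is easiest to see in isolation. The following is a self-contained toy (not PyTorch code; `_ToyAgent` is invented) that mirrors the post-change `_shutdown` logic: local resources are released, while the rendezvous handler is left untouched, matching the title's intent that a leaving worker should not tear down the rendezvous for the remaining ones.

```python
from unittest import mock


class _ToyAgent:
    """Invented stand-in that mirrors the post-change _shutdown behavior."""

    def __init__(self, pcontext, rdzv_handler):
        self._pcontext = pcontext
        self._rdzv_handler = rdzv_handler

    def _shutdown(self, death_sig=None) -> None:
        if self._pcontext:
            self._pcontext.close(death_sig)
        # Note: no self._rdzv_handler.shutdown() call here anymore.


pcontext, rdzv = mock.Mock(), mock.Mock()
_ToyAgent(pcontext, rdzv)._shutdown()
assert pcontext.close.called      # local worker processes are closed
assert not rdzv.shutdown.called   # the rendezvous stays up for other nodes
```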