Update on "sync and async torch.distributed.rpc for builtin operators" · pytorch/pytorch@c0d4b14 · GitHub
Commit c0d4b14

Update on "sync and async torch.distributed.rpc for builtin operators"
Features:
* sync and async RPC for builtin operators
* RpcAgent API
* ProcessGroupAgent implementation

Goal: This is the first PR for #23110, and there will be many follow-up ones, so let's focus on the overall API and code structure here. Details such as efficiency and error handling can be improved in future PRs.
* Have a minimum working and testable RPC implementation.
* Make sure the RpcAgent API is sufficient for the future ThriftAgent and TensorPipeAgent implementations.
  * A TensorPipe implementation might allocate multiple underlying communication channels of different types, and might also use streaming serialization/deserialization for large tensors. To support this, the current implementation only converts a BuiltinOp into a Message, which contains a byte vector and a tensor table; it is up to the RpcAgent implementation to decide how to serialize a Message object.
  * For ThriftAgent, since Thrift has its own request/response matching solution, the Message.id is no longer necessary and can be dropped during serialization. All it needs to do is pass the response Message object to the Future returned by send(...).
* Support blocking and non-blocking RequestCallback.
  * Blocking means the callback does not return before sending out the response.
  * Non-blocking can be achieved by enqueueing the `(from, request, RpcAgent&)` tuple and processing it on a different thread; that is why there is an `RpcAgent&` arg in the param list (a sketch of this pattern follows below).

Differential Revision: [D15194693](https://our.internmc.facebook.com/intern/diff/D15194693/)
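The non-blocking callback flow described above can be illustrated with a minimal, self-contained Python sketch. This is not the PR's C++ implementation: the names (ToyAgent, run_builtin_op, worker_loop) are hypothetical stand-ins, and the point is only that the callback returns right after enqueueing (from, request, agent), while a separate thread produces the response and sends it back through the agent reference.

import queue
import threading

class ToyAgent:
    # Stands in for the RpcAgent& reference: it can send a response back to a peer.
    def send(self, to, message):
        print('response to %s: %s' % (to, message))

def run_builtin_op(request):
    # Placeholder for "deserialize the Message and run the builtin operator".
    op, args = request
    return op(*args)

work_queue = queue.Queue()

def non_blocking_callback(from_worker, request, agent):
    # Non-blocking: return immediately after enqueueing; the response is
    # produced later by the worker thread, which is why the agent reference
    # must travel with the request.
    work_queue.put((from_worker, request, agent))

def worker_loop():
    while True:
        item = work_queue.get()
        if item is None:  # sentinel used to stop the worker
            return
        from_worker, request, agent = item
        agent.send(from_worker, run_builtin_op(request))

worker = threading.Thread(target=worker_loop)
worker.start()

non_blocking_callback('worker0', (sum, ([1, 2, 3],)), ToyAgent())  # returns immediately
work_queue.put(None)
worker.join()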
1 parent 97da154 commit c0d4b14

File tree

6 files changed: +43 -4 lines changed


test/test_rpc.py

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,8 @@ def wrapper(self):
                                 world_size=self.world_size, store=store)
         dist.init_rpc('worker%d' % self.rank)
         func(self)
+        dist.destroy_rpc()
+        dist.destroy_process_group(dist.group.WORLD)
 
     return wrapper
 

torch/csrc/distributed/rpc/ProcessGroupAgent.cpp

Lines changed: 10 additions & 2 deletions
@@ -78,14 +78,20 @@ ProcessGroupAgent::ProcessGroupAgent(
 }
 
 ProcessGroupAgent::~ProcessGroupAgent() {
-  //TORCH_CHECK(stop_, "Cannot destroy ProcessGroupAgent before shutdown.");
+  if (!stop_) {
+    AT_ERROR(stop_, "Must call ProcessGroupAgent::shutdown before destructor");
+  }
+}
+
+void ProcessGroupAgent::shutdown() {
   // Every process i sends a SHUTDOWN message to process i + 1. This is
   // necessary for now because:
   // 1. There is no abort API for ProcessGroup::recvAnysource yet. We have to
   //    feed it a message or kill the thread.
   // 2. A GLOO process cannot send message to itself. (there is an ongoing
   //    effort to fix this problem).
+  pg_->barrier()->wait();
   int dst = (pg_->getRank() + 1) % pg_->getSize();
   enqueue(SendWork(dst, Message({}, {}, MessageType::SHUTDOWN)));
   std::unique_lock<std::mutex> lock(sendQueueMutex_);
@@ -96,8 +102,10 @@ ProcessGroupAgent::~ProcessGroupAgent() {
   workProduceCV_.notify_all();
   sendThread_.join();
   listenerThread_.join();
+  pg_->barrier()->wait();
 }
 
+
 std::shared_ptr<FutureMessage> ProcessGroupAgent::send(
     const std::string& to, Message&& message) {
@@ -170,7 +178,7 @@ void ProcessGroupAgent::sendLoop() {
 }
 
 void ProcessGroupAgent::listenLoop() {
-  while (!stop_) {
+  while (true) {
     // rank, tensor size
     std::vector<torch::Tensor> preamble = {torch::empty({2}, {torch::kInt64})};
     pg_->recvAnysource(preamble, pg_->getRank())->wait();
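The shutdown sequence above (barrier, send SHUTDOWN to the next rank, join the send and listen threads, barrier) exists to unblock every rank's recvAnysource call. Below is a toy single-process Python model of that ring handoff using in-process queues instead of a ProcessGroup; unlike GLOO, these queues can deliver a message to their own rank, so the world_size == 1 restriction noted in RpcAgent.h does not show up here.

import queue
import threading

WORLD_SIZE = 4
SHUTDOWN = 'SHUTDOWN'

# One inbox per rank; stands in for the peer-to-peer recv of the ProcessGroup.
inboxes = [queue.Queue() for _ in range(WORLD_SIZE)]

def listen_loop(rank):
    while True:
        msg = inboxes[rank].get()  # stands in for the blocking recvAnysource
        if msg == SHUTDOWN:
            print('rank %d: listener exiting' % rank)
            return
        print('rank %d: got %s' % (rank, msg))

listeners = [threading.Thread(target=listen_loop, args=(r,)) for r in range(WORLD_SIZE)]
for t in listeners:
    t.start()

# Every rank i sends SHUTDOWN to rank (i + 1) % WORLD_SIZE, so each listener
# receives exactly one token and unblocks.
for rank in range(WORLD_SIZE):
    inboxes[(rank + 1) % WORLD_SIZE].put(SHUTDOWN)

for t in listeners:
    t.join()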

torch/csrc/distributed/rpc/ProcessGroupAgent.h

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,8 @@ class ProcessGroupAgent : public RpcAgent {
   std::shared_ptr<FutureMessage> send(
       const std::string& to, Message&& message) override;
 
+  void shutdown() override;
+
  private:
   // put SendWork into a queue and notify the sendLoop thread
   void enqueue(SendWork work);

torch/csrc/distributed/rpc/RpcAgent.h

Lines changed: 11 additions & 0 deletions
@@ -47,6 +47,17 @@ class RpcAgent {
   virtual std::shared_ptr<FutureMessage> send(
       const std::string& to, Message&& message) = 0;
 
+  // This is a temporary solution to gracefully stop the listening loop.
+  // ProcessGroupAgent does this by sending a SHUTDOWN message to the
+  // (rank + 1) % world_size peer, which means we cannot create
+  // ProcessGroupAgent with world_size == 1. We can drop this in the future when
+  // we find a way to gracefully exit the blocking recvAnysource call.
+  //
+  // FIXME: putting its implementation in destructor sometimes causes
+  // "Connection reset by peer" error. It seems somehow ProcessGroup object get
+  // destructed before RpcAgent object?
+  virtual void shutdown() = 0;
+
  protected:
   const std::string workerName_;
   const RequestCallback cb_;

torch/csrc/distributed/rpc/init.cpp

Lines changed: 8 additions & 2 deletions
@@ -29,7 +29,10 @@ PyObject* rpc_init(PyObject* /* unused */) {
 
   auto module = py::handle(dist_module).cast<py::module>();
 
-  auto rpcAgent = shared_ptr_class_<RpcAgent>(module, "RpcAgent");
+  auto rpcAgent = shared_ptr_class_<RpcAgent>(module, "RpcAgent")
+      .def("shutdown",
+           &RpcAgent::shutdown,
+           py::call_guard<py::gil_scoped_release>());
 
   auto futureMessage = shared_ptr_class_<FutureMessage>(module, "FutureMessage")
       .def("wait",
@@ -43,7 +46,10 @@ PyObject* rpc_init(PyObject* /* unused */) {
       module, "ProcessGroupAgent", rpcAgent)
       .def(py::init<std::string,
                     std::unordered_map<std::string, int>,
-                    std::shared_ptr<::c10d::ProcessGroup>>());
+                    std::shared_ptr<::c10d::ProcessGroup>>())
+      .def("shutdown",
+           &ProcessGroupAgent::shutdown,
+           py::call_guard<py::gil_scoped_release>());
 
   module.def("invoke_rpc", [](
       RpcAgent& agent,

torch/distributed/rpc.py

Lines changed: 10 additions & 0 deletions
@@ -32,6 +32,16 @@ def _collect_worker_names(name, group):
     return names
 
 
+def destroy_rpc():
+    r"""
+    Destroy the local RPC agent. This is blocking until globally all RPC agents
+    are destroyed.
+    """
+    global _agent
+    _agent.shutdown()
+    _agent = None
+
+
 def init_rpc(name, backend='pg'):
     r"""
     Initialize the local RPC agent which immediately becomes ready to make and
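Taken together with the test change above, the intended usage of the new Python entry points looks roughly like the sketch below. It is a hedged example, not part of this diff: the backend and store arguments to init_process_group are assumptions, and running it for real requires one such process per worker.

import torch.distributed as dist

def run_worker(rank, world_size, store):
    # The process group must exist first, since ProcessGroupAgent is built on it.
    dist.init_process_group(
        backend='gloo', rank=rank, world_size=world_size, store=store)
    dist.init_rpc('worker%d' % rank)  # creates and registers the local RpcAgent

    # ... issue sync/async RPCs to peer workers here ...

    # Tear down in reverse order: destroy_rpc() blocks until all agents have
    # shut down, then the process group itself is destroyed.
    dist.destroy_rpc()
    dist.destroy_process_group(dist.group.WORLD)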

0 commit comments
