Add send and recv backward functions for builtin operators RPC. (#25527) · thiagocrepaldi/pytorch@73173fd

Commit 73173fd

pritamdamania authored and Thiago Crepaldi committed
Add send and recv backward functions for builtin operators RPC. (pytorch#25527)
Summary: Pull Request resolved: pytorch#25527

Master GH issue: pytorch#23110.

This change builds upon pytorch#24876 and provides all the autograd hooks needed for a forward pass with distributed rpc for builtin operators. It does not address distributed rpc for python UDFs; that will be addressed in follow-up PRs.

Summary of changes:
1. Attach send autograd functions when a request is sent from the client and a response is sent from the server.
2. Attach receive autograd functions when a request is received on the server and a response is received on the client.
3. Generate a globally unique autograd_message_id for each send/recv autograd function pair to uniquely identify them.

ghstack-source-id: 91240466

Test Plan: unit tests.

Differential Revision: D17148077

fbshipit-source-id: 192d8a3f552ed7cc939f55dcca332965c9bd3233
1 parent 7d9e5b5 commit 73173fd
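
The end-to-end effect of these hooks is easiest to see from the Python test added in this commit (test/dist_autograd_test.py below). As a rough sketch, assuming the distributed RPC APIs of this branch (dist.rpc_sync, dist.init_model_parallel, and the private dist_autograd context helpers); the function name is only for illustration:

import torch
import torch.distributed as dist
import torch.distributed.autograd as dist_autograd

def forward_over_rpc(dst_rank):
    # Assumes init_process_group()/init_model_parallel() have already been
    # called for this worker, as the dist_init decorator below does.
    with dist_autograd.context() as context_id:
        t1 = torch.ones(3, 3, requires_grad=True)
        t2 = torch.zeros(3, 3, requires_grad=True)

        # Sending the request attaches a SendRpcBackward function to t1/t2 on
        # this worker; receiving the response attaches a RecvRpcBackward
        # function that becomes the grad_fn of the returned tensor.
        ret = dist.rpc_sync('worker{}'.format(dst_rank), torch.add, args=(t1, t2))

        ctx = dist_autograd._current_context()
        send_functions = ctx._send_functions()  # keyed by autograd message id
        recv_functions = ctx._recv_functions()  # keyed by autograd message id
        assert len(send_functions) == 1 and len(recv_functions) == 1
        assert ret.grad_fn == list(recv_functions.values())[0]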


58 files changed: +1508, −496 lines (only a subset of the changed files is shown below).

caffe2/CMakeLists.txt

Lines changed: 10 additions & 1 deletion
@@ -481,18 +481,27 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
   if (NOT INTERN_BUILD_MOBILE)
     list(APPEND TORCH_SRCS
       ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/autograd/context/dist_autograd_container.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/autograd/context/dist_autograd_context.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/autograd/functions/recvrpc_backward.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/autograd/functions/sendrpc_backward.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/autograd/utils.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/rpc/future_message.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/rpc/message.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_udf_call.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_udf_resp.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/rpc/request_callback.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/rpc/rpc_with_autograd.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_call.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_remote_call.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_rref_proto.cpp
-      ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_ret.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_resp.cpp
+      ${TORCH_SRC_DIR}/csrc/distributed/rpc/utils.cpp
       ${TORCH_SRC_DIR}/csrc/jit/export.cpp
       ${TORCH_SRC_DIR}/csrc/jit/import_legacy.cpp
       ${TORCH_SRC_DIR}/csrc/jit/netdef_converter.cpp
       ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp
+      ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
     )
   endif()

test/cpp/dist_autograd/test_dist_autograd.cpp

Lines changed: 28 additions & 6 deletions
@@ -1,20 +1,39 @@
 #include <gtest/gtest.h>
 
 #include <ATen/ATen.h>
+#include <torch/csrc/distributed/autograd/context/dist_autograd_container.h>
+#include <torch/csrc/distributed/autograd/context/dist_autograd_context.h>
 #include <torch/csrc/distributed/autograd/utils.h>
+#include <torch/csrc/distributed/rpc/rpc_with_autograd.h>
 #include <torch/torch.h>
 
-TEST(DistAutogradTest, TestSendFunction) {
+using namespace torch::distributed::autograd;
+using namespace torch::distributed::rpc;
+
+class DistAutogradTest : public ::testing::Test {
+ protected:
+  static void SetUpTestCase() {
+    autogradContainer_ = &DistAutogradContainer::init(0);
+  }
+  static DistAutogradContainer* autogradContainer_;
+};
+
+DistAutogradContainer* DistAutogradTest::autogradContainer_ = nullptr;
+
+TEST_F(DistAutogradTest, TestSendFunction) {
   // Initialize input tensors requiring grad.
   auto options = at::TensorOptions().requires_grad(true);
   auto in1 = torch::ones({3, 3}, options);
   auto in2 = torch::ones({3, 3}, options);
   ASSERT_FALSE(in1.grad().defined());
   ASSERT_FALSE(in2.grad().defined());
 
+  autogradContainer_->newContext();
+  DistAutogradContext& autogradContext = autogradContainer_->currentContext();
   // Attach the send autograd function to tensors.
-  auto send_function =
-      torch::distributed::autograd::addSendRpcBackward({in1, in2});
+  std::vector<torch::Tensor> tensors = {in1, in2};
+  addSendRpcBackward(autogradContext, AutogradMetadata(1, 1), tensors);
+  auto send_function = autogradContext.sendFunctions()[1];
   ASSERT_NE(send_function, nullptr);
 
   // Build loss and attach it as input to send autograd function.
@@ -33,14 +52,17 @@ TEST(DistAutogradTest, TestSendFunction) {
   ASSERT_TRUE(in2.grad().defined());
 }
 
-TEST(DistAutogradTest, TestSendFunctionInvalidInputs) {
+TEST_F(DistAutogradTest, TestSendFunctionInvalidInputs) {
   auto options = at::TensorOptions().requires_grad(true);
   auto in1 = torch::ones({3, 3}, options);
   auto in2 = torch::ones({3, 3}, options);
 
+  autogradContainer_->newContext();
+  DistAutogradContext& autogradContext = autogradContainer_->currentContext();
   // Attach the send autograd function to tensors.
-  auto send_function =
-      torch::distributed::autograd::addSendRpcBackward({in1, in2});
+  std::vector<torch::Tensor> tensors = {in1, in2};
+  addSendRpcBackward(autogradContext, AutogradMetadata(1, 1), tensors);
+  auto send_function = autogradContext.sendFunctions()[1];
 
   // Build loss and attach it as input to send autograd function.
   auto loss = torch::autograd::Variable(torch::ones({3, 3}));

test/dist_autograd_test.py

Lines changed: 50 additions & 27 deletions
@@ -1,35 +1,20 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import sys
 import torch.distributed as dist
 import torch.distributed.autograd as dist_autograd
-from functools import wraps
+from dist_utils import dist_init
 import six
 import unittest
 import torch
+import time
 
-if not dist.is_available():
-    print("c10d not available, skipping tests")
-    sys.exit(0)
-
-def dist_init(func):
-    """
-    We use this decorator for setting up and tearing down state since
-    MultiProcessTestCase runs each `test*` method in a separate process and
-    each process just runs the `test*` method without actually calling
-    'setUp' and 'tearDown' methods of unittest.
-    """
-    @wraps(func)
-    def wrapper(self):
-        self.worker_id = self.rank
-        store = dist.FileStore(self.file_name, self.world_size)
-        dist.init_process_group(backend='gloo', rank=self.rank,
-                                world_size=self.world_size, store=store)
-        dist.init_model_parallel('worker%d' % self.rank)
-        func(self)
-        dist.join_rpc()
-
-    return wrapper
+prev_rank_rpc_done = False
+prev_rank_context_id = 0
+def _set_rpc_done(context_id):
+    global prev_rank_rpc_done
+    global prev_rank_context_id
+    prev_rank_rpc_done = True
+    prev_rank_context_id = context_id
 
 @unittest.skipIf(not six.PY3, "Pytorch distributed autograd package "
                  "does not support python2")
@@ -41,6 +26,10 @@ def world_size(self):
 
     @dist_init
     def test_autograd_context(self):
+        # Verify max possible id.
+        max_auto_increment = 281474976710655
+        self.assertEqual(max_auto_increment + (self.worker_id << 48), dist_autograd._get_max_id())
+
         context_ids = []
         for i in range(1000):
             with dist_autograd.context() as context_id:
@@ -54,12 +43,13 @@ def test_autograd_context(self):
             dist_autograd._retrieve_context(context_id)
 
     @dist_init
-    def test_autograd_send_function(self):
+    def test_autograd_functions(self):
         dst_rank = (self.rank + 1) % self.world_size
         with dist_autograd.context() as context_id:
             t1 = torch.ones(3, 3, requires_grad=True)
             t2 = torch.zeros(3, 3, requires_grad=True)
             ret = dist.rpc_sync('worker{}'.format(dst_rank), torch.add, args=(t1, t2))
+            dist.rpc_sync('worker{}'.format(dst_rank), _set_rpc_done, args=(context_id,))
 
             # Get send function.
             ctx = dist_autograd._current_context()
@@ -68,7 +58,7 @@ def test_autograd_send_function(self):
             self.assertEqual(1, len(send_functions))
 
             # Retrieve the next functions in the graph.
-            next_funcs = send_functions[0].next_functions
+            next_funcs = list(send_functions.values())[0].next_functions
             self.assertEqual(2, len(next_funcs))
 
             # We should now hit t1 and t2 in the autograd graph.
@@ -79,6 +69,39 @@ def test_autograd_send_function(self):
             self.assertEqual(t2, next_funcs[1][0].variable)
             self.assertEqual(0, next_funcs[1][1])
 
+            # Test recv functions.
+            recv_functions = ctx._recv_functions()
+            self.assertEqual(1, len(recv_functions))
+            self.assertEqual(ret.grad_fn, list(recv_functions.values())[0])
+
+            # We should have send/recv functions from the previous rank, get all
+            # contexts in this node to find them.
+
+            # Wait for the prev rank to be done with rpc.
+            while not prev_rank_rpc_done:
+                time.sleep(0.1)
+                pass
+
+            # Now verify the autograd graph.
+            ctx = dist_autograd._retrieve_context(prev_rank_context_id)
+
+            # Get the send function.
+            send_functions = ctx._send_functions()
+            self.assertEqual(1, len(send_functions))
+
+            # Verify next function is AddBackward0
+            next_funcs = list(send_functions.values())[0].next_functions
+            self.assertEqual(1, len(next_funcs))
+            add_backward_fn = next_funcs[0][0]
+            self.assertEqual('AddBackward0', add_backward_fn.name())
+
+            # Verify the next two functions are the same recv backward function.
+            next_funcs = add_backward_fn.next_functions
+            self.assertEqual(2, len(next_funcs))
+            self.assertEqual('torch::distributed::autograd::RecvRpcBackward', next_funcs[0][0].name())
+            self.assertEqual('torch::distributed::autograd::RecvRpcBackward', next_funcs[1][0].name())
+            self.assertEqual(next_funcs[0][0], next_funcs[1][0])
+
         # autograd context should be cleaned up by now.
         with self.assertRaises(RuntimeError):
             ctx = dist_autograd._retrieve_context(context_id)
@@ -99,7 +122,7 @@ def test_rpc_complex_args(self):
             self.assertEqual(torch.stack(tensors), ret)
 
             # Verify appropriate tensors have been attached the autograd graph.
-            next_funcs = dist_autograd._current_context()._send_functions()[0].next_functions
+            next_funcs = list(dist_autograd._current_context()._send_functions().values())[0].next_functions
             idx = 0
             for i in range(num_tensors):
                 if i % 2 == 0:
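
A note on the globally unique ids from point 3 of the commit summary: the max-id check added in test_autograd_context above implies that each id packs the worker id into the bits above a 48-bit auto-increment counter (281474976710655 is 2^48 - 1). A minimal sketch of that layout; the helper below is illustrative, not a PyTorch API:

MAX_AUTO_INCREMENT = (1 << 48) - 1  # 281474976710655, as asserted in the test

def _make_unique_id(worker_id, auto_increment):
    # Hypothetical helper mirroring the layout implied by the test: the high
    # bits carry the worker id, the low 48 bits carry a per-worker counter.
    assert 0 <= auto_increment <= MAX_AUTO_INCREMENT
    return (worker_id << 48) | auto_increment

# The largest id worker 3 can produce matches what _get_max_id() reports:
assert _make_unique_id(3, MAX_AUTO_INCREMENT) == MAX_AUTO_INCREMENT + (3 << 48)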

test/dist_utils.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from os import getenv
+from functools import wraps
+import torch.distributed as dist
+from torch.distributed.rpc_api import RpcBackend
+
+if not dist.is_available():
+    print("c10d not available, skipping tests")
+    sys.exit(0)
+
+
+BACKEND = getenv('RPC_BACKEND', RpcBackend.PROCESS_GROUP)
+RPC_INIT_URL = getenv('RPC_INIT_URL', '')
+
+def dist_init(func):
+    """
+    We use this decorator for setting up and tearing down state since
+    MultiProcessTestCase runs each `test*` method in a separate process and
+    each process just runs the `test*` method without actually calling
+    'setUp' and 'tearDown' methods of unittest.
+    """
+    @wraps(func)
+    def wrapper(self):
+        self.worker_id = self.rank
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(backend='gloo', rank=self.rank,
+                                world_size=self.world_size, store=store)
+        dist.init_model_parallel(self_name='worker%d' % self.rank,
+                                 backend=BACKEND,
+                                 self_rank=self.rank,
+                                 init_method=RPC_INIT_URL)
+        func(self)
+        dist.join_rpc()
+
+    return wrapper
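
For completeness, a hypothetical sketch of how a test consumes this decorator, mirroring the usage in test/dist_autograd_test.py above. The test class, rank/world_size/file_name values, and test name here are illustrative only; the real tests inherit from MultiProcessTestCase, which supplies self.rank, self.file_name and self.world_size and forks one process per rank:

import unittest

from dist_utils import dist_init


class ExampleDistTest(unittest.TestCase):
    # Normally provided by MultiProcessTestCase; hard-coded for illustration.
    rank = 0
    world_size = 1
    file_name = '/tmp/example_filestore'

    @dist_init
    def test_example(self):
        # By the time this body runs, the decorator has initialized the
        # process group and the RPC layer for 'worker0'; dist.join_rpc()
        # runs after the body returns.
        self.assertEqual(0, self.worker_id)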
