[wip][ca][ddp] traceable C++ reducer · pytorch/pytorch@5b669f2 · GitHub

Commit 5b669f2

[wip][ca][ddp] traceable C++ reducer
ghstack-source-id: c4086d2
Pull Request resolved: #153501
1 parent dd30f61 commit 5b669f2

File tree

6 files changed: +74 -11 lines changed


torch/_dynamo/compiled_autograd.py

Lines changed: 1 addition & 0 deletions
@@ -779,6 +779,7 @@ def pre_hook(self, inputs, hook_id):
         return inputs
 
     def post_hook(self, outputs, inputs, hook_id):
+        breakpoint()
         assert self.hooks_proxy is not None
         hook = self.hooks_proxy[hook_id]  # type: ignore[index]
         proxies = self.proxy_call_hook(

torch/csrc/autograd/function_hook.h

Lines changed: 8 additions & 0 deletions
@@ -42,6 +42,14 @@ struct TORCH_API FunctionPostHook {
         std::string("compiled_args nyi, see [Note: Compiled Autograd] ") +
         typeid(*this).name());
   }
+
+  virtual void apply_with_saved(
+      Variable& tensor,
+      torch::dynamo::autograd::SwapSavedVariables& saved) const {
+    throw std::runtime_error(
+        std::string("compiled_args nyi, see [Note: Compiled Autograd] ") +
+        typeid(*this).name());
+  }
 };
 
 struct TORCH_API PostAccumulateGradHook {
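
The new virtual gives every FunctionPostHook a compiled-autograd apply entry point next to compiled_args. A minimal sketch of a subclass that opts in; the hook type and the comment bodies here are hypothetical and not part of this commit:

#include <torch/csrc/autograd/function_hook.h>

// Hypothetical subclass: leaves gradients untouched in eager mode and
// overrides both compiled-autograd entry points so neither base-class
// "nyi" throw is hit while tracing.
struct TracedNoopPostHook : torch::autograd::FunctionPostHook {
  std::vector<torch::autograd::Variable> operator()(
      const std::vector<torch::autograd::Variable>& outputs,
      const std::vector<torch::autograd::Variable>& /*inputs*/) override {
    return outputs; // eager path: pass gradients through unchanged
  }

  void compiled_args(
      torch::dynamo::autograd::CompiledNodeArgs& /*args*/) const override {
    // collect any hook state that should specialize the compiled graph
  }

  void apply_with_saved(
      torch::autograd::Variable& /*tensor*/,
      torch::dynamo::autograd::SwapSavedVariables& /*saved*/) const override {
    // emit the traced equivalent of operator() via the saved-variable swapper
  }
};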

torch/csrc/autograd/utils/lambda_post_hook.h

Lines changed: 22 additions & 6 deletions
@@ -10,16 +10,23 @@ class LambdaPostHook : public torch::autograd::FunctionPostHook {
   using variable_list = std::vector<torch::autograd::Variable>;
   using fn_type =
       std::function<variable_list(const variable_list&, const variable_list&)>;
-  using compiled_fn_type = std::function<void(CompiledNodeArgs&)>;
+  using compiled_args_fn_type = std::function<void(CompiledNodeArgs&)>;
+  using compiled_apply_fn_type =
+      std::function<void(Variable&, SwapSavedVariables&)>;
 
  public:
   // The lambda function takes as arguments the outputs and inputs of the
   // autograd function and can modify the outputs of the autograd function by
   // returning a new output if needed.
   /* implicit */ LambdaPostHook(fn_type fn) : fn_(std::move(fn)) {}
 
-  LambdaPostHook(fn_type fn, compiled_fn_type compiled_fn)
-      : fn_(std::move(fn)), compiled_fn_(std::move(compiled_fn)) {}
+  LambdaPostHook(
+      fn_type fn,
+      compiled_args_fn_type compiled_args_fn,
+      compiled_apply_fn_type compiled_apply_fn)
+      : fn_(std::move(fn)),
+        compiled_args_fn_(std::move(compiled_args_fn)),
+        compiled_apply_fn_(std::move(compiled_apply_fn)) {}
 
   variable_list operator()(
       const variable_list& outputs,
@@ -28,15 +35,24 @@ class LambdaPostHook : public torch::autograd::FunctionPostHook {
   }
 
   void compiled_args(CompiledNodeArgs& args) const override {
-    if (compiled_fn_ != nullptr) {
-      return compiled_fn_(args);
+    if (compiled_args_fn_ != nullptr) {
+      return compiled_args_fn_(args);
     }
     return FunctionPostHook::compiled_args(args);
  }
 
+  void apply_with_saved(Variable& inputs, SwapSavedVariables& saved)
+      const override {
+    if (compiled_apply_fn_ != nullptr) {
+      return compiled_apply_fn_(inputs, saved);
+    }
+    return FunctionPostHook::apply_with_saved(inputs, saved);
+  }
+
  protected:
   std::function<variable_list(const variable_list&, const variable_list&)> fn_;
-  compiled_fn_type compiled_fn_{};
+  compiled_args_fn_type compiled_args_fn_{};
+  compiled_apply_fn_type compiled_apply_fn_{};
 };
 
 } // namespace torch::autograd::utils
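
With the split, a call site now passes both compiled callbacks when constructing the hook. A rough sketch of that shape; the node and the lambda bodies here are placeholders, and the reducer.cpp hunk below is the real in-tree user:

#include <memory>
#include <torch/csrc/autograd/function.h>
#include <torch/csrc/autograd/utils/lambda_post_hook.h>

using torch::autograd::variable_list;

// `node` stands in for whatever autograd node the hook is attached to,
// e.g. a gradient accumulator as in reducer.cpp below.
void register_sketch_hook(torch::autograd::Node& node) {
  node.add_post_hook(std::make_unique<torch::autograd::utils::LambdaPostHook>(
      /*fn=*/[](const variable_list& outputs, const variable_list& /*inputs*/) {
        return outputs; // eager post-hook body
      },
      /*compiled_args_fn=*/[](torch::dynamo::autograd::CompiledNodeArgs& args) {
        // fold anything that should specialize the compiled graph into the
        // cache key, e.g. args.collect(...)
      },
      /*compiled_apply_fn=*/[](torch::autograd::Variable& variable,
                               torch::dynamo::autograd::SwapSavedVariables& saved) {
        // emit the traced equivalent of the eager hook
      }));
}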

torch/csrc/distributed/c10d/reducer.cpp

Lines changed: 37 additions & 5 deletions
@@ -126,7 +126,7 @@ Reducer::Reducer(
       use_python_reducer_(use_python_reducer) {
   C10_LOG_API_USAGE_ONCE("torch.distributed.ddp.reducer");
   TORCH_INTERNAL_ASSERT(!params_.empty(), "Expected at least one parameter.");
-
+  std::cout << "hello from c++ reducer" << std::endl;
   if (ddp_debug_level_ != c10d::DebugLevel::Off) {
     LOG(INFO) << "Reducer initialized with bucket_bytes_cap: "
               << bucket_bytes_cap_
@@ -174,6 +174,7 @@ Reducer::Reducer(
   // can be marked as ready for reduction.
   {
     const auto variable_count = params_.size();
+    std::cout << "reducer found " << variable_count << " variables" << std::endl;
     grad_accumulators_.resize(variable_count);
     for (const auto variable_index : c10::irange(variable_count)) {
       auto& variable = params_[variable_index];
@@ -187,6 +188,7 @@ Reducer::Reducer(
       using torch::distributed::autograd::ThreadLocalDistAutogradContext;
 #endif
       // Hook to execute after the gradient accumulator has executed.
+      std::cout << "registering the post hook" << std::endl;
       hooks_.emplace_back(
           grad_accumulator->add_post_hook(std::make_unique<
                                           torch::autograd::utils::
@@ -201,12 +203,41 @@ Reducer::Reducer(
               this->autograd_hook(variable_index);
               return outputs;
             },
-            [this](torch::autograd::CompiledNodeArgs& args) {
-              TORCH_CHECK(
-                  this->use_python_reducer_,
-                  "Compiled autograd is not compatible with C++ DDP Reducer, please use torch._dynamo.config.optimize_ddp=\"python_reducer\".");
+            [this, variable_index](torch::autograd::CompiledNodeArgs& args) {
+              std::cout << "collecting the post hook on variable_index=" << variable_index << std::endl;
+              if (this->use_python_reducer_) {
+                return;
+              }
+
+              // filters out unsupported DDP arguments
+              auto str =
+                  "Compiled autograd is not compatible with C++ DDP Reducer, please use torch._dynamo.config.optimize_ddp=\"python_reducer\".";
+              TORCH_CHECK(!mixed_precision_param_dtype_.has_value(), str);
+              TORCH_CHECK(!find_unused_parameters_, str);
+              TORCH_CHECK(ddp_debug_level_ == c10d::DebugLevel::Off, str);
+              TORCH_CHECK(rpc_context_.context_ptr.load() == nullptr, str);
+              if (static_graph_) {
+                TORCH_WARN_ONCE(
+                    "static_graph ignored, compiled autograd always rebuilds buckets when param ready order changes.");
+              }
+
+              // Attempt to trace C++ Reducer
+              args.collect(variable_index);
+              // args.cpp_post_hook();
+              // at::Tensor& param = get_param_from_index(variable_index);
+            },
+            [this, variable_index](
+                torch::autograd::Variable& variable,
+                torch::autograd::SwapSavedVariables& saved) {
+              // update bucketing state in tracker
+              // saved.compiler_call.update_reducer_state
+              // issue bucketing op with the correct tensors
+              // pycompiler.call_ddp_autograd_hook(bucket: List[Tensor])
+              // then bucket and issue collective
+              return;
             })),
          grad_accumulator);
+      std::cout << "registered post hook on " << &(*grad_accumulator) << std::endl;
 
       // Map raw function pointer to parameter index.
       // This is used later on when the autograd graph is traversed
@@ -2401,6 +2432,7 @@ void verify_params_across_processes(
 }
 
 void Reducer::remove_autograd_hooks() {
+  std::cout << "===========================REMOVING AUTOGRAD HOOKS======================" << std::endl;
   // Remove all hooks on variables registered by this Reducer. This is necessary
   // to make DDP failure recoverable. Otherwise, multiple Reducer instances
   // (from recoveries) will add their hooks to the original model, and those
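
The TODO comments in the third (compiled apply) lambda sketch the eventual plan: track which parameters have become ready, group their gradients into buckets, and issue one collective per full bucket from the traced graph. Purely as an illustration of that bucketing shape; none of these names exist in this commit, the capacity is a simple element count rather than DDP's byte-based bucket_bytes_cap_, and the traced version would emit these steps through compiled autograd instead of running them eagerly:

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

#include <ATen/core/Tensor.h>

// Hypothetical helper: accumulate ready gradients and flush a collective
// once the bucket is full.
struct ReadyBucket {
  std::vector<at::Tensor> grads;
  std::size_t capacity{25};

  // Returns true if the bucket filled up and a collective was launched.
  bool on_grad_ready(
      at::Tensor grad,
      const std::function<void(std::vector<at::Tensor>&)>& launch_allreduce) {
    grads.push_back(std::move(grad));
    if (grads.size() < capacity) {
      return false;
    }
    launch_allreduce(grads); // e.g. an allreduce over the bucket's tensors
    grads.clear();
    return true;
  }
};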

torch/csrc/dynamo/compiled_autograd.h

Lines changed: 4 additions & 0 deletions
@@ -367,6 +367,8 @@ struct AutogradCompilerCall {
   // pynode -> backward and backward state idx
   std::unordered_map<const Node*, std::pair<size_t, std::optional<size_t>>>
       pynode_objs;
+  // C++ reducer state
+
 };
 
 class CompiledNodeArgs {
@@ -611,6 +613,7 @@
 #undef COLLECT_AS_BYTES
 
   void collect_hooks_from(Node* fn) {
+    std::cout << "collecting hooks from " << fn->name() << "(" << fn << ")" << std::endl;
     for (auto& i : fn->tensor_pre_hooks()) {
       i->compiled_args(*this);
     }
@@ -621,6 +624,7 @@
       i->compiled_args(*this);
     }
     for (auto& i : fn->post_hooks()) {
+      std::cout << "found post hook" << std::endl;
       i->compiled_args(*this);
     }
     collect_size(_node_call.tensor_pre_hooks.size());

torch/csrc/dynamo/python_compiled_autograd.cpp

Lines changed: 2 additions & 0 deletions
@@ -1053,6 +1053,7 @@ static CacheNode* _compiled_autograd_impl(
     }
 
     SwapSavedVariables saved(compiler_call, state, py_compiler.get(), call);
+    std::cout << call.node->name() << " has post hooks before apply_with_saved? " << (!call.post_hooks.empty()) << std::endl;
     variable_list outputs = call.node->apply_with_saved(inputs, saved);
     saved.debug_asserts();
     saved.before(call.node->next_edges());
@@ -1104,6 +1105,7 @@
     saved.after(call.node->next_edges());
     saved.debug_asserts();
 
+    std::cout << call.node->name() << " has post hooks after apply_with_saved? " << (!call.post_hooks.empty()) << std::endl;
     if (!call.post_hooks.empty()) {
       THPObjectPtr pyinputs(THPVariable_WrapList(inputs));
       THPObjectPtr pyoutputs(THPVariable_WrapList(outputs));

0 commit comments
