diff --git a/BUILD.bazel b/BUILD.bazel
index 6b73b899f19f94..d15c72c787eb18 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -511,6 +511,7 @@ filegroup(
         "caffe2/core/db.cc",
         "caffe2/core/event.cc",
         "caffe2/core/export_c10_op_to_caffe2.cc",
+        "caffe2/core/export_caffe2_op_to_c10.cc",
         "caffe2/core/graph.cc",
         "caffe2/core/init.cc",
         "caffe2/core/init_denormals.cc",
diff --git a/caffe2/core/export_caffe2_op_to_c10.cc b/caffe2/core/export_caffe2_op_to_c10.cc
new file mode 100644
index 00000000000000..dd260f29f7dd16
--- /dev/null
+++ b/caffe2/core/export_caffe2_op_to_c10.cc
@@ -0,0 +1,148 @@
+#define TORCH_ASSERT_NO_OPERATORS
+#include <caffe2/core/export_caffe2_op_to_c10.h>
+#undef TORCH_ASSERT_NO_OPERATORS
+
+#if defined(EXPOSE_C2_OPS) || \
+    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
+
+#include
+#include
+#include
+#include
+
+namespace caffe2 {
+namespace detail {
+
+// We only manipulate the stack inside of this function, so the header
+// does not depend on Tensor or IValue at all.
+void call_caffe2_op_from_c10(
+    const OperatorHandle& opHandle,
+    c10::Stack* stack,
+    _CallCaffe2OpFunc* call_op) {
+  // precondition: on the stack, there's one IValue for each argument of the
+  // c10 schema. The last argument is an optional tensor list that
+  // (if not ivalue::None) contains a preallocated output tensor for each
+  // operator output.
+
+  // As an invariant, we don't want any autograd gradients to be tracked in
+  // Caffe2 operators.
+  at::NoGradGuard guard;
+
+  const auto& schema = opHandle.schema();
+  AT_ASSERT(
+      schema.arguments().size() != 0 &&
+      schema.arguments().back().type()->isSubtypeOf(
+          *OptionalType::create(ListType::ofTensors())));
+  IValue preallocated_outputs = torch::jit::pop(*stack);
+
+  c10::List<at::Tensor> outputs;
+  if (preallocated_outputs.isNone()) {
+    // either the schema doesn't support preallocated outputs or it does but
+    // they haven't been passed in. Pass a list of uninitialized tensors to
+    // the caffe2 operator as preallocated outputs.
+    outputs.resize(schema.returns().size());
+  } else {
+    AT_ASSERT(preallocated_outputs.isTensorList());
+    outputs = std::move(preallocated_outputs).toTensorList();
+  }
+
+  // -1 because the last argument is the list of preallocated tensors
+  const size_t num_inputs = schema.arguments().size() - 1;
+
+  // TODO Avoid vector allocation. One idea would be to keep the std::vector
+  // instances in the cache.
+  std::vector<IValue> inputs = torch::jit::pop(*stack, num_inputs);
+
+  // Convert outputs to caffe2::Tensor
+  const size_t num_outputs = outputs.size();
+  std::vector<caffe2::Tensor> outputs_c2(num_outputs);
+  for (auto i : c10::irange(num_outputs)) {
+    outputs_c2[i] = caffe2::Tensor(outputs.extract(i));
+  }
+
+  outputs_c2 = (*call_op)(schema, inputs, std::move(outputs_c2));
+  TORCH_INTERNAL_ASSERT(num_outputs == outputs_c2.size());
+
+
+  bool return_tensor_list = false;
+  if (schema.returns().size() == 1) {
+    auto type = schema.returns()[0].type();
+    if (c10::ListTypePtr list_type = type->cast<ListType>()) {
+      if (list_type->getElementType()->kind() == c10::TypeKind::TensorType) {
+        return_tensor_list = true;
+      }
+    }
+  }
+  if (return_tensor_list) {
+    for (const auto i : c10::irange(num_outputs)) {
+      outputs.set(i, at::Tensor(std::move(outputs_c2[i])));
+    }
+    torch::jit::push(*stack, outputs);
+  } else {
+    for (const auto i : c10::irange(num_outputs)) {
+      torch::jit::push(*stack, at::Tensor(std::move(outputs_c2[i])));
+    }
+  }
+
+  // postcondition: All inputs are cleared from the stack, there's now one
+  // IValue for each output which holds the result. This
+  // might reuse one of the preallocated tensors but doesn't have
+  // to.
+}
+
+static FunctionSchema make_function_schema_for_c10(
+    const char* schema_str,
+    c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
+#if !defined(EXPOSE_C2_OPS) && \
+    (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE))
+  throw std::logic_error(
+      "We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
+#else
+  c10::FunctionSchema parsed_schema = torch::jit::parseSchema(schema_str);
+  std::vector<c10::Argument> arguments = parsed_schema.arguments();
+  arguments.emplace_back(
+      PREALLOCATED_OUTPUT_ARGNAME,
+      c10::OptionalType::create(c10::ListType::ofTensors()),
+      nullopt,
+      IValue());
+
+  auto schema = FunctionSchema(
+      parsed_schema.name(),
+      parsed_schema.overload_name(),
+      std::move(arguments),
+      parsed_schema.returns(),
+      parsed_schema.is_vararg(),
+      parsed_schema.is_varret());
+  if (optional_alias_analysis_kind) {
+    schema.setAliasAnalysis(*optional_alias_analysis_kind);
+  }
+  return schema;
+#endif
+}
+
+template <c10::DispatchKey key>
+RegisterDefinition<key>::RegisterDefinition(const char *name, c10::BoxedKernel kernel) {
+  if (c10::impl::dispatch_key_allowlist_check(key)) {
+    static torch::Library m(
+        torch::Library::IMPL, "_caffe2", key, __FILE__, __LINE__);
+    m.impl(name, torch::CppFunction::makeFromBoxedKernel(std::move(kernel)));
+  }
+}
+
+template struct RegisterDefinition<c10::DispatchKey::CPU>;
+template struct RegisterDefinition<c10::DispatchKey::CUDA>;
+template struct RegisterDefinition<c10::DispatchKey::HIP>;
+
+RegisterSchema::RegisterSchema(
+    const char *schema_str,
+    c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
+  static torch::Library m(
+      torch::Library::FRAGMENT, "_caffe2", c10::nullopt,
+      __FILE__, __LINE__);
+  m.def(make_function_schema_for_c10(schema_str, optional_alias_analysis_kind));
+}
+
+} // namespace detail
+} // namespace caffe2
+
+#endif
diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h
index 0dfd5e0366ddad..f02136b9e18945 100644
--- a/caffe2/core/export_caffe2_op_to_c10.h
+++ b/caffe2/core/export_caffe2_op_to_c10.h
@@ -5,16 +5,17 @@
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
 #include
 #include
 
+namespace c10 {
+struct FunctionSchema;
+struct IValue;
+class OperatorHandle;
+using Stack = std::vector<IValue>;
+}
+
 namespace caffe2 {
 namespace detail {
 
@@ -36,238 +37,75 @@ inline std::vector<caffe2::Tensor> _call_caffe2_op(
   return std::move(op).move_output_tensors();
 }
 
-// This function is inline in the hope that compilers optimizing for speed will
-// inline it into call_caffe2_op_from_c10, allowing call_op to be inlined and
-// avoiding the function pointer indirection, while compilers optimizing for
-// binary size will keep it a separate function instead of inlining it into
-// a template and will reuse the binary code of this function between ops.
-// We measured and confirmed that binary size off the instagram ios app is
-// reduced when having _call_caffe2_op_from_c10 separate from the templated
-// call_caffe2_op_from_c10.
-inline void _call_caffe2_op_from_c10(
-    c10::Stack* stack,
-    const c10::FunctionSchema& schema,
-    _CallCaffe2OpFunc* call_op) {
-  // precondition: on the stack, there's one IValue for each argument of the
-  // c10 schema. The last argument is an optional tensor list that
-  // (if not ivalue::None) contains a preallocated output tensor for each
-  // operator output.
-
-  // As an invariant, we don't want any autograd gradients to be tracked in
-  // Caffe2 operators.
-  at::NoGradGuard guard;
-
-  AT_ASSERT(
-      schema.arguments().size() != 0 &&
-      schema.arguments().back().type()->isSubtypeOf(
-          *OptionalType::create(ListType::ofTensors())));
-  IValue preallocated_outputs = torch::jit::pop(*stack);
-
-  const size_t num_inputs = schema.arguments().size() -
-      1; // -1 because the last argument is the list of preallocated tensors
-
-  c10::List<at::Tensor> outputs;
-  if (preallocated_outputs.isNone()) {
-    // either the schema doesn't support preallocated outputs or it does but
-    // they haven't been passed in. Pass a list of uninitialized tensors to
-    // the caffe2 operator as preallocated outputs.
-    outputs.resize(schema.returns().size());
-  } else {
-    AT_ASSERT(preallocated_outputs.isTensorList());
-    outputs = std::move(preallocated_outputs).toTensorList();
-  }
-
-  // TODO Avoid vector allocation. One idea would be to keep the std::vector
-  // instances in the cache.
-  std::vector<c10::IValue> inputs = torch::jit::pop(*stack, num_inputs);
-
-  // Convert outputs to caffe2::Tensor
-  const size_t num_outputs = outputs.size();
-  std::vector<caffe2::Tensor> outputs_c2(num_outputs);
-  for (auto i : c10::irange(num_outputs)) {
-    outputs_c2[i] = caffe2::Tensor(outputs.extract(i));
-  }
-
-  outputs_c2 = (*call_op)(schema, std::move(inputs), std::move(outputs_c2));
-  TORCH_INTERNAL_ASSERT(num_outputs == outputs_c2.size());
-
-  bool return_tensor_list = false;
-  if (schema.returns().size() == 1) {
-    auto type = schema.returns()[0].type();
-    if (c10::ListTypePtr list_type = type->cast<ListType>()) {
-      if (list_type->getElementType()->kind() == c10::TypeKind::TensorType) {
-        return_tensor_list = true;
-      }
-    }
-  }
-  if (return_tensor_list) {
-    for (const auto i : c10::irange(num_outputs)) {
-      outputs.set(i, at::Tensor(std::move(outputs_c2[i])));
-    }
-    torch::jit::push(*stack, outputs);
-  } else {
-    for (const auto i : c10::irange(num_outputs)) {
-      torch::jit::push(*stack, at::Tensor(std::move(outputs_c2[i])));
-    }
-  }
-
-  // postcondition: All inputs are cleared from the stack, there's now one
-  // IValue for each output which holds the result. This
-  // might reuse one of the preallocated tensors but doesn't have
-  // to.
+TORCH_API void call_caffe2_op_from_c10(
+    const c10::OperatorHandle &opHandle,
+    c10::Stack *stack,
+    _CallCaffe2OpFunc *call_op);
+
+template <class Caffe2Operator>
+void boxed_caffe2_operator(const OperatorHandle& opHandle, c10::Stack* stack) {
+  call_caffe2_op_from_c10(
+      opHandle,
+      stack,
+      &_call_caffe2_op<Caffe2Operator>);
 }
 
-template <const c10::FunctionSchema& (*Schema)(), class Caffe2Operator>
-void call_caffe2_op_from_c10(
-    const c10::OperatorHandle& /*opHandle*/,
-    c10::Stack* stack) {
-  _call_caffe2_op_from_c10(stack, Schema(), &_call_caffe2_op<Caffe2Operator>);
-}
+template <c10::DispatchKey key>
+struct TORCH_API RegisterDefinition {
+  RegisterDefinition(const char *name, c10::BoxedKernel kernel);
+};
 
-inline FunctionSchema make_function_schema_for_c10(
-    const char* schema_str,
-    c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
-#if !defined(EXPOSE_C2_OPS) && \
-    (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE))
-  throw std::logic_error(
-      "We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
-#else
-  c10::FunctionSchema parsed_schema = torch::jit::parseSchema(schema_str);
-  std::vector<c10::Argument> arguments = parsed_schema.arguments();
-  arguments.emplace_back(
-      PREALLOCATED_OUTPUT_ARGNAME,
-      c10::OptionalType::create(c10::ListType::ofTensors()),
-      nullopt,
-      IValue());
+extern template struct RegisterDefinition<c10::DispatchKey::CPU>;
+extern template struct RegisterDefinition<c10::DispatchKey::CUDA>;
+extern template struct RegisterDefinition<c10::DispatchKey::HIP>;
 
-  auto schema = FunctionSchema(
-      parsed_schema.name(),
-      parsed_schema.overload_name(),
-      std::move(arguments),
-      parsed_schema.returns(),
-      parsed_schema.is_vararg(),
-      parsed_schema.is_varret());
-  if (optional_alias_analysis_kind) {
-    schema.setAliasAnalysis(*optional_alias_analysis_kind);
-  }
-  return schema;
-#endif
-}
+struct TORCH_API RegisterSchema {
+  RegisterSchema(
+      const char *schema_str,
+      c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind);
+};
 
 } // namespace detail
 } // namespace caffe2
 
-/**
- * To register a caffe2 operator caffe2::MyOperator with the c10 dispatcher,
- * call:
- *
- * In caffe2/operators/MyOperator.h:
- *
- * > C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(C10MyOperator) // C10MyOperator is the
- * name
- * // used by c10 for this operator
- *
- * In caffe2/operators/MyOperator.cc
- *
- * > C10_EXPORT_CAFFE2_OP_TO_C10_CPU (
- * >    C10MyOperator,
- * >    "_caffe2::C10MyOperator(Tensor input1, int argument2, float argument3)
- * -> (Tensor output1, Tensor output2)" > caffe2::MyOperator
- * // This is the caffe2 operator > //
- * class template > )
- *
- * In caffe2/operators/MyOperator.cu
- *
- * > C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(C10MyOperator ,
- *   caffe2::MyOperator)
- *
- * Notes:
- * - all macros must be defined in the top level namespace, not in namespace
- *   caffe2.
- * - all operators must call C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 and
- *   C10_EXPORT_CAFFE2_OP_TO_C10_CPU .
- * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted if
- *   you don't want to expose the operator for CUDA operations.
- * - caffe2 arguments must come after caffe2 inputs, in other words, any tensor
- *   inputs must precede any non-tensor inputs.
- *
- * More complex use cases:
- * - If your operator has a variable number of input tensors, make the first (!)
- *   input an input of type TensorList. There must be no other tensor inputs.
- */
-#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)   \
-  namespace caffe2 {                                        \
-  namespace _c10_ops {                                      \
-  TORCH_API const FunctionSchema& schema_##OperatorName();  \
-  }                                                         \
-  }
-
-#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(                \
-    OperatorName, OperatorSchema, OptionalAliasAnalysisKind)    \
-  /* Register the op schema with the c10 dispatcher */          \
-  namespace caffe2 {                                            \
-  namespace _c10_ops {                                          \
-  C10_EXPORT const FunctionSchema& schema_##OperatorName() {    \
-    static const FunctionSchema schema =                        \
-        ::caffe2::detail::make_function_schema_for_c10(         \
-            OperatorSchema, OptionalAliasAnalysisKind);         \
-    return schema;                                              \
-  }                                                             \
-  TORCH_LIBRARY_FRAGMENT(_caffe2, m) {                          \
-    m.def(::caffe2::detail::make_function_schema_for_c10(       \
-        OperatorSchema, OptionalAliasAnalysisKind));             \
-  }                                                             \
-  }                                                             \
-  }
-
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(                         \
-    OperatorName, OperatorClass)                                             \
-  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-  TORCH_LIBRARY_IMPL(_caffe2, CPU, m) {                                      \
-    m.impl(                                                                  \
-        "_caffe2::" #OperatorName,                                           \
-        torch::CppFunction::makeFromBoxedFunction<                           \
-            ::caffe2::detail::call_caffe2_op_from_c10<                       \
-                ::caffe2::_c10_ops::schema_##OperatorName,                   \
-                OperatorClass>>());                                          \
-  }
 
+#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)
 
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(               \
-    OperatorName, OperatorSchema, OperatorClass)       \
-  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(             \
-      OperatorName, OperatorSchema, c10::nullopt)      \
+#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(                \
+    OperatorName, OperatorSchema, OptionalAliasAnalysisKind)    \
+  /* Register the op schema with the c10 dispatcher */          \
+  static const caffe2::detail::RegisterSchema                   \
+      C10_ANONYMOUS_VARIABLE(RegisterSchema_static_init_)(      \
+          OperatorSchema, OptionalAliasAnalysisKind);
+
+#define _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(                                    \
+    OperatorName, OperatorClass, Key)                                        \
+  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
+  static const caffe2::detail::RegisterDefinition<c10::DispatchKey::Key>     \
+      C10_ANONYMOUS_VARIABLE(Register##Key##Definition_static_init_)(        \
+          "_caffe2::" #OperatorName,                                         \
+          c10::BoxedKernel::makeFromFunction<                                \
+              &::caffe2::detail::boxed_caffe2_operator<OperatorClass>>());
+
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(    \
+    OperatorName, OperatorClass)                        \
+  _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(OperatorName, OperatorClass, CPU)
+
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(               \
+    OperatorName, OperatorSchema, OperatorClass)       \
+  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(             \
+      OperatorName, OperatorSchema, c10::nullopt)      \
   C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
 
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_WITH_ALIAS_ANALYSIS(                 \
-    OperatorName, OperatorSchema, OperatorClass, OptionalAliasAnalysisKind)  \
-  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(                                   \
-      OperatorName, OperatorSchema, OptionalAliasAnalysisKind)               \
-  C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass) \
+  _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(OperatorName, OperatorClass, CUDA)
 
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass)        \
-  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-  TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) {                                     \
-    m.impl(                                                                  \
-        "_caffe2::" #OperatorName,                                           \
-        torch::CppFunction::makeFromBoxedFunction<                           \
-            ::caffe2::detail::call_caffe2_op_from_c10<                       \
-                ::caffe2::_c10_ops::schema_##OperatorName,                   \
-                OperatorClass>>());                                          \
-  }
 
 // You should never manually call the C10_EXPORT_CAFFE2_OP_TO_C10_HIP macro.
 // The C10_EXPORT_CAFFE2_OP_TO_C10_CUDA macro from above will be automatically
 // rewritten to C10_EXPORT_CAFFE2_OP_TO_C10_HIP by hipify.
-#define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass)         \
-  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-  TORCH_LIBRARY_IMPL(_caffe2, HIP, m) {                                      \
-    m.impl(                                                                  \
-        "_caffe2::" #OperatorName,                                           \
-        torch::CppFunction::makeFromBoxedFunction<                           \
-            ::caffe2::detail::call_caffe2_op_from_c10<                       \
-                ::caffe2::_c10_ops::schema_##OperatorName,                   \
-                OperatorClass>>());                                          \
-  }
+#define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass) \
+  _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(OperatorName, OperatorClass, HIP)
+
 
 #else
 
 // Don't use c10 dispatcher on mobile because of binary size
diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu
index 434a9b797cf51b..bdf94e2b4193fe 100644
--- a/caffe2/sgd/adagrad_fused_op_gpu.cu
+++ b/caffe2/sgd/adagrad_fused_op_gpu.cu
@@ -1,4 +1,4 @@
-#include
+#define TORCH_ASSERT_NO_OPERATORS
 #include
 #include
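
Note on call sites: usage of the export macros is unchanged by this patch; only their expansion moves from header-expanded TORCH_LIBRARY_FRAGMENT/TORCH_LIBRARY_IMPL blocks to the out-of-line RegisterSchema and RegisterDefinition helpers, which register the schema and kernels during static initialization. As a rough sketch of what a call site still looks like (C10MyOperator, the schema string, and caffe2::MyOperator are placeholders carried over from the doc comment deleted above, not operators touched by this patch):

// In the operator's header (placeholder operator), at top-level namespace:
#include <caffe2/core/export_caffe2_op_to_c10.h>
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(C10MyOperator)

// In the operator's .cc file, also at top-level namespace:
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    C10MyOperator,
    "_caffe2::C10MyOperator(Tensor input1, int argument2, float argument3) "
    "-> (Tensor output1, Tensor output2)",
    caffe2::MyOperator)

// Optionally, in the .cu file, to expose the CUDA kernel as well:
C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(C10MyOperator, caffe2::MyOperator)

After this patch C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 expands to nothing but is kept so existing call sites compile unchanged.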