[AOTI][reland] Add an option to specify custom op C shim (#153968) · pytorch/pytorch@72a3c8d
Commit 72a3c8d

desertfire authored and pytorchmergebot committed
[AOTI][reland] Add an option to specify custom op C shim (#153968)
Summary: Reland #153851 after fixing a fuzzer test issue. Add an option that tells AOTInductor codegen to generate C shim functions for certain custom ops instead of relying on ProxyExecutor. The library that defines those custom ops needs to implement the corresponding C shim functions.

Pull Request resolved: #153968
Approved by: https://github.com/hl475
1 parent b7d08de · commit 72a3c8d
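Taken together, the changes below add two aot_inductor config options: custom_ops_to_c_shims (an op overload → C shim declaration dict) and custom_op_libs (libraries implementing those shims). A minimal usage sketch based on the test added in this commit; the Model class, example inputs, and the final export/package calls are illustrative placeholders, not part of the diff:

    import torch
    from torch._inductor import config

    class Model(torch.nn.Module):
        def forward(self, x):
            # Custom op registered by the aoti_custom_ops library
            return torch.ops.aoti_custom_ops.fn_square(x)

    m = Model()
    example_inputs = (torch.randn(2, 3),)

    with config.patch(
        # Tell codegen which C shim declarations exist for this op, so it
        # emits direct calls instead of routing through ProxyExecutor.
        "aot_inductor.custom_ops_to_c_shims",
        {
            torch.ops.aoti_custom_ops.fn_square.default: [
                """
                AOTITorchError
                aoti_torch_cpu_fn_square(
                    AtenTensorHandle input,
                    AtenTensorHandle* ret)""",
            ],
        },
    ), config.patch(
        # Link the library that implements the shim into the generated .so
        "aot_inductor.custom_op_libs",
        ["aoti_custom_ops"],
    ):
        ep = torch.export.export(m, example_inputs)
        torch._inductor.aoti_compile_and_package(ep)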

File tree

7 files changed: +108 -8 lines

  • test/inductor/custom_ops.cpp
  • test/inductor/test_aot_inductor_custom_ops.py
  • torch/_inductor/codegen/cpp_wrapper_cpu.py
  • torch/_inductor/config.py
  • torch/_inductor/cpp_builder.py
  • torch/_inductor/fuzzer.py
  • torch/_inductor/ir.py

test/inductor/custom_ops.cpp

Lines changed: 38 additions & 0 deletions

@@ -1,5 +1,8 @@
 #include <torch/csrc/api/include/torch/types.h> // @manual=fbcode//caffe2:libtorch
 
+#include <torch/csrc/inductor/aoti_torch/c/shim.h>
+#include <torch/csrc/inductor/aoti_torch/utils.h>
+
 #include <cstdint>
 #include <iostream>
 #include <string>
@@ -310,8 +313,40 @@ void fn_out_variant_without_return_meta(
     Tensor& out) {
 }
 
+Tensor fn_square_impl(const Tensor& tensor) {
+  return tensor * tensor;
+}
+
+Tensor fn_square_meta(const Tensor& tensor) {
+  return at::empty_like(tensor);
+}
 } // namespace at
 
+
+extern "C" {
+AOTI_TORCH_EXPORT AOTITorchError
+aoti_torch_cpu_fn_square(
+    AtenTensorHandle input,
+    AtenTensorHandle* ret) {
+  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto tmp_result = at::fn_square_impl(
+        torch::aot_inductor::resolve_tensor_dispatch_flags(input));
+    *ret = torch::aot_inductor::new_tensor_handle(std::move(tmp_result));
+  });
+}
+
+AOTI_TORCH_EXPORT AOTITorchError
+aoti_torch_cuda_fn_square(
+    AtenTensorHandle input,
+    AtenTensorHandle* ret) {
+  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto tmp_result = at::fn_square_impl(
+        torch::aot_inductor::resolve_tensor_dispatch_flags(input));
+    *ret = torch::aot_inductor::new_tensor_handle(std::move(tmp_result));
+  });
+}
+}
+
 TORCH_LIBRARY(aoti_custom_ops, m) {
   m.def("custom_add(Tensor t1, Tensor t2) -> Tensor");
   m.def(
@@ -354,6 +389,7 @@ TORCH_LIBRARY(aoti_custom_ops, m) {
       "fn_with_input_mutation(Tensor(a!) t0, Tensor t1, Tensor(b!) t2) -> (Tensor, Tensor)");
 
   m.def("fn_out_variant_without_return(Tensor x, Tensor(a!) out) -> ()");
+  m.def("fn_square(Tensor x) -> Tensor");
 }
 
 TORCH_LIBRARY_IMPL(aoti_custom_ops, CompositeExplicitAutograd, m) {
@@ -365,6 +401,7 @@ TORCH_LIBRARY_IMPL(aoti_custom_ops, CompositeExplicitAutograd, m) {
   m.impl("fn_with_mix_outputs", at::fn_with_mix_outputs_impl);
   m.impl("fn_with_input_mutation", at::fn_with_input_mutation_impl);
   m.impl("fn_out_variant_without_return", at::fn_out_variant_without_return_impl);
+  m.impl("fn_square", at::fn_square_impl);
 }
 
 TORCH_LIBRARY_IMPL(aoti_custom_ops, Meta, m) {
@@ -375,4 +412,5 @@ TORCH_LIBRARY_IMPL(aoti_custom_ops, Meta, m) {
   m.impl("fn_with_mix_outputs", at::fn_with_mix_outputs_meta);
   m.impl("fn_with_input_mutation", at::fn_with_input_mutation_meta);
   m.impl("fn_out_variant_without_return", at::fn_out_variant_without_return_meta);
+  m.impl("fn_square", at::fn_square_meta);
 }

test/inductor/test_aot_inductor_custom_ops.py

Lines changed: 33 additions & 0 deletions

@@ -20,6 +20,8 @@
     IS_MACOS,
     IS_SANDCASTLE,
     IS_WINDOWS,
+    skipIfRocm,
+    skipIfXpu,
 )
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
 from torch.testing._internal.triton_utils import HAS_CUDA
@@ -356,6 +358,37 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         self.assertEqual(len(inps), 0)
         self.assertTrue(sentinel_seen)
 
+    @skipIfXpu
+    @skipIfRocm
+    def test_custom_op_square(self) -> None:
+        class Model(torch.nn.Module):
+            def forward(self, x):
+                return torch.ops.aoti_custom_ops.fn_square(x)
+
+        m = Model().to(device=self.device)
+        args = (torch.randn(2, 3, device=self.device),)
+        with config.patch(
+            "aot_inductor.custom_ops_to_c_shims",
+            {
+                torch.ops.aoti_custom_ops.fn_square.default: [
+                    """
+                    AOTITorchError
+                    aoti_torch_cpu_fn_square(
+                        AtenTensorHandle input,
+                        AtenTensorHandle* ret)""",
+                    """
+                    AOTITorchError
+                    aoti_torch_cuda_fn_square(
+                        AtenTensorHandle input,
+                        AtenTensorHandle* ret)""",
+                ],
+            },
+        ), config.patch(
+            "aot_inductor.custom_op_libs",
+            ["aoti_custom_ops"],
+        ):
+            self.check_model(m, args)
+
 
 class AOTInductorLoggingTest(LoggingTestCase):
     @make_logging_test(dynamic=logging.DEBUG)

torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 17 additions & 1 deletion

@@ -6,7 +6,7 @@
 import os
 import sys
 import textwrap
-from itertools import count
+from itertools import chain, count
 from typing import Callable, Optional, Protocol, TYPE_CHECKING, Union
 
 import sympy
@@ -237,6 +237,22 @@ def write_prefix(self):
         if V.graph.is_const_graph:
             # We do not write prefix for constant graph, it will be written by main module.
             return
+        if config.aot_inductor.custom_ops_to_c_shims:
+            # custom_ops_to_c_shims contains declaration of custom ops with C shim.
+            # TODO: this could be auto-generated from a passed-in custom op schema
+            custom_c_shims = list(
+                chain(*config.aot_inductor.custom_ops_to_c_shims.values())
+            )
+            declarations = "\n".join(
+                [f"extern {textwrap.dedent(shim)};" for shim in custom_c_shims]
+            )
+            self.prefix.splice(
+                f"""
+                extern "C" {{
+                    {declarations}
+                }}
+                """
+            )
         if V.graph.aot_mode:
             self.prefix.writeline("namespace torch::aot_inductor {")
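With the two shim declarations from the test above patched in, the prefix this branch splices into the generated cpp wrapper would look roughly like the following (a sketch of the emitted C++; exact whitespace depends on textwrap.dedent and splice handling):

    extern "C" {
        // Forward declarations of the shims; the implementations live in
        // the custom-op library linked in via custom_op_libs.
        extern AOTITorchError
        aoti_torch_cpu_fn_square(
            AtenTensorHandle input,
            AtenTensorHandle* ret);
        extern AOTITorchError
        aoti_torch_cuda_fn_square(
            AtenTensorHandle input,
            AtenTensorHandle* ret);
    }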

torch/_inductor/config.py

Lines changed: 5 additions & 0 deletions

@@ -1317,6 +1317,11 @@ class aot_inductor:
     # Embed generated .cubin files into the .so
     embed_cubin: bool = False
 
+    # Custom ops that have implemented C shim wrappers, defined as an op to C shim declaration dict
+    custom_ops_to_c_shims: dict[torch._ops.OpOverload, list[str]] = {}
+    # custom op libs that have implemented C shim wrappers
+    custom_op_libs: Optional[list[str]] = None
+
 
 class cuda:
     """Settings for cuda backend, today this consists of cutlass"""

torch/_inductor/cpp_builder.py

Lines changed: 3 additions & 0 deletions

@@ -1323,6 +1323,9 @@ def get_cpp_torch_device_options(
             # Only add link args, when compile_only is false.
             passthrough_args = ["-Wl,-Bstatic -lcudart_static -Wl,-Bdynamic"]
 
+    if config.aot_inductor.custom_op_libs:
+        libraries += config.aot_inductor.custom_op_libs
+
     return (
         definitions,
         include_dirs,

torch/_inductor/fuzzer.py

Lines changed: 8 additions & 6 deletions

@@ -220,9 +220,9 @@ def _generate_value_for_type(
             elem_type = getattr(
                 type_hint,
                 "__args__",
-                [type(default[0])] if len(default) else [type(None)],
+                [type(default[0])] if default and len(default) else [type(None)],
             )[0]
-            new_default = default[0] if len(default) > 0 else None
+            new_default = default[0] if default and len(default) > 0 else None
             return [
                 SamplingMethod._generate_value_for_type(
                     random_sample, field_name, elem_type, new_default
@@ -234,9 +234,9 @@ def _generate_value_for_type(
             elem_type = getattr(
                 type_hint,
                 "__args__",
-                [type(indexable[0])] if len(default) else [type(None)],
+                [type(indexable[0])] if default and len(default) else [type(None)],
             )[0]
-            new_default = indexable[0] if len(default) > 0 else None
+            new_default = indexable[0] if default and len(default) > 0 else None
             return {  # noqa: set_linter
                 SamplingMethod._generate_value_for_type(
                     random_sample, field_name, elem_type, new_default
@@ -248,9 +248,9 @@ def _generate_value_for_type(
             elem_type = getattr(
                 type_hint,
                 "__args__",
-                [type(indexable[0])] if len(default) else [type(None)],
+                [type(indexable[0])] if default and len(default) else [type(None)],
             )[0]
-            new_default = indexable[0] if len(default) > 0 else None
+            new_default = indexable[0] if default and len(default) > 0 else None
             return OrderedSet(
                 [
                     SamplingMethod._generate_value_for_type(
@@ -363,6 +363,8 @@ def dummy_function(*args, **kwargs):  # type: ignore[no-untyped-def]
             )
 
             return dummy_function
+        elif type_hint == torch._ops.OpOverload:
+            return torch.ops.aten.add.default
         elif TypeExemplars.contains(type_hint):
            return TypeExemplars.example(type_hint)
         elif type_hint == Any:
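The first three hunks guard against default being None, which is presumably the fuzzer failure the summary mentions (the new custom_op_libs option defaults to None), and the last hunk gives the fuzzer an exemplar value for OpOverload-typed hints. A minimal standalone illustration of the crash the guard avoids, assuming a None default reaches the old code path:

    default = None  # e.g. the default value of aot_inductor.custom_op_libs

    # Old guard: len(None) raises TypeError
    try:
        elem_types = [type(default[0])] if len(default) else [type(None)]
    except TypeError as e:
        print(e)  # object of type 'NoneType' has no len()

    # New guard: `default and len(default)` short-circuits to the fallback
    elem_types = [type(default[0])] if default and len(default) else [type(None)]
    print(elem_types)  # [<class 'NoneType'>]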

torch/_inductor/ir.py

Lines changed: 4 additions & 1 deletion

@@ -7032,7 +7032,10 @@ def codegen(self, wrapper) -> None:  # type: ignore[no-untyped-def]
             assert isinstance(kernel, torch._ops.OpOverload)
         elif V.graph.cpp_wrapper:
             # For non-aten OpOverload, i.e. custom ops
-            self.use_runtime_dispatch = True
+            # If the op is in custom_ops_to_c_shims, generate direct function call
+            self.use_runtime_dispatch = (
+                kernel not in config.aot_inductor.custom_ops_to_c_shims
+            )
 
         def do_runtime_dispatch() -> None:
             args = None
