cpp_wrapper: Move #includes to per-device header files (#143909) · pytorch/pytorch@d62b397 · GitHub
[go: up one dir, main page]

Skip to content

Commit d62b397

Browse files
benjaminglass1 authored and pytorchmergebot committed
cpp_wrapper: Move #includes to per-device header files (#143909)
This prepares us for the next PR in the stack, where we introduce pre-compiled per-device header files to save compilation time. Differential Revision: [D67938955](https://our.internmc.facebook.com/intern/diff/D67938955) Pull Request resolved: #143909 Approved by: https://github.com/desertfire
1 parent 05095a4 commit d62b397

25 files changed

+159
-111
lines changed

.lintrunner.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ exclude_patterns = [
556556
command = [
557557
'python3',
558558
'tools/linter/adapters/grep_linter.py',
559-
'--pattern=#include <pybind11\/',
559+
'--pattern=#include <pybind11\/(^|[^(gil\.h)])',
560560
'--allowlist-pattern=#include <torch\/csrc\/utils\/pybind.h>',
561561
'--linter-name=PYBIND11_INCLUDE',
562562
'--match-first-only',

setup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,13 +1284,16 @@ def main():
12841284
"include/torch/csrc/distributed/autograd/rpc_messages/*.h",
12851285
"include/torch/csrc/dynamo/*.h",
12861286
"include/torch/csrc/inductor/*.h",
1287+
"include/torch/csrc/inductor/aoti_include/*.h",
12871288
"include/torch/csrc/inductor/aoti_package/*.h",
12881289
"include/torch/csrc/inductor/aoti_runner/*.h",
12891290
"include/torch/csrc/inductor/aoti_runtime/*.h",
12901291
"include/torch/csrc/inductor/aoti_torch/*.h",
12911292
"include/torch/csrc/inductor/aoti_torch/c/*.h",
12921293
"include/torch/csrc/inductor/aoti_torch/generated/*.h",
12931294
"include/torch/csrc/inductor/aoti_torch/generated/extend/*.h",
1295+
"include/torch/csrc/inductor/cpp_wrapper/*.h",
1296+
"include/torch/csrc/inductor/cpp_wrapper/device_internal/*.h",
12941297
"include/torch/csrc/jit/*.h",
12951298
"include/torch/csrc/jit/backends/*.h",
12961299
"include/torch/csrc/jit/generated/*.h",

torch/_inductor/codecache.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -688,7 +688,6 @@ def get_code_hash(root: str) -> bytes:
688688
# a hash representing the state of the source code.
689689
extra_files = (
690690
"codegen/aoti_runtime/interface.cpp",
691-
"codegen/aoti_runtime/implementation.cpp",
692691
"codegen/cpp_prefix.h",
693692
"script.ld",
694693
)

torch/_inductor/codegen/common.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,6 @@ def kernel_header(self):
250250
def kernel_driver(self):
251251
raise NotImplementedError
252252

253-
def abi_compatible_header(self):
254-
raise NotImplementedError
255-
256253
def cpp_stream_type(self):
257254
raise NotImplementedError
258255

torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 18 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from torch.utils._sympy.symbol import symbol_is_type, SymT
1919

2020
from .. import config, ir
21-
from ..utils import _align, ALIGN_BYTES, cache_on_self, normalize_name
21+
from ..utils import _align, cache_on_self, normalize_name
2222
from ..virtualized import V
2323
from .aoti_hipify_utils import maybe_hipify_code_wrapper
2424
from .common import get_device_op_overrides, IndentedBuffer, Kernel
@@ -126,85 +126,35 @@ def write_constant(self, name, hashed):
126126
# include a hash so our code cache gives different constants different files
127127
self.header.writeline(f"// {name} {hashed}")
128128

129+
def get_device_include(self):
130+
if V.graph.aot_mode:
131+
return f"#include <torch/csrc/inductor/aoti_include/{self.device}.h>"
132+
return f"#include <torch/csrc/inductor/cpp_wrapper/{self.device}.h>"
133+
129134
def write_header(self):
130135
if V.graph.is_const_graph:
131136
# We do not write header for constant graph, it will be written by main module.
132137
return
133138

134-
if V.graph.aot_mode:
135-
self.header.splice(
136-
"""
137-
#include <torch/csrc/inductor/aoti_runtime/interface.h>
138-
#include <torch/csrc/inductor/aoti_runtime/model.h>
139-
"""
140-
)
141-
with open(
142-
os.path.join(os.path.dirname(__file__), "aoti_runtime", "interface.cpp")
143-
) as f:
144-
self.header.splice(f.read())
145-
else:
139+
if not V.graph.aot_mode:
146140
self.header.splice(
147141
"""
148142
import torch
149143
from torch._inductor.codecache import CppWrapperCodeCache
150144
151145
cpp_wrapper_src = (
152146
'''
153-
#include <optional>
154-
#include <Python.h>
155-
156-
#define PYBIND11_SIMPLE_GIL_MANAGEMENT
157-
#include <pybind11/gil.h>
158-
namespace py = pybind11;
159-
160-
class RAIIPyObject {
161-
public:
162-
RAIIPyObject() : obj_(nullptr) {}
163-
RAIIPyObject(PyObject* obj) : obj_(obj) {}
164-
~RAIIPyObject() {
165-
Py_XDECREF(obj_);
166-
}
167-
RAIIPyObject& operator=(const RAIIPyObject& other) {
168-
if (this != &other) {
169-
Py_XDECREF(obj_);
170-
obj_ = other.obj_;
171-
Py_XINCREF(obj_);
172-
}
173-
return *this;
174-
}
175-
operator PyObject*() {
176-
return obj_;
177-
}
178-
PyObject* get() {
179-
return obj_;
180-
}
181-
private:
182-
PyObject* obj_;
183-
};
184-
185-
#include <torch/csrc/inductor/aoti_runtime/device_utils.h>
186-
#include <torch/csrc/inductor/aoti_runtime/utils.h>
187-
using namespace torch::aot_inductor;
188147
"""
189148
)
190149

191-
self.header.splice(
192-
f"""
193-
#include <torch/csrc/inductor/aoti_runtime/arrayref_tensor.h>
194-
#include <torch/csrc/inductor/aoti_runtime/thread_local.h>
195-
#include <torch/csrc/inductor/aoti_runtime/scalar_to_tensor.h>
196-
#include <torch/csrc/inductor/aoti_torch/generated/c_shim_{self.device}.h>
197-
198-
#include <c10/util/generic_math.h>
199-
typedef at::Half half;
200-
typedef at::BFloat16 bfloat16;
201-
202-
// Round up to the nearest multiple of {ALIGN_BYTES}
203-
[[maybe_unused]] static int64_t align(int64_t nbytes) {{
204-
return (nbytes + {ALIGN_BYTES} - 1) & -{ALIGN_BYTES};
205-
}}
206-
"""
207-
)
150+
self.header.splice(self.get_device_include())
151+
152+
if V.graph.aot_mode:
153+
with open(
154+
os.path.join(os.path.dirname(__file__), "aoti_runtime", "interface.cpp")
155+
) as f:
156+
self.header.splice(f.read())
157+
208158
extend_aoti_c_shim_include = (
209159
f"torch/csrc/inductor/aoti_torch/generated/extend/c_shim_{self.device}.h"
210160
)
@@ -1517,8 +1467,10 @@ def create_dtypeview_call(reinterpret_call: str) -> tuple[str, List[str]]:
15171467
return final_tmp_name
15181468

15191469
def codegen_device_copy(self, src, dst, non_blocking: bool):
1470+
"""This function is overridden by cpp_wrapper_cpu_array_ref, so we don't need to
1471+
handle cases where dst is not an AtenTensorHandle."""
15201472
self.writeline(
1521-
f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_copy_(expensive_copy_to_tensor_if_needed({dst}), {src}, {non_blocking}));"
1473+
f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_copy_({dst}, {src}, {non_blocking}));"
15221474
)
15231475

15241476
def codegen_multi_output(self, name, value):

torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# mypy: allow-untyped-defs
2-
import os
32
from itertools import count
43
from typing import Callable, Dict, List, Optional
54

@@ -82,18 +81,11 @@ def get_input_cpp_type(input):
8281
return DTYPE_TO_CPP[dtype]
8382
return f"ArrayRefTensor<{DTYPE_TO_CPP[input.get_dtype()]}>"
8483

85-
def write_header(self):
86-
if V.graph.is_const_graph:
87-
# We do not write header for constant graph, it will be written by main module.
88-
return
89-
90-
super().write_header()
91-
with open(
92-
os.path.join(
93-
os.path.dirname(__file__), "aoti_runtime", "implementation.cpp"
94-
)
95-
) as f:
96-
self.header.splice(f.read())
84+
def get_device_include(self):
85+
assert self.device == "cpu", "ArrayRef only supported on CPU!"
86+
if V.graph.aot_mode:
87+
return "#include <torch/csrc/inductor/aoti_include/array_ref.h>"
88+
return "#include <torch/csrc/inductor/cpp_wrapper/array_ref.h>"
9789

9890
def codegen_input_numel_asserts(self):
9991
for name, buf in V.graph.graph_inputs.items():

torch/_inductor/codegen/cpp_wrapper_gpu.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,6 @@ def write_header(self):
203203
return
204204

205205
super().write_header()
206-
207-
self.header.splice("#include <filesystem>")
208-
self.header.splice(self.device_codegen.abi_compatible_header())
209206
self.header.splice(
210207
maybe_hipify_code_wrapper(self.device_codegen.kernel_driver())
211208
)

torch/_inductor/codegen/cuda/device_op_overrides.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -225,9 +225,6 @@ def tma_descriptor_helpers(self):
225225
#endif
226226
"""
227227

228-
def abi_compatible_header(self):
229-
return "#include <torch/csrc/inductor/aoti_runtime/utils_cuda.h>"
230-
231228
def cpp_stream_type(self):
232229
return "cudaStream_t"
233230

torch/_inductor/codegen/debug_utils.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,13 @@ class DebugPrinterManager:
5353
def __init__(
5454
self,
5555
debug_printer_level,
56+
use_array_ref: bool,
5657
args_to_print_or_save: Optional[List[str]] = None,
5758
kernel_name: str = "",
5859
kernel=None,
59-
arg_signatures: Optional[List[type]] = None,
60-
kernel_type=None,
6160
):
6261
self.debug_printer_level = IntermediateValueDebuggingLevel(debug_printer_level)
62+
self.use_array_ref = use_array_ref
6363
if args_to_print_or_save is None:
6464
args_to_print_or_save = []
6565
self.args_to_print_or_save = args_to_print_or_save
@@ -155,12 +155,15 @@ def set_printer_args(
155155
]
156156
self.args_to_print_or_save = args_to_print_or_save_extern
157157
elif kernel_type == "cpp":
158-
args_to_print_or_save_cpp = [
159-
f"copy_arrayref_tensor_to_tensor({arg})"
158+
self.args_to_print_or_save = [
159+
(
160+
f"copy_arrayref_tensor_to_tensor({arg})"
161+
if self.use_array_ref
162+
else arg
163+
)
160164
for arg in args_to_print_or_save
161165
if arg.startswith(("buf", "arg"))
162166
]
163-
self.args_to_print_or_save = args_to_print_or_save_cpp
164167
else:
165168
self.args_to_print_or_save = args_to_print_or_save
166169
self.kernel_name = kernel_name

torch/_inductor/codegen/wrapper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,8 @@ def add_import_once(line: str) -> None:
721721

722722
# intermediate tensor value printing utility
723723
self.debug_printer = DebugPrinterManager(
724-
debug_printer_level=config.aot_inductor.debug_intermediate_value_printer
724+
debug_printer_level=config.aot_inductor.debug_intermediate_value_printer,
725+
use_array_ref=config.aot_inductor.allow_stack_allocation,
725726
)
726727

727728
# Additional files that are dependent to the wrapper (ex. cubin files)

0 commit comments

Comments
(0)