[AOTI] Add a SlimTensor representation · pytorch/pytorch@8f0e46b

Commit 8f0e46b

[AOTI] Add a SlimTensor representation

Summary: [ghstack-poisoned]

1 parent 09b3516 · commit 8f0e46b

37 files changed: +1756 -68 lines

caffe2/CMakeLists.txt (+3 -1)

@@ -1301,7 +1301,9 @@ target_include_directories(torch_cpu PRIVATE
 target_include_directories(torch_cpu PRIVATE
   ${TORCH_ROOT}/third_party/nlohmann/include)
 
-install(DIRECTORY "${TORCH_SRC_DIR}/csrc"
+install(DIRECTORY
+  "${TORCH_SRC_DIR}/csrc"
+  "${TORCH_SRC_DIR}/standalone"
   DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch
   FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp")
 install(FILES

test/inductor/test_aot_inductor.py (+12)

@@ -156,6 +156,18 @@ def forward(self, x, y):
             model, example_inputs, "AOTInductorModelRunMinimalArrayrefInterface(", 1
         )
 
+    def test_cos(self):
+        class Model(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+
+            def forward(self, x):
+                y = torch.cos(x)
+                return y
+
+        example_inputs = (torch.randn(16, 10, device=self.device),)
+        self.check_model(Model(), example_inputs)
+
     def test_small_constant(self):
         class Model(torch.nn.Module):
             def __init__(self) -> None:
New test file (+121)

@@ -0,0 +1,121 @@
+# Owner(s): ["module: inductor"]
+import copy
+import functools
+import sys
+import unittest
+
+from torch._inductor import config
+from torch._inductor.test_case import TestCase
+from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS
+from torch.testing._internal.inductor_utils import GPU_TYPE
+
+
+if IS_WINDOWS and IS_CI:
+    sys.stderr.write(
+        "Windows CI does not have necessary dependencies for test_torchinductor yet\n"
+    )
+    if __name__ == "__main__":
+        sys.exit(0)
+    raise unittest.SkipTest("requires sympy/functorch/filelock")
+
+try:
+    try:
+        from .test_aot_inductor import (
+            AOTInductorTestsTemplate,
+            check_model,
+            check_model_with_multiple_inputs,
+            code_check_count,
+        )
+    except ImportError:
+        from test_aot_inductor import (  # @manual
+            AOTInductorTestsTemplate,
+            check_model,
+            check_model_with_multiple_inputs,
+            code_check_count,
+        )
+except (unittest.SkipTest, ImportError):
+    if __name__ == "__main__":
+        sys.exit(0)
+    raise
+
+
+# Similar to copy_tests in test_torchinductor.py, but only takes a whitelist of tests
+def copy_tests(my_cls, other_cls, suffix, whitelist):  # noqa: B902
+    for name, value in my_cls.__dict__.items():
+        if name.startswith("test_") and name in whitelist:
+            # You cannot copy functions in Python, so we use closures here to
+            # create objects with different ids. Otherwise, unittest.skip
+            # would modify all methods sharing the same object id. Also, by
+            # using a default argument, we create a copy instead of a
+            # reference. Otherwise, we would lose access to the value.
+
+            @functools.wraps(value)
+            @config.patch(
+                {
+                    "aot_inductor.codegen_standalone": True,
+                    "max_autotune_gemm_backends": "TRITON",
+                    "max_autotune_conv_backends": "TRITON",
+                }
+            )
+            def new_test(self, value=value):
+                return value(self)
+
+            # Copy __dict__ which may contain test metadata
+            new_test.__dict__ = copy.deepcopy(value.__dict__)
+            setattr(other_cls, f"{name}_{suffix}", new_test)
+
+    # Special case convenience routine
+    if hasattr(my_cls, "is_dtype_supported"):
+        other_cls.is_dtype_supported = my_cls.is_dtype_supported
+
+
+test_list_cpu = {
+    # Need to sort out third-party library build issues, e.g. blas, sleef
+}
+
+
+class AOTInductorTestLibtorchFreeCpu(TestCase):
+    device = "cpu"
+    device_type = "cpu"
+    check_model = check_model
+    check_model_with_multiple_inputs = check_model_with_multiple_inputs
+    code_check_count = code_check_count
+    allow_stack_allocation = False
+    use_minimal_arrayref_interface = False
+
+
+copy_tests(
+    AOTInductorTestsTemplate,
+    AOTInductorTestLibtorchFreeCpu,
+    "cpu_standalone",
+    test_list_cpu,
+)
+
+test_list_gpu = {
+    "test_cos",
+}
+
+
+@unittest.skipIf(sys.platform == "darwin", "No CUDA on MacOS")
+class AOTInductorTestLibtorchFreeGpu(TestCase):
+    device = GPU_TYPE
+    device_type = GPU_TYPE
+    check_model = check_model
+    check_model_with_multiple_inputs = check_model_with_multiple_inputs
+    code_check_count = code_check_count
+    allow_stack_allocation = False
+    use_minimal_arrayref_interface = False
+
+
+copy_tests(
+    AOTInductorTestsTemplate,
+    AOTInductorTestLibtorchFreeGpu,
+    f"{GPU_TYPE}_standalone",
+    test_list_gpu,
+)
+
+
+if __name__ == "__main__":
+    from torch._inductor.test_case import run_tests
+
+    run_tests(needs="filelock")
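The comment inside copy_tests is doing real work: binding value as a default argument (value=value) is what keeps each generated test bound to its own source method. A minimal standalone sketch (illustrative only, not part of the commit) of the failure mode it avoids:

tests = {"test_a": lambda self: "a", "test_b": lambda self: "b"}

buggy = {}
for name, value in tests.items():
    def new_test(self):  # closes over the loop variable itself...
        return value(self)  # ...which refers to the last function once the loop ends
    buggy[name] = new_test

fixed = {}
for name, value in tests.items():
    def new_test(self, value=value):  # default argument snapshots value per iteration
        return value(self)
    fixed[name] = new_test

print(buggy["test_a"](None))  # "b" -- both copies ended up calling test_b
print(fixed["test_a"](None))  # "a" -- each copy kept its own function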

torch/_inductor/codecache.py (+49 -4)

@@ -1532,6 +1532,12 @@ def get_keys(cls) -> KeysView[str]:
 
 
 class AotCodeCompiler:
+    """
+    AotCodeCompiler handles the compilation of AOTInductor kernels. It is
+    responsible for generating the kernel and wrapper code, and for
+    compiling and packaging them.
+    """
+
     @classmethod
     def compile(
         cls,

@@ -1744,6 +1750,9 @@ def _compile_consts(consts: bytes, platform: str) -> str:
 
         metadata = config.aot_inductor.metadata
         metadata["AOTI_DEVICE_KEY"] = device_type
+        metadata["STANDALONE"] = (
+            "1" if config.aot_inductor.codegen_standalone else "0"
+        )
 
         # Save user provided metadata
         meta_json = str(

@@ -1878,6 +1887,27 @@ def _pad_to_alignment(raw_bytes: bytes) -> bytes:
 
         log.debug("aot wrapper compilation command: %s", wrapper_compile_cmd)
         log.debug("aot kernel compilation command: %s", kernel_compile_cmd)
+
+        cuda_utils_o: list[str] = []
+        if config.aot_inductor.codegen_standalone and device_type == "cuda":
+            # TODO: selectively add additional cuda files
+            cuda_util_files: list[str] = []
+            cuda_build_options = CppTorchDeviceOptions(
+                compiler="nvcc",
+                compile_only=True,
+                **compile_command,
+            )
+            for file in cuda_util_files:
+                cuda_builder = CppBuilder(
+                    name=file,
+                    sources=file,
+                    output_dir=str(wrapper_path_operator.parent),
+                    BuildOption=cuda_build_options,
+                )
+                if not config.aot_inductor.package_cpp_only:
+                    cuda_builder.build()
+                    cuda_utils_o.append(cuda_builder.get_target_file_path())
+
         if config.aot_inductor.package_cpp_only:
             # Not doing the actual compilation here
             compile_flags = str(

@@ -2001,7 +2031,14 @@ def _pad_to_alignment(raw_bytes: bytes) -> bytes:
             use_relative_path=use_relative_path,
         )
 
-        obj_srcs = [wrapper_o, kernel_o, consts_o, *gpu_kernels_o, *cubins_o]
+        obj_srcs = [
+            wrapper_o,
+            kernel_o,
+            consts_o,
+            *gpu_kernels_o,
+            *cubins_o,
+            *cuda_utils_o,
+        ]
         so_builder = CppBuilder(
             name=output_name,
             sources=obj_srcs,

@@ -2096,7 +2133,7 @@ def _pad_to_alignment(raw_bytes: bytes) -> bytes:
 @clear_on_fresh_inductor_cache
 @functools.lru_cache
 def cpp_prefix_path() -> str:
-    path = Path(__file__).parent / "codegen/cpp_prefix.h"
+    path = Path(__file__).parent / "codegen" / "cpp_prefix.h"
     with path.open() as f:
         content = f.read()
     _, filename = write(

@@ -2571,7 +2608,11 @@ class CppWrapperCodeCache(CppPythonBindingsCodeCache):
     call_entry_function = "return inductor_entry_cpp({});"
     extra_parse_arg = textwrap.dedent(
         """
+        #ifdef AOTI_STANDALONE
+        #include <torch/csrc/inductor/aoti_standalone/c/shim.h>
+        #else
         #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+        #endif // AOTI_STANDALONE
 
         static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {{
             std::vector<AtenTensorHandle> result;

@@ -3215,7 +3256,7 @@ def _nvcc_host_compiler_options() -> list[str]:
     ]
 
 
-def _nvcc_compiler_options() -> list[str]:
+def _nvcc_get_arch_option() -> str:
     arch = cuda_env.get_cuda_arch()
     if arch == "90":
         # Required by cutlass compilation.

@@ -3225,13 +3266,17 @@ def _nvcc_compiler_options() -> list[str]:
     code = [f"sm_{arch}", f"compute_{arch}"]
     if config.cuda.enable_cuda_lto:
         code += [f"lto_{arch}"]
+    return f"gencode=arch=compute_{arch},code=[{','.join(code)}]"
+
+
+def _nvcc_compiler_options() -> list[str]:
     options = [
         "-t=0",
         "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
         "-DCUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES=1",
         "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
         "-w",
-        f"-gencode=arch=compute_{arch},code=[{','.join(code)}]",
+        f"-{_nvcc_get_arch_option()}",
         config.cuda.compile_opt_level,
         "-std=c++17",
         "--expt-relaxed-constexpr",

torch/_inductor/codegen/cpp_prefix.h (+4)

@@ -29,7 +29,11 @@
 #include <c10/util/irange.h>
 #include <c10/util/Half.h>
 #include <c10/util/TypeCast.h>
+#ifdef AOTI_STANDALONE
+#include <torch/csrc/inductor/aoti_standalone/c/shim.h>
+#else
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+#endif // AOTI_STANDALONE
 
 #if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_SVE256)
 #define INDUCTOR_USE_VECTOR_TYPES() 1

torch/_inductor/codegen/cpp_template.py (+7 -2)

@@ -124,8 +124,13 @@ def header(self) -> IndentedBuffer:
         res = IndentedBuffer()
         res.writeline(codecache.cpp_prefix())
         # TODO: add c10::ForcedUnroll test to test_aoti_abi_check
-        res.splice("""#include <c10/util/Unroll.h>""")
-        res.splice("""#include <torch/csrc/inductor/aoti_torch/c/shim.h>""")
+        res.splice("""
+        #include <c10/util/Unroll.h>
+        #ifdef AOTI_STANDALONE
+        #include <torch/csrc/inductor/aoti_standalone/c/shim.h>
+        #else
+        #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+        #endif // AOTI_STANDALONE""")
         enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [
             "linux",
             "win32",

torch/_inductor/codegen/cpp_wrapper_cpu.py (+5 -1)

@@ -172,7 +172,9 @@ def add_device_include(self, device: str) -> None:
         # present.
         self.header.splice(self.get_device_include_path(device))
         extend_aoti_c_shim_include = (
-            f"torch/csrc/inductor/aoti_torch/generated/extend/c_shim_{self.device}.h"
+            f"torch/csrc/inductor/aoti_standalone/{self.device}/c_shim_{self.device}.h"
+            if config.aot_inductor.codegen_standalone
+            else f"torch/csrc/inductor/aoti_torch/generated/extend/c_shim_{self.device}.h"
         )
         extend_aoti_c_shim_path = os.path.join(
             os.path.dirname(torch.__file__),

@@ -942,6 +944,8 @@ def finalize_prefix(self):
             self.codegen_const_run_driver()
         aot_mode_decls.writeline("} // namespace torch::aot_inductor")
         aot_mode_decls.writeline("using namespace torch::aot_inductor;")
+        if config.aot_inductor.codegen_standalone:
+            aot_mode_decls.writeline("using namespace torch::standalone;")
 
         self.prefix = cache_decls = IndentedBuffer()
         for dtype in self.used_cached_dtypes:

torch/_inductor/config.py (+6 -1)

@@ -1250,7 +1250,7 @@ class aot_inductor:
     force_mmap_weights: bool = False
 
     package: bool = False
-    package_cpp_only: bool = False
+    package_cpp_only: bool = os.environ.get("AOT_INDUCTOR_PACKAGE_CPP_ONLY", "0") == "1"
 
     # Dictionary of metadata users might want to save to pass to the runtime.
     # TODO: Move this somewhere else, since it's no longer really a config

@@ -1295,6 +1295,11 @@ class aot_inductor:
     # Experimental. Controls automatic precompiling of common AOTI include files.
     precompile_headers: bool = False
 
+    # Experimental. Controls whether to generate model code in a standalone way.
+    codegen_standalone: bool = (
+        os.environ.get("AOT_INDUCTOR_CODEGEN_STANDALONE", "0") == "1"
+    )
+
     # Embed generated .cubin files into the .so
     embed_cubin: bool = False
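Taken together: the new codegen_standalone knob is what the standalone test file above toggles per-test via config.patch, and it can also be flipped process-wide with AOT_INDUCTOR_CODEGEN_STANDALONE=1. A hedged end-to-end sketch using the standard AOTI entry points (torch.export.export, aoti_compile_and_package, and aoti_load_package are pre-existing API, not part of this commit; treat get_metadata() availability as an assumption):

import torch
from torch._inductor import config


class Model(torch.nn.Module):
    def forward(self, x):
        return torch.cos(x)


example_inputs = (torch.randn(16, 10),)

# Compile with libtorch-free (standalone) code generation enabled.
with config.patch({"aot_inductor.codegen_standalone": True}):
    ep = torch.export.export(Model(), example_inputs)
    pkg = torch._inductor.aoti_compile_and_package(ep)

# The commit stamps "STANDALONE" into the saved metadata (see the
# codecache.py hunk above), so it should be readable after loading.
compiled = torch._inductor.aoti_load_package(pkg)
print(compiled.get_metadata().get("STANDALONE"))  # expected: "1"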

0 commit comments