From bbef6cacef793c691e8f2df6849f63867e0c0377 Mon Sep 17 00:00:00 2001 From: Benjamin Glass Date: Fri, 7 Mar 2025 18:35:57 +0000 Subject: [PATCH 1/2] Update [ghstack-poisoned] --- torch/_inductor/codecache.py | 153 +++++++++++++++------ torch/_inductor/codegen/cpp_wrapper_cpu.py | 26 ++-- torch/_inductor/graph.py | 22 +-- 3 files changed, 143 insertions(+), 58 deletions(-) diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 0ea5a7155d8c..a0fe062862d0 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -126,6 +126,7 @@ def use_global_cache() -> bool: # type: ignore[misc] from concurrent.futures import Future from .compile_fx import _CompileFxKwargs, CompiledFxGraph + from .cpp_builder import BuildOptionsBase from .graph import GraphLowering from .ir import ChoiceCaller from .output_code import CompiledFxGraphConstants, OutputCode @@ -1968,7 +1969,7 @@ def _get_file_checksum(filename: str) -> str: os.makedirs(_HEADER_LOCK_DIR, exist_ok=True) _worker_compile_cpp( os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"), - cpp_builder, + (cpp_builder,), ) return header_full_path @@ -2035,10 +2036,11 @@ def _get_uncompiled_header(cls, device: str) -> str | None: @classmethod def load_async( cls, - source_code: str, + main_code: str, device_type: str = "cpu", submit_fn: Any = None, extra_flags: Sequence[str] = (), + optimized_code: Optional[str] = None, ) -> Any: compile_command = { **cls.cpp_compile_command_flags, @@ -2050,46 +2052,112 @@ def load_async( _set_gpu_runtime_env() # cpp_extension consults the env - cpp_build_option = CppTorchDeviceOptions(**compile_command) - command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option) - # write function will calc source_code hash, the same source code with different - # ISA level should be generate different hash. - # So we need get a command_line which contains isa related parameter as a part of hash key. - # And then pass the command_line to below write function as extra parameter to - # guarantee the source code hash contains ISA difference. - vec_isa_cmd = repr(command_gen.get_command_line()) - key, input_path = write(source_code, "cpp", extra=vec_isa_cmd) + # Note the distinction between the two booleans. We do minimal optimization if + # the optimized_code argument is present at all, since that's how the user of + # this function opts in, but we do compilation and linking in one step if the + # optimized_code argument is empty (as a micro-optimization). + main_build_option = CppTorchDeviceOptions( + compile_only=bool(optimized_code), + min_optimize=optimized_code is not None, + **compile_command, + ) + optimized_build_option = CppTorchDeviceOptions( + compile_only=True, **compile_command + ) + + def get_hashable_command_line(build_option: BuildOptionsBase) -> str: + """Writing the code to file will calculate a hash, which we need to vary if + the command line flags change. This implements a mostly-generic way of + validating that.""" + return CppBuilder( + name="o", sources="i", BuildOption=build_option + ).get_command_line() + + main_cmd_line = get_hashable_command_line(main_build_option) + optimized_cmd_line = get_hashable_command_line(optimized_build_option) + + key, main_path = write( + main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}" + ) + + # Don't bother writing if the argument is empty. + if optimized_code: + _, optimized_path = write( + optimized_code, "optimized.cpp", extra=optimized_cmd_line + ) + else: + # Unused, but makes type checkers happy. 
+            optimized_path = os.devnull
 
         if key not in cls.cache:
             from torch.utils._filelock import FileLock
 
             lock_path = os.path.join(get_lock_dir(), key + ".lock")
-            output_name, output_dir = get_name_and_dir_from_output_file_path(input_path)
             future: Optional[Future[Any]] = None
             lib = None
 
             # if requested, pre-compile any headers
-            if config.cpp_cache_precompile_headers and (
-                header_file := cls._get_uncompiled_header(device_type)
-            ):
-                cpp_build_option.precompiled_header = _precompile_header(
-                    header_file,
-                    vec_isa_cmd,
-                    **compile_command,
-                )
+            if config.cpp_cache_precompile_headers:
+                if header := cls._get_uncompiled_header(device_type):
+                    main_build_option.precompiled_header = _precompile_header(
+                        header,
+                        main_cmd_line,
+                        min_optimize=optimized_code is not None,
+                        **compile_command,
+                    )
 
-            cpp_builder = CppBuilder(
-                name=output_name,
-                sources=input_path,
+                # Currently, the optimized_code field is only used for cpp kernel code,
+                # so go ahead and precompile the relevant header here. Revisit this
+                # decision if that ever changes.
+                if optimized_code and (header := _get_cpp_prefix_header(device_type)):
+                    optimized_build_option.precompiled_header = _precompile_header(
+                        header,
+                        optimized_cmd_line,
+                        **compile_command,
+                    )
+
+            main_name, output_dir = get_name_and_dir_from_output_file_path(main_path)
+            main_builder = CppBuilder(
+                name=main_name,
+                sources=main_path,
+                BuildOption=main_build_option,
                 output_dir=output_dir,
-                BuildOption=cpp_build_option,
-            )
-            worker_fn = functools.partial(
-                _worker_compile_cpp,
-                lock_path,
-                cpp_builder,
             )
-            binary_path = normalize_path_separator(cpp_builder.get_target_file_path())
+
+            if optimized_code:
+                optimized_name, _ = get_name_and_dir_from_output_file_path(
+                    optimized_path
+                )
+                optimized_builder = CppBuilder(
+                    name=optimized_name,
+                    sources=optimized_path,
+                    BuildOption=optimized_build_option,
+                    output_dir=output_dir,
+                )
+
+                linker = CppBuilder(
+                    name=main_name,
+                    sources=[
+                        main_builder.get_target_file_path(),
+                        optimized_builder.get_target_file_path(),
+                    ],
+                    BuildOption=CppTorchDeviceOptions(**compile_command),
+                    output_dir=output_dir,
+                )
+
+                worker_fn = functools.partial(
+                    _worker_compile_cpp,
+                    lock_path,
+                    (main_builder, optimized_builder, linker),
+                )
+                binary_path = normalize_path_separator(linker.get_target_file_path())
+            else:
+                worker_fn = functools.partial(
+                    _worker_compile_cpp, lock_path, (main_builder,)
+                )
+                binary_path = normalize_path_separator(
+                    main_builder.get_target_file_path()
+                )
 
         def load_fn() -> Any:
             nonlocal lib
@@ -2112,19 +2180,20 @@ def load_fn() -> Any:
         return cls.cache[key]
 
     @classmethod
-    def load(cls, source_code: str, device_type: str = "cpu") -> Any:
-        return cls.load_async(source_code, device_type)()
+    def load(cls, *args: Any, **kwargs: Any) -> Any:
+        return cls.load_async(*args, **kwargs)()
 
 
 def _worker_compile_cpp(
     lock_path: str,
-    cpp_builder: CppBuilder,
+    cpp_builders: Sequence[CppBuilder],
 ) -> None:
     from torch.utils._filelock import FileLock
 
     with FileLock(lock_path, timeout=LOCK_TIMEOUT):
-        if not os.path.exists(cpp_builder.get_target_file_path()):
-            cpp_builder.build()
+        for builder in cpp_builders:
+            if not os.path.exists(builder.get_target_file_path()):
+                builder.build()
 
 
 # Customized Python binding for cpp kernels
@@ -2253,19 +2322,24 @@ def _get_uncompiled_header(cls, device: str) -> str | None:
     @classmethod
     def load_pybinding_async(
         cls,
-        argtypes: list[str],
-        source_code: str,
+        argtypes: Sequence[str],
+        main_code: str,
         device_type: str = "cpu",
         num_outputs: int = -1,
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        kernel_code: Optional[str] = None,
     ) -> Any:
         """
         Wrap a C++ function in fast Python bindings.
 
         Args:
             argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
-            source_code: C++ source code containing a ENTRY_FUNCTION() function
+            main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at
+                -O3 if kernel_code is None (to maximize performance in any kernels that
+                are present), or -O1 otherwise (to minimize compile time).
+            kernel_code: If present, C++ source code that will be built at -O3 and
+                linked to main_code.
 
         Returns:
             A python version of ENTRY_FUNCTION()
@@ -2287,10 +2361,11 @@ def load_pybinding_async(
             cls.entry_function,
         )
         get_result = cls.load_async(
-            source_code + suffix,
+            main_code + suffix,
             device_type,
             submit_fn=submit_fn,
             extra_flags=extra_flags,
+            optimized_code=kernel_code,
         )
 
         result = None
diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py
index 7a6d87193a43..a2b0c9f126d0 100644
--- a/torch/_inductor/codegen/cpp_wrapper_cpu.py
+++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py
@@ -1011,19 +1011,29 @@ def generate_end(self, result):
             result.writeline("} // namespace torch::aot_inductor\n\n\n")
             return
 
-        # Add any kernel definitions into the wrapped code. We currently only build
-        # them in separate files in AOT mode.
-        result.splice(self.kernel_declarations.getvalue())
-        self.kernel_declarations.clear()
+        # Close the wrapper code block, then write any kernel definitions.
+        result.splice("'''\n)")
+        if self.kernel_declarations:
+            result.splice("\nkernel_src = (\nr'''")
+            result.splice(self.kernel_declarations.getvalue())
+            result.splice("'''\n)")
+        else:
+            result.splice(
+                """
+                kernel_src = ''
+                """
+            )
 
         # cpp entry function for JIT with cpp wrapper
         result.splice(
             f"""
-            '''
-            )
-
             inductor_entry = CppWrapperCodeCache.load_pybinding(
-                ["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
+                argtypes=["std::vector<AtenTensorHandle>"],
+                main_code=cpp_wrapper_src,
+                device_type="{self.device}",
+                num_outputs={len(V.graph.graph_outputs)},
+                kernel_code=kernel_src,
+            )
             """
         )
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 399a8c06e0f3..0c0729a4db03 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -2085,9 +2085,9 @@ def compile_to_module(self) -> ModuleType:
     def _compile_to_module(self) -> ModuleType:
        from .codecache import PyCodeCache
 
-        # Currently, if we're here, we don't have to worry about the kernel code, which
-        # is only available in AOTInductor mode.
-        wrapper_code, _ = (
+        # If we're here, we don't have to worry about the kernel code, which is only
+        # returned separately in AOTInductor mode.
+ src_code, _ = ( self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() ) if config.triton.autotune_at_compile_time: @@ -2098,33 +2098,33 @@ def _compile_to_module(self) -> ModuleType: + self.wrapper_code.kernel_autotune_calls.getvalue() + '"""\n' ) - wrapper_code.value = tuning_code + wrapper_code.value + src_code.value = tuning_code + src_code.value if GraphLowering.save_output_code is not None: - GraphLowering.save_output_code(wrapper_code.value) - output_code_log.debug("Output code: \n%s", wrapper_code.value) + GraphLowering.save_output_code(src_code.value) + output_code_log.debug("Output code: \n%s", src_code.value) inductor_meta = autotune_cache.inductor_meta_from_config() - AutotuneCacheBundler.begin_compile(inductor_meta, code=wrapper_code.value) + AutotuneCacheBundler.begin_compile(inductor_meta, code=src_code.value) try: linemap = [ (line_no, node.stack_trace) # type: ignore[attr-defined] - for line_no, node in wrapper_code.line_map + for line_no, node in src_code.line_map ] - key, path = PyCodeCache.write(wrapper_code.value) + key, path = PyCodeCache.write(src_code.value) output_code_log.debug("Output code written to: %s", path) except Exception: trace_structured( "inductor_output_code", # Just omit the filename, I still want the code though! - payload_fn=lambda: wrapper_code.value, + payload_fn=lambda: src_code.value, ) raise else: trace_structured( "inductor_output_code", lambda: {"filename": path}, - payload_fn=lambda: wrapper_code.value, + payload_fn=lambda: src_code.value, ) with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True): mod = PyCodeCache.load_by_key_path( From c3aadf00470598ebeff4df591e4955b597797bec Mon Sep 17 00:00:00 2001 From: Benjamin Glass Date: Mon, 10 Mar 2025 16:24:05 +0000 Subject: [PATCH 2/2] Update [ghstack-poisoned] --- torch/_inductor/graph.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 0c0729a4db03..15ed8bc7bc40 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -2087,7 +2087,7 @@ def _compile_to_module(self) -> ModuleType: # If we're here, we don't have to worry about the kernel code, which is only # returned separately in AOTInductor mode. 
- src_code, _ = ( + wrapper_code, _ = ( self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() ) if config.triton.autotune_at_compile_time: @@ -2098,33 +2098,33 @@ def _compile_to_module(self) -> ModuleType: + self.wrapper_code.kernel_autotune_calls.getvalue() + '"""\n' ) - src_code.value = tuning_code + src_code.value + wrapper_code.value = tuning_code + wrapper_code.value if GraphLowering.save_output_code is not None: - GraphLowering.save_output_code(src_code.value) - output_code_log.debug("Output code: \n%s", src_code.value) + GraphLowering.save_output_code(wrapper_code.value) + output_code_log.debug("Output code: \n%s", wrapper_code.value) inductor_meta = autotune_cache.inductor_meta_from_config() - AutotuneCacheBundler.begin_compile(inductor_meta, code=src_code.value) + AutotuneCacheBundler.begin_compile(inductor_meta, code=wrapper_code.value) try: linemap = [ (line_no, node.stack_trace) # type: ignore[attr-defined] - for line_no, node in src_code.line_map + for line_no, node in wrapper_code.line_map ] - key, path = PyCodeCache.write(src_code.value) + key, path = PyCodeCache.write(wrapper_code.value) output_code_log.debug("Output code written to: %s", path) except Exception: trace_structured( "inductor_output_code", # Just omit the filename, I still want the code though! - payload_fn=lambda: src_code.value, + payload_fn=lambda: wrapper_code.value, ) raise else: trace_structured( "inductor_output_code", lambda: {"filename": path}, - payload_fn=lambda: src_code.value, + payload_fn=lambda: wrapper_code.value, ) with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True): mod = PyCodeCache.load_by_key_path(
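
Note on the split-compilation flow introduced in PATCH 1/2: load_async() now
compiles the wrapper ("main") translation unit at minimal optimization and the
kernel ("optimized") translation unit at full optimization, then links the two
objects. The sketch below illustrates that pattern outside of Inductor, using
plain g++ invocations via subprocess; the file names, compiler choice, and flag
sets are hypothetical stand-ins for whatever CppTorchDeviceOptions/CppBuilder
would actually emit.

    import subprocess
    from pathlib import Path

    def build_split(main_cpp: Path, kernel_cpp: Path, out_so: Path) -> None:
        """Compile main at -O1 (fast build), kernels at -O3 (fast code), then link."""
        main_o = main_cpp.with_suffix(".o")
        kernel_o = kernel_cpp.with_suffix(".o")
        # Wrapper code is rarely hot, so prioritize compile time
        # (cf. min_optimize=True on main_build_option).
        subprocess.check_call(
            ["g++", "-c", "-fPIC", "-O1", str(main_cpp), "-o", str(main_o)]
        )
        # Kernel code is performance-critical, so pay for full optimization
        # (cf. optimized_build_option with compile_only=True).
        subprocess.check_call(
            ["g++", "-c", "-fPIC", "-O3", str(kernel_cpp), "-o", str(kernel_o)]
        )
        # Link both objects into one shared library, as the final `linker`
        # CppBuilder does in the builder sequence run by _worker_compile_cpp.
        subprocess.check_call(
            ["g++", "-shared", str(main_o), str(kernel_o), "-o", str(out_so)]
        )

Per the comment in load_async(): when optimized_code is None, the main source
is instead built in a single fully optimized compile-and-link step; when it is
an empty (but non-None) string, the main source is still built at minimal
optimization, but the separate kernel object and explicit link step are skipped
as a micro-optimization.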