From bbef6cacef793c691e8f2df6849f63867e0c0377 Mon Sep 17 00:00:00 2001 From: Benjamin Glass Date: Fri, 7 Mar 2025 18:35:57 +0000 Subject: [PATCH 1/2] Update [ghstack-poisoned] --- torch/_inductor/codecache.py | 153 +++++++++++++++------ torch/_inductor/codegen/cpp_wrapper_cpu.py | 26 ++-- torch/_inductor/graph.py | 22 +-- 3 files changed, 143 insertions(+), 58 deletions(-) diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 0ea5a7155d8c..a0fe062862d0 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -126,6 +126,7 @@ def use_global_cache() -> bool: # type: ignore[misc] from concurrent.futures import Future from .compile_fx import _CompileFxKwargs, CompiledFxGraph + from .cpp_builder import BuildOptionsBase from .graph import GraphLowering from .ir import ChoiceCaller from .output_code import CompiledFxGraphConstants, OutputCode @@ -1968,7 +1969,7 @@ def _get_file_checksum(filename: str) -> str: os.makedirs(_HEADER_LOCK_DIR, exist_ok=True) _worker_compile_cpp( os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"), - cpp_builder, + (cpp_builder,), ) return header_full_path @@ -2035,10 +2036,11 @@ def _get_uncompiled_header(cls, device: str) -> str | None: @classmethod def load_async( cls, - source_code: str, + main_code: str, device_type: str = "cpu", submit_fn: Any = None, extra_flags: Sequence[str] = (), + optimized_code: Optional[str] = None, ) -> Any: compile_command = { **cls.cpp_compile_command_flags, @@ -2050,46 +2052,112 @@ def load_async( _set_gpu_runtime_env() # cpp_extension consults the env - cpp_build_option = CppTorchDeviceOptions(**compile_command) - command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option) - # write function will calc source_code hash, the same source code with different - # ISA level should be generate different hash. - # So we need get a command_line which contains isa related parameter as a part of hash key. - # And then pass the command_line to below write function as extra parameter to - # guarantee the source code hash contains ISA difference. - vec_isa_cmd = repr(command_gen.get_command_line()) - key, input_path = write(source_code, "cpp", extra=vec_isa_cmd) + # Note the distinction between the two booleans. We do minimal optimization if + # the optimized_code argument is present at all, since that's how the user of + # this function opts in, but we do compilation and linking in one step if the + # optimized_code argument is empty (as a micro-optimization). + main_build_option = CppTorchDeviceOptions( + compile_only=bool(optimized_code), + min_optimize=optimized_code is not None, + **compile_command, + ) + optimized_build_option = CppTorchDeviceOptions( + compile_only=True, **compile_command + ) + + def get_hashable_command_line(build_option: BuildOptionsBase) -> str: + """Writing the code to file will calculate a hash, which we need to vary if + the command line flags change. This implements a mostly-generic way of + validating that.""" + return CppBuilder( + name="o", sources="i", BuildOption=build_option + ).get_command_line() + + main_cmd_line = get_hashable_command_line(main_build_option) + optimized_cmd_line = get_hashable_command_line(optimized_build_option) + + key, main_path = write( + main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}" + ) + + # Don't bother writing if the argument is empty. + if optimized_code: + _, optimized_path = write( + optimized_code, "optimized.cpp", extra=optimized_cmd_line + ) + else: + # Unused, but makes type checkers happy. 
+            optimized_path = os.devnull
 
         if key not in cls.cache:
             from torch.utils._filelock import FileLock
 
             lock_path = os.path.join(get_lock_dir(), key + ".lock")
-            output_name, output_dir = get_name_and_dir_from_output_file_path(input_path)
             future: Optional[Future[Any]] = None
             lib = None
 
             # if requested, pre-compile any headers
-            if config.cpp_cache_precompile_headers and (
-                header_file := cls._get_uncompiled_header(device_type)
-            ):
-                cpp_build_option.precompiled_header = _precompile_header(
-                    header_file,
-                    vec_isa_cmd,
-                    **compile_command,
-                )
+            if config.cpp_cache_precompile_headers:
+                if header := cls._get_uncompiled_header(device_type):
+                    main_build_option.precompiled_header = _precompile_header(
+                        header,
+                        main_cmd_line,
+                        min_optimize=optimized_code is not None,
+                        **compile_command,
+                    )
 
-            cpp_builder = CppBuilder(
-                name=output_name,
-                sources=input_path,
+                # Currently, the optimized_code field is only used for cpp kernel code,
+                # so go ahead and precompile the relevant header here. Revisit this
+                # decision if that ever changes.
+                if optimized_code and (header := _get_cpp_prefix_header(device_type)):
+                    optimized_build_option.precompiled_header = _precompile_header(
+                        header,
+                        optimized_cmd_line,
+                        **compile_command,
+                    )
+
+            main_name, output_dir = get_name_and_dir_from_output_file_path(main_path)
+            main_builder = CppBuilder(
+                name=main_name,
+                sources=main_path,
+                BuildOption=main_build_option,
                 output_dir=output_dir,
-                BuildOption=cpp_build_option,
-            )
-            worker_fn = functools.partial(
-                _worker_compile_cpp,
-                lock_path,
-                cpp_builder,
             )
-            binary_path = normalize_path_separator(cpp_builder.get_target_file_path())
+
+            if optimized_code:
+                optimized_name, _ = get_name_and_dir_from_output_file_path(
+                    optimized_path
+                )
+                optimized_builder = CppBuilder(
+                    name=optimized_name,
+                    sources=optimized_path,
+                    BuildOption=optimized_build_option,
+                    output_dir=output_dir,
+                )
+
+                linker = CppBuilder(
+                    name=main_name,
+                    sources=[
+                        main_builder.get_target_file_path(),
+                        optimized_builder.get_target_file_path(),
+                    ],
+                    BuildOption=CppTorchDeviceOptions(**compile_command),
+                    output_dir=output_dir,
+                )
+
+                worker_fn = functools.partial(
+                    _worker_compile_cpp,
+                    lock_path,
+                    (main_builder, optimized_builder, linker),
+                )
+                binary_path = normalize_path_separator(linker.get_target_file_path())
+            else:
+                worker_fn = functools.partial(
+                    _worker_compile_cpp, lock_path, (main_builder,)
+                )
+                binary_path = normalize_path_separator(
+                    main_builder.get_target_file_path()
+                )
 
         def load_fn() -> Any:
             nonlocal lib
@@ -2112,19 +2180,20 @@ def load_fn() -> Any:
         return cls.cache[key]
 
     @classmethod
-    def load(cls, source_code: str, device_type: str = "cpu") -> Any:
-        return cls.load_async(source_code, device_type)()
+    def load(cls, *args: Any, **kwargs: Any) -> Any:
+        return cls.load_async(*args, **kwargs)()
 
 
 def _worker_compile_cpp(
     lock_path: str,
-    cpp_builder: CppBuilder,
+    cpp_builders: Sequence[CppBuilder],
 ) -> None:
     from torch.utils._filelock import FileLock
 
     with FileLock(lock_path, timeout=LOCK_TIMEOUT):
-        if not os.path.exists(cpp_builder.get_target_file_path()):
-            cpp_builder.build()
+        for builder in cpp_builders:
+            if not os.path.exists(builder.get_target_file_path()):
+                builder.build()
 
 
 # Customized Python binding for cpp kernels
@@ -2253,19 +2322,24 @@ def _get_uncompiled_header(cls, device: str) -> str | None:
     @classmethod
     def load_pybinding_async(
         cls,
-        argtypes: list[str],
-        source_code: str,
+        argtypes: Sequence[str],
+        main_code: str,
         device_type: str = "cpu",
         num_outputs: int = -1,
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        kernel_code: Optional[str] = None,
     ) -> Any:
         """
         Wrap a C++ function in fast Python bindings.
 
         Args:
             argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
-            source_code: C++ source code containing a ENTRY_FUNCTION() function
+            main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at
+                -O3 if kernel_code is None (to maximize performance in any kernels that
+                are present), or -O1 otherwise (to minimize compile time).
+            kernel_code: If present, C++ source code that will be built at -O3 and
+                linked to main_code.
 
         Returns:
             A python version of ENTRY_FUNCTION()
@@ -2287,10 +2361,11 @@ def load_pybinding_async(
             cls.entry_function,
         )
         get_result = cls.load_async(
-            source_code + suffix,
+            main_code + suffix,
             device_type,
             submit_fn=submit_fn,
             extra_flags=extra_flags,
+            optimized_code=kernel_code,
         )
 
         result = None
diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py
index 7a6d87193a43..a2b0c9f126d0 100644
--- a/torch/_inductor/codegen/cpp_wrapper_cpu.py
+++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py
@@ -1011,19 +1011,29 @@ def generate_end(self, result):
             result.writeline("} // namespace torch::aot_inductor\n\n\n")
             return
 
-        # Add any kernel definitions into the wrapped code. We currently only build
-        # them in separate files in AOT mode.
-        result.splice(self.kernel_declarations.getvalue())
-        self.kernel_declarations.clear()
+        # Close the wrapper code block, then write any kernel definitions.
+        result.splice("'''\n)")
+        if self.kernel_declarations:
+            result.splice("\nkernel_src = (\nr'''")
+            result.splice(self.kernel_declarations.getvalue())
+            result.splice("'''\n)")
+        else:
+            result.splice(
+                """
+                kernel_src = ''
+                """
+            )
 
         # cpp entry function for JIT with cpp wrapper
         result.splice(
             f"""
-            '''
-            )
-
             inductor_entry = CppWrapperCodeCache.load_pybinding(
-                ["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
+                argtypes=["std::vector<AtenTensorHandle>"],
+                main_code=cpp_wrapper_src,
+                device_type="{self.device}",
+                num_outputs={len(V.graph.graph_outputs)},
+                kernel_code=kernel_src,
+            )
             """
         )
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 399a8c06e0f3..0c0729a4db03 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -2085,9 +2085,9 @@ def compile_to_module(self) -> ModuleType:
     def _compile_to_module(self) -> ModuleType:
        from .codecache import PyCodeCache
 
-        # Currently, if we're here, we don't have to worry about the kernel code, which
-        # is only available in AOTInductor mode.
-        wrapper_code, _ = (
+        # If we're here, we don't have to worry about the kernel code, which is only
+        # returned separately in AOTInductor mode.
+ src_code, _ = ( self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() ) if config.triton.autotune_at_compile_time: @@ -2098,33 +2098,33 @@ def _compile_to_module(self) -> ModuleType: + self.wrapper_code.kernel_autotune_calls.getvalue() + '"""\n' ) - wrapper_code.value = tuning_code + wrapper_code.value + src_code.value = tuning_code + src_code.value if GraphLowering.save_output_code is not None: - GraphLowering.save_output_code(wrapper_code.value) - output_code_log.debug("Output code: \n%s", wrapper_code.value) + GraphLowering.save_output_code(src_code.value) + output_code_log.debug("Output code: \n%s", src_code.value) inductor_meta = autotune_cache.inductor_meta_from_config() - AutotuneCacheBundler.begin_compile(inductor_meta, code=wrapper_code.value) + AutotuneCacheBundler.begin_compile(inductor_meta, code=src_code.value) try: linemap = [ (line_no, node.stack_trace) # type: ignore[attr-defined] - for line_no, node in wrapper_code.line_map + for line_no, node in src_code.line_map ] - key, path = PyCodeCache.write(wrapper_code.value) + key, path = PyCodeCache.write(src_code.value) output_code_log.debug("Output code written to: %s", path) except Exception: trace_structured( "inductor_output_code", # Just omit the filename, I still want the code though! - payload_fn=lambda: wrapper_code.value, + payload_fn=lambda: src_code.value, ) raise else: trace_structured( "inductor_output_code", lambda: {"filename": path}, - payload_fn=lambda: wrapper_code.value, + payload_fn=lambda: src_code.value, ) with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True): mod = PyCodeCache.load_by_key_path( From c3aadf00470598ebeff4df591e4955b597797bec Mon Sep 17 00:00:00 2001 From: Benjamin Glass Date: Mon, 10 Mar 2025 16:24:05 +0000 Subject: [PATCH 2/2] Update [ghstack-poisoned] --- torch/_inductor/graph.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 0c0729a4db03..15ed8bc7bc40 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -2087,7 +2087,7 @@ def _compile_to_module(self) -> ModuleType: # If we're here, we don't have to worry about the kernel code, which is only # returned separately in AOTInductor mode. 
- src_code, _ = ( + wrapper_code, _ = ( self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() ) if config.triton.autotune_at_compile_time: @@ -2098,33 +2098,33 @@ def _compile_to_module(self) -> ModuleType: + self.wrapper_code.kernel_autotune_calls.getvalue() + '"""\n' ) - src_code.value = tuning_code + src_code.value + wrapper_code.value = tuning_code + wrapper_code.value if GraphLowering.save_output_code is not None: - GraphLowering.save_output_code(src_code.value) - output_code_log.debug("Output code: \n%s", src_code.value) + GraphLowering.save_output_code(wrapper_code.value) + output_code_log.debug("Output code: \n%s", wrapper_code.value) inductor_meta = autotune_cache.inductor_meta_from_config() - AutotuneCacheBundler.begin_compile(inductor_meta, code=src_code.value) + AutotuneCacheBundler.begin_compile(inductor_meta, code=wrapper_code.value) try: linemap = [ (line_no, node.stack_trace) # type: ignore[attr-defined] - for line_no, node in src_code.line_map + for line_no, node in wrapper_code.line_map ] - key, path = PyCodeCache.write(src_code.value) + key, path = PyCodeCache.write(wrapper_code.value) output_code_log.debug("Output code written to: %s", path) except Exception: trace_structured( "inductor_output_code", # Just omit the filename, I still want the code though! - payload_fn=lambda: src_code.value, + payload_fn=lambda: wrapper_code.value, ) raise else: trace_structured( "inductor_output_code", lambda: {"filename": path}, - payload_fn=lambda: src_code.value, + payload_fn=lambda: wrapper_code.value, ) with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True): mod = PyCodeCache.load_by_key_path(
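
Note on the split-compilation flow introduced in PATCH 1/2: load_async() now
compiles the wrapper ("main") translation unit at minimal optimization and the
kernel ("optimized") translation unit at full optimization, then links the two
objects. The sketch below illustrates that pattern outside of Inductor, using
plain g++ invocations via subprocess; the file names, compiler choice, and flag
sets are hypothetical stand-ins for whatever CppTorchDeviceOptions/CppBuilder
would actually emit.

    import subprocess
    from pathlib import Path

    def build_split(main_cpp: Path, kernel_cpp: Path, out_so: Path) -> None:
        """Compile main at -O1 (fast build), kernels at -O3 (fast code), then link."""
        main_o = main_cpp.with_suffix(".o")
        kernel_o = kernel_cpp.with_suffix(".o")
        # Wrapper code is rarely hot, so prioritize compile time
        # (cf. min_optimize=True on main_build_option).
        subprocess.check_call(
            ["g++", "-c", "-fPIC", "-O1", str(main_cpp), "-o", str(main_o)]
        )
        # Kernel code is performance-critical, so pay for full optimization
        # (cf. optimized_build_option with compile_only=True).
        subprocess.check_call(
            ["g++", "-c", "-fPIC", "-O3", str(kernel_cpp), "-o", str(kernel_o)]
        )
        # Link both objects into one shared library, as the final `linker`
        # CppBuilder does in the builder sequence run by _worker_compile_cpp.
        subprocess.check_call(
            ["g++", "-shared", str(main_o), str(kernel_o), "-o", str(out_so)]
        )

Per the comment in load_async(): when optimized_code is None, the main source
is instead built in a single fully optimized compile-and-link step; when it is
an empty (but non-None) string, the main source is still built at minimal
optimization, but the separate kernel object and explicit link step are skipped
as a micro-optimization.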