cpp_wrapper: build non-performance-sensitive code at O1 · Pull Request #148773 · pytorch/pytorch

Closed

Changes from all commits · 40 commits
bbef6ca · Update · benjaminglass1, Mar 7, 2025
861c944 · Update · benjaminglass1, Mar 10, 2025
c3aadf0 · Update · benjaminglass1, Mar 10, 2025
346193b · Update · benjaminglass1, Mar 10, 2025
c0aeefc · Update · benjaminglass1, Mar 10, 2025
d3f4c9b · Update · benjaminglass1, Mar 11, 2025
a7c23c3 · Update · benjaminglass1, Mar 11, 2025
77aa708 · Update · benjaminglass1, Mar 11, 2025
e820ce7 · Update · benjaminglass1, Mar 11, 2025
78e4ebf · Update · benjaminglass1, Mar 12, 2025
1297557 · Update · benjaminglass1, Mar 12, 2025
64aca9b · Update · benjaminglass1, Mar 13, 2025
ef1a5b8 · Update · benjaminglass1, Mar 13, 2025
2810db0 · Update · benjaminglass1, Mar 24, 2025
c60363c · Update · benjaminglass1, Mar 27, 2025
e16b5a0 · Update · benjaminglass1, Mar 28, 2025
f3c2261 · Update · benjaminglass1, Apr 4, 2025
2d499ff · Update · benjaminglass1, Apr 8, 2025
a8b4b4a · Update · benjaminglass1, Apr 10, 2025
c6f35c4 · Update · benjaminglass1, Apr 15, 2025
cbe11b8 · Update · benjaminglass1, Apr 15, 2025
af6a247 · Update · benjaminglass1, Apr 16, 2025
42856c5 · Update · benjaminglass1, Apr 16, 2025
0d0d6ae · Update · benjaminglass1, Apr 17, 2025
f900290 · Update · benjaminglass1, Apr 21, 2025
9fcfc80 · Update · benjaminglass1, Apr 30, 2025
9d837fd · Update · benjaminglass1, May 1, 2025
0b7db46 · Update · benjaminglass1, May 2, 2025
a1211f5 · Update · benjaminglass1, May 7, 2025
51a58ab · Update · benjaminglass1, May 10, 2025
2db6819 · Update · benjaminglass1, May 13, 2025
df55464 · Update · benjaminglass1, May 15, 2025
c5a57ac · Update · benjaminglass1, May 15, 2025
3f8f457 · Update · benjaminglass1, May 16, 2025
3a47af7 · Update · benjaminglass1, May 20, 2025
bc7e457 · Update · benjaminglass1, May 20, 2025
b19abc5 · Update · benjaminglass1, May 20, 2025
5cf851b · Update · benjaminglass1, May 21, 2025
3e27ace · Update · benjaminglass1, May 22, 2025
063110f · Update · benjaminglass1, May 22, 2025
30 changes: 23 additions & 7 deletions benchmarks/dynamo/pr_time_benchmarks/expected_results.csv
@@ -1,16 +1,16 @@
-add_loop_eager,compile_time_instruction_count,2987000000,0.015
+add_loop_eager,compile_time_instruction_count,2960000000,0.015



-add_loop_eager_dynamic,compile_time_instruction_count,5928000000,0.025
+add_loop_eager_dynamic,compile_time_instruction_count,5827000000,0.025



 add_loop_inductor,compile_time_instruction_count,29370000000,0.015



-add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44480000000,0.025
+add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44080000000,0.025



@@ -34,15 +34,31 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,10370000



-update_hint_regression,compile_time_instruction_count,1715000000,0.02
+basic_InlineMod_eager,compile_time_instruction_count,7101000000,0.015



-float_args,compile_time_instruction_count,444500000,0.015
+update_hint_regression,compile_time_instruction_count,1683000000,0.02



-sum_floordiv_regression,compile_time_instruction_count,1009000000,0.015
+float_args,compile_time_instruction_count,455100000,0.015



+mm_loop_inductor_gpu,compile_time_instruction_count,4407000000,0.015



+mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,7381000000,0.015



+basic_NestedModule_eager,compile_time_instruction_count,8241000000,0.015



+sum_floordiv_regression,compile_time_instruction_count,1000000000,0.015



@@ -66,7 +82,7 @@ aotdispatcher_partitioner_cpu,compile_time_instruction_count,8630000000,0.015



-aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1900000000,0.015
+aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1890000000,0.015


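For context on how these rows are consumed: each populated line pairs a benchmark with an expected compile_time_instruction_count and what appears to be a relative noise tolerance in the last column. A minimal sketch of that check, under that assumption (`check_result` is an illustrative helper, not part of this PR):

```python
import csv

def check_result(expected_csv: str, name: str, measured: int) -> bool:
    """Return True if a measured instruction count is within the benchmark's
    noise tolerance. Assumes each populated row has the shape
    benchmark,metric,expected_value,relative_tolerance."""
    with open(expected_csv) as f:
        for row in csv.reader(f):
            if len(row) != 4:  # skip the blank spacer lines
                continue
            benchmark, metric, expected, tol = row
            if benchmark == name and metric == "compile_time_instruction_count":
                return abs(measured - float(expected)) <= float(expected) * float(tol)
    raise KeyError(f"no expected result recorded for {name}")
```

Under that reading, the add_loop_eager update above moves the expected count to 2960000000 while tolerating roughly 1.5% of drift in either direction.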
160 changes: 119 additions & 41 deletions torch/_inductor/codecache.py
@@ -138,6 +138,7 @@ def use_global_cache() -> bool:
     from concurrent.futures import Future

     from .compile_fx import _CompileFxKwargs
+    from .cpp_builder import BuildOptionsBase
     from .graph import GraphLowering
     from .ir import ChoiceCaller
     from .output_code import CompiledFxGraphConstants, OutputCode
@@ -2225,7 +2226,7 @@ def _get_file_checksum(filename: str) -> str:
    os.makedirs(_HEADER_LOCK_DIR, exist_ok=True)
    _worker_compile_cpp(
        os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"),
-        cpp_builder,
+        (cpp_builder,),
    )

    return header_full_path
@@ -2251,6 +2252,9 @@ def _get_cpp_wrapper_header(device: str, aot_mode: bool = False) -> str:

 @clear_on_fresh_inductor_cache
 class CppCodeCache:
+    """Compiles and caches C++ libraries. Users of this class supply the source code to
+    be compiled, while compilation flags are set by CppBuilder."""
+
     cache: dict[str, Callable[[], Union[CDLL, ModuleType]]] = {}
     cache_clear = staticmethod(cache.clear)
     cpp_compile_command_flags: dict[str, Any] = {}
@@ -2292,11 +2296,14 @@ def _get_uncompiled_header(cls, device: str) -> str | None:
     @classmethod
     def load_async(
         cls,
-        source_code: str,
+        main_code: str,
         device_type: str = "cpu",
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        optimized_code: Optional[str] = None,
     ) -> Any:
+        """Compile and load a C++ library. Returns a callable that returns the loaded
+        library."""
         compile_command = {
             **cls.cpp_compile_command_flags,
             "device_type": device_type,
@@ -2307,48 +2314,112 @@ def load_async(

         _set_gpu_runtime_env()  # cpp_extension consults the env

-        cpp_build_option = CppTorchDeviceOptions(**compile_command)
-        command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option)
-        # write function will calc source_code hash, the same source code with different
-        # ISA level should be generate different hash.
-        # So we need get a command_line which contains isa related parameter as a part of hash key.
-        # And then pass the command_line to below write function as extra parameter to
-        # guarantee the source code hash contains ISA difference.
-        vec_isa_cmd = repr(command_gen.get_command_line())
-        key, input_path = write(source_code, "cpp", extra=vec_isa_cmd)
+        # Note the distinction between the two booleans. We do minimal optimization if
+        # the optimized_code argument is present at all, since that's how the user of
+        # this function opts in, but we do compilation and linking in one step if the
+        # optimized_code argument is empty (as a micro-optimization).
+        main_build_option = CppTorchDeviceOptions(
+            compile_only=bool(optimized_code),
+            min_optimize=optimized_code is not None,
+            **compile_command,
+        )
+        optimized_build_option = CppTorchDeviceOptions(
+            compile_only=True, **compile_command
+        )
+
+        def get_hashable_command_line(build_option: BuildOptionsBase) -> str:
+            """Writing the code to file will calculate a hash, which we need to vary if
+            the command line flags change. This implements a mostly-generic way of
+            validating that."""
+            return CppBuilder(
+                name="o", sources="i", BuildOption=build_option
+            ).get_command_line()
+
+        main_cmd_line = get_hashable_command_line(main_build_option)
+        optimized_cmd_line = get_hashable_command_line(optimized_build_option)
+
+        key, main_path = write(
+            main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}"
+        )
+
+        # Don't bother writing if the argument is empty.
+        if optimized_code:
+            _, optimized_path = write(
+                optimized_code, "optimized.cpp", extra=optimized_cmd_line
+            )
+        else:
+            # Unused, but makes type checkers happy.
+            optimized_path = os.devnull

         if key not in cls.cache:
             from torch.utils._filelock import FileLock

             lock_path = os.path.join(get_lock_dir(), key + ".lock")
-            output_name, output_dir = get_name_and_dir_from_output_file_path(input_path)
             future: Optional[Future[Any]] = None
             lib = None

             # if requested, pre-compile any headers
-            if (
-                config.cpp_cache_precompile_headers
-                and not _IS_WINDOWS
-                and (header_file := cls._get_uncompiled_header(device_type))
-            ):
-                cpp_build_option.precompiled_header = _precompile_header(
-                    header_file,
-                    vec_isa_cmd,
-                    **compile_command,
-                )
+            if config.cpp_cache_precompile_headers and not _IS_WINDOWS:
+                if header := cls._get_uncompiled_header(device_type):
+                    main_build_option.precompiled_header = _precompile_header(
+                        header,
+                        main_cmd_line,
+                        min_optimize=optimized_code is not None,
+                        **compile_command,
+                    )

-            cpp_builder = CppBuilder(
-                name=output_name,
-                sources=input_path,
+                # Currently, the optimized_code field is only used for cpp kernel code,
+                # so go ahead and precompile the relevant header here. Revisit this
+                # decision if that ever changes.
[Review comment from benjaminglass1 (Collaborator, Author) on lines +2371 to +2373: "Reviewers, this decision is something I'd like review on."]
+                if optimized_code and (header := _get_cpp_prefix_header(device_type)):
+                    optimized_build_option.precompiled_header = _precompile_header(
+                        header,
+                        optimized_cmd_line,
+                        **compile_command,
+                    )
+
+            main_name, output_dir = get_name_and_dir_from_output_file_path(main_path)
+            main_builder = CppBuilder(
+                name=main_name,
+                sources=main_path,
+                BuildOption=main_build_option,
                 output_dir=output_dir,
-                BuildOption=cpp_build_option,
             )
-            worker_fn = functools.partial(
-                _worker_compile_cpp,
-                lock_path,
-                cpp_builder,
-            )
-            binary_path = normalize_path_separator(cpp_builder.get_target_file_path())

+            if optimized_code:
+                optimized_name, _ = get_name_and_dir_from_output_file_path(
+                    optimized_path
+                )
+                optimized_builder = CppBuilder(
+                    name=optimized_name,
+                    sources=optimized_path,
+                    BuildOption=optimized_build_option,
+                    output_dir=output_dir,
+                )
+
+                linker = CppBuilder(
+                    name=main_name,
+                    sources=[
+                        main_builder.get_target_file_path(),
+                        optimized_builder.get_target_file_path(),
+                    ],
+                    BuildOption=CppTorchDeviceOptions(**compile_command),
+                    output_dir=output_dir,
+                )
+
+                worker_fn = functools.partial(
+                    _worker_compile_cpp,
+                    lock_path,
+                    (main_builder, optimized_builder, linker),
+                )
+                binary_path = normalize_path_separator(linker.get_target_file_path())
+            else:
+                worker_fn = functools.partial(
+                    _worker_compile_cpp, lock_path, (main_builder,)
+                )
+                binary_path = normalize_path_separator(
+                    main_builder.get_target_file_path()
+                )
+
             def load_fn() -> Any:
                 nonlocal lib
@@ -2371,19 +2442,20 @@ def load_fn() -> Any:
         return cls.cache[key]

     @classmethod
-    def load(cls, source_code: str, device_type: str = "cpu") -> Any:
-        return cls.load_async(source_code, device_type)()
+    def load(cls, *args: Any, **kwargs: Any) -> Any:
+        return cls.load_async(*args, **kwargs)()


 def _worker_compile_cpp(
     lock_path: str,
-    cpp_builder: CppBuilder,
+    cpp_builders: Sequence[CppBuilder],
 ) -> None:
     from torch.utils._filelock import FileLock

     with FileLock(lock_path, timeout=LOCK_TIMEOUT):
-        if not os.path.exists(cpp_builder.get_target_file_path()):
-            cpp_builder.build()
+        for builder in cpp_builders:
+            if not os.path.exists(builder.get_target_file_path()):
+                builder.build()


 # Customized Python binding for cpp kernels
@@ -2513,19 +2585,24 @@ def _get_uncompiled_header(cls, device: str) -> str | None:
     @classmethod
     def load_pybinding_async(
         cls,
-        argtypes: list[str],
-        source_code: str,
+        argtypes: Sequence[str],
+        main_code: str,
         device_type: str = "cpu",
         num_outputs: int = -1,
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        kernel_code: Optional[str] = None,
     ) -> Any:
         """
         Wrap a C++ function in fast Python bindings.

         Args:
             argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
-            source_code: C++ source code containing a ENTRY_FUNCTION() function
+            main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at
+                -O3 if kernel_code is None (to maximize performance in any kernels that
+                are present), or -O1 otherwise (to minimize compile time).
+            kernel_code: If present, C++ source code that will be built at -O3 and
+                linked to main_code.

         Returns:
             A python version of ENTRY_FUNCTION()
@@ -2541,10 +2618,11 @@ def load_pybinding_async(
             extra_parse_arg=cls.extra_parse_arg.format(array_len=num_outputs),
         )
         get_result = cls.load_async(
-            source_code + suffix,
+            main_code + suffix,
             device_type,
             submit_fn=submit_fn,
             extra_flags=extra_flags,
+            optimized_code=kernel_code,
         )
         result = None

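Taken together, the load_async() changes split the build whenever optimized_code is supplied: the wrapper translation unit is compiled with compile_only and min_optimize (O1-level flags, per the PR title), the kernel translation unit keeps full optimization, and a third CppBuilder invocation links the two objects. A rough standalone sketch of the same three-step recipe using plain g++ instead of CppBuilder (file and output names are illustrative, not from this PR):

```python
import subprocess

def two_stage_build(main_cpp: str, kernel_cpp: str, out_so: str) -> None:
    """Mirror of the build split introduced here, outside of inductor:
    wrapper code is not performance-sensitive, so -O1 keeps its compile
    time low; kernel code keeps -O3; one final link joins the objects."""
    subprocess.check_call(["g++", "-O1", "-fPIC", "-c", main_cpp, "-o", "main.o"])
    subprocess.check_call(["g++", "-O3", "-fPIC", "-c", kernel_cpp, "-o", "kernels.o"])
    subprocess.check_call(["g++", "-shared", "main.o", "kernels.o", "-o", out_so])
```

The payoff is compile time: the large generated wrapper no longer pays for full optimization, while the kernels that dominate runtime keep it.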
27 changes: 19 additions & 8 deletions torch/_inductor/codegen/cpp_wrapper_cpu.py
@@ -1074,26 +1074,37 @@ def generate_before_suffix(self, result):
         result.writeline("} // inductor_entry_impl")

     def generate_end(self, result):
+        """Generates the end of the code block, and any code needed to call it."""
         if V.graph.aot_mode:
             if V.graph.is_const_graph:
                 result.writeline("} // AOTInductorModel::_const_run_impl")
             else:
                 result.writeline("} // namespace torch::aot_inductor\n\n\n")
             return

-        # Add any kernel definitions into the wrapped code. We currently only build
-        # them in separate files in AOT mode.
-        result.splice(self.kernel_declarations.getvalue())
-        self.kernel_declarations.clear()
+        # Close the wrapper code block, then write any kernel definitions.
+        result.splice("'''\n)")
+        if self.kernel_declarations:
+            result.splice("\nkernel_src = (\nr'''")
+            result.splice(self.kernel_declarations.getvalue())
+            result.splice("'''\n)")
+        else:
+            result.splice(
+                """
+                kernel_src = ''
+                """
+            )

         # cpp entry function for JIT with cpp wrapper
         result.splice(
             f"""
-            '''
-            )
-
             inductor_entry = CppWrapperCodeCache.load_pybinding(
-                ["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
+                argtypes=["std::vector<AtenTensorHandle>"],
+                main_code=cpp_wrapper_src,
+                device_type="{self.device}",
+                num_outputs={len(V.graph.graph_outputs)},
+                kernel_code=kernel_src,
+            )
             """
         )

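Putting the generate_end() changes together, the tail of a JIT-mode cpp_wrapper module now looks roughly like the following. This is a simplified reconstruction of generated output: the C++ bodies are placeholders and the device/num_outputs values are example substitutions, so the snippet is illustrative rather than runnable as-is.

```python
from torch._inductor.codecache import CppWrapperCodeCache

cpp_wrapper_src = (
    r'''
    // ... generated wrapper C++, including inductor_entry_impl(); built at -O1 ...
    '''
)

kernel_src = (
    r'''
    // ... generated kernel C++, built separately at -O3 (or '' if no kernels) ...
    '''
)

inductor_entry = CppWrapperCodeCache.load_pybinding(
    argtypes=["std::vector<AtenTensorHandle>"],
    main_code=cpp_wrapper_src,
    device_type="cpu",
    num_outputs=1,
    kernel_code=kernel_src,
)
```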
4 changes: 2 additions & 2 deletions torch/_inductor/graph.py
@@ -2284,8 +2284,8 @@ def compile_to_module(self) -> CompiledModule:
         return self._compile_to_module()

     def _compile_to_module(self) -> CompiledModule:
-        # Currently, if we're here, we don't have to worry about the kernel code, which
-        # is only available in AOTInductor mode.
+        # If we're here, we don't have to worry about the kernel code, which is only
+        # returned separately in AOTInductor mode.
         wrapper_code, _ = (
             self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
         )