-
Notifications
You must be signed in to change notification settings - Fork 24.3k
cpp_wrapper: build non-performance-sensitive code at O1 #148773
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
benjaminglass1 wants to merge 40 commits into gh/benjaminglass1/77/base from gh/benjaminglass1/77/head
Closed
Changes from all commits
Commits
Show all changes
40 commits
Select commit
Hold shift + click to select a range
bbef6ca
Update
benjaminglass1 861c944
Update
benjaminglass1 c3aadf0
Update
benjaminglass1 346193b
Update
benjaminglass1 c0aeefc
Update
benjaminglass1 d3f4c9b
Update
benjaminglass1 a7c23c3
Update
benjaminglass1 77aa708
Update
benjaminglass1 e820ce7
Update
benjaminglass1 78e4ebf
Update
benjaminglass1 1297557
Update
benjaminglass1 64aca9b
Update
benjaminglass1 ef1a5b8
Update
benjaminglass1 2810db0
Update
benjaminglass1 c60363c
Update
benjaminglass1 e16b5a0
Update
benjaminglass1 f3c2261
Update
benjaminglass1 2d499ff
Update
benjaminglass1 a8b4b4a
Update
benjaminglass1 c6f35c4
Update
benjaminglass1 cbe11b8
Update
benjaminglass1 af6a247
Update
benjaminglass1 42856c5
Update
benjaminglass1 0d0d6ae
Update
benjaminglass1 f900290
Update
benjaminglass1 9fcfc80
Update
benjaminglass1 9d837fd
Update
benjaminglass1 0b7db46
Update
benjaminglass1 a1211f5
Update
benjaminglass1 51a58ab
Update
benjaminglass1 2db6819
Update
benjaminglass1 df55464
Update
benjaminglass1 c5a57ac
Update
benjaminglass1 3f8f457
Update
benjaminglass1 3a47af7
Update
benjaminglass1 bc7e457
Update
benjaminglass1 b19abc5
Update
benjaminglass1 5cf851b
Update
benjaminglass1 3e27ace
Update
benjaminglass1 063110f
Update
benjaminglass1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,6 +138,7 @@ def use_global_cache() -> bool: | |
from concurrent.futures import Future | ||
|
||
from .compile_fx import _CompileFxKwargs | ||
from .cpp_builder import BuildOptionsBase | ||
from .graph import GraphLowering | ||
from .ir import ChoiceCaller | ||
from .output_code import CompiledFxGraphConstants, OutputCode | ||
|
@@ -2225,7 +2226,7 @@ def _get_file_checksum(filename: str) -> str: | |
os.makedirs(_HEADER_LOCK_DIR, exist_ok=True) | ||
_worker_compile_cpp( | ||
os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"), | ||
cpp_builder, | ||
(cpp_builder,), | ||
) | ||
|
||
return header_full_path | ||
|
@@ -2251,6 +2252,9 @@ def _get_cpp_wrapper_header(device: str, aot_mode: bool = False) -> str: | |
|
||
@clear_on_fresh_inductor_cache | ||
class CppCodeCache: | ||
"""Compiles and caches C++ libraries. Users of this class supply the source code to | ||
be compiled, while compilation flags are set by CppBuilder.""" | ||
|
||
cache: dict[str, Callable[[], Union[CDLL, ModuleType]]] = {} | ||
cache_clear = staticmethod(cache.clear) | ||
cpp_compile_command_flags: dict[str, Any] = {} | ||
|
@@ -2292,11 +2296,14 @@ def _get_uncompiled_header(cls, device: str) -> str | None: | |
@classmethod | ||
def load_async( | ||
cls, | ||
source_code: str, | ||
main_code: str, | ||
device_type: str = "cpu", | ||
submit_fn: Any = None, | ||
extra_flags: Sequence[str] = (), | ||
optimized_code: Optional[str] = None, | ||
) -> Any: | ||
"""Compile and load a C++ library. Returns a callable that returns the loaded | ||
library.""" | ||
compile_command = { | ||
**cls.cpp_compile_command_flags, | ||
"device_type": device_type, | ||
|
@@ -2307,48 +2314,112 @@ def load_async( | |
|
||
_set_gpu_runtime_env() # cpp_extension consults the env | ||
|
||
cpp_build_option = CppTorchDeviceOptions(**compile_command) | ||
command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option) | ||
# write function will calc source_code hash, the same source code with different | ||
# ISA level should be generate different hash. | ||
# So we need get a command_line which contains isa related parameter as a part of hash key. | ||
# And then pass the command_line to below write function as extra parameter to | ||
# guarantee the source code hash contains ISA difference. | ||
vec_isa_cmd = repr(command_gen.get_command_line()) | ||
key, input_path = write(source_code, "cpp", extra=vec_isa_cmd) | ||
# Note the distinction between the two booleans. We do minimal optimization if | ||
# the optimized_code argument is present at all, since that's how the user of | ||
# this function opts in, but we do compilation and linking in one step if the | ||
# optimized_code argument is empty (as a micro-optimization). | ||
main_build_option = CppTorchDeviceOptions( | ||
compile_only=bool(optimized_code), | ||
min_optimize=optimized_code is not None, | ||
**compile_command, | ||
) | ||
optimized_build_option = CppTorchDeviceOptions( | ||
compile_only=True, **compile_command | ||
) | ||
|
||
def get_hashable_command_line(build_option: BuildOptionsBase) -> str: | ||
"""Writing the code to file will calculate a hash, which we need to vary if | ||
the command line flags change. This implements a mostly-generic way of | ||
validating that.""" | ||
return CppBuilder( | ||
name="o", sources="i", BuildOption=build_option | ||
).get_command_line() | ||
|
||
main_cmd_line = get_hashable_command_line(main_build_option) | ||
optimized_cmd_line = get_hashable_command_line(optimized_build_option) | ||
|
||
key, main_path = write( | ||
main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}" | ||
) | ||
|
||
# Don't bother writing if the argument is empty. | ||
if optimized_code: | ||
_, optimized_path = write( | ||
optimized_code, "optimized.cpp", extra=optimized_cmd_line | ||
) | ||
else: | ||
# Unused, but makes type checkers happy. | ||
optimized_path = os.devnull | ||
|
||
if key not in cls.cache: | ||
from torch.utils._filelock import FileLock | ||
|
||
lock_path = os.path.join(get_lock_dir(), key + ".lock") | ||
output_name, output_dir = get_name_and_dir_from_output_file_path(input_path) | ||
future: Optional[Future[Any]] = None | ||
lib = None | ||
|
||
# if requested, pre-compile any headers | ||
if ( | ||
config.cpp_cache_precompile_headers | ||
and not _IS_WINDOWS | ||
and (header_file := cls._get_uncompiled_header(device_type)) | ||
): | ||
cpp_build_option.precompiled_header = _precompile_header( | ||
header_file, | ||
vec_isa_cmd, | ||
**compile_command, | ||
) | ||
if config.cpp_cache_precompile_headers and not _IS_WINDOWS: | ||
if header := cls._get_uncompiled_header(device_type): | ||
main_build_option.precompiled_header = _precompile_header( | ||
header, | ||
main_cmd_line, | ||
min_optimize=optimized_code is not None, | ||
**compile_command, | ||
) | ||
|
||
cpp_builder = CppBuilder( | ||
name=output_name, | ||
sources=input_path, | ||
# Currently, the optimized_code field is only used for cpp kernel code, | ||
# so go ahead and precompile the relevant header here. Revisit this | ||
# decision if that ever changes. | ||
Comment on lines +2371 to +2373
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewers, this decision is something I'd like review on. |
||
if optimized_code and (header := _get_cpp_prefix_header(device_type)): | ||
optimized_build_option.precompiled_header = _precompile_header( | ||
header, | ||
optimized_cmd_line, | ||
**compile_command, | ||
) | ||
|
||
main_name, output_dir = get_name_and_dir_from_output_file_path(main_path) | ||
main_builder = CppBuilder( | ||
name=main_name, | ||
sources=main_path, | ||
BuildOption=main_build_option, | ||
output_dir=output_dir, | ||
BuildOption=cpp_build_option, | ||
) | ||
worker_fn = functools.partial( | ||
_worker_compile_cpp, | ||
lock_path, | ||
cpp_builder, | ||
) | ||
binary_path = normalize_path_separator(cpp_builder.get_target_file_path()) | ||
|
||
if optimized_code: | ||
optimized_name, _ = get_name_and_dir_from_output_file_path( | ||
optimized_path | ||
) | ||
optimized_builder = CppBuilder( | ||
name=optimized_name, | ||
sources=optimized_path, | ||
BuildOption=optimized_build_option, | ||
output_dir=output_dir, | ||
) | ||
|
||
linker = CppBuilder( | ||
name=main_name, | ||
sources=[ | ||
main_builder.get_target_file_path(), | ||
optimized_builder.get_target_file_path(), | ||
], | ||
BuildOption=CppTorchDeviceOptions(**compile_command), | ||
output_dir=output_dir, | ||
) | ||
|
||
worker_fn = functools.partial( | ||
_worker_compile_cpp, | ||
lock_path, | ||
(main_builder, optimized_builder, linker), | ||
) | ||
binary_path = normalize_path_separator(linker.get_target_file_path()) | ||
else: | ||
worker_fn = functools.partial( | ||
_worker_compile_cpp, lock_path, (main_builder,) | ||
) | ||
binary_path = normalize_path_separator( | ||
main_builder.get_target_file_path() | ||
) | ||
|
||
def load_fn() -> Any: | ||
nonlocal lib | ||
|
@@ -2371,19 +2442,20 @@ def load_fn() -> Any: | |
return cls.cache[key] | ||
|
||
@classmethod | ||
def load(cls, source_code: str, device_type: str = "cpu") -> Any: | ||
return cls.load_async(source_code, device_type)() | ||
def load(cls, *args: Any, **kwargs: Any) -> Any: | ||
return cls.load_async(*args, **kwargs)() | ||
|
||
|
||
def _worker_compile_cpp( | ||
lock_path: str, | ||
cpp_builder: CppBuilder, | ||
cpp_builders: Sequence[CppBuilder], | ||
) -> None: | ||
from torch.utils._filelock import FileLock | ||
|
||
with FileLock(lock_path, timeout=LOCK_TIMEOUT): | ||
if not os.path.exists(cpp_builder.get_target_file_path()): | ||
cpp_builder.build() | ||
for builder in cpp_builders: | ||
if not os.path.exists(builder.get_target_file_path()): | ||
builder.build() | ||
|
||
|
||
# Customized Python binding for cpp kernels | ||
|
@@ -2513,19 +2585,24 @@ def _get_uncompiled_header(cls, device: str) -> str | None: | |
@classmethod | ||
def load_pybinding_async( | ||
cls, | ||
argtypes: list[str], | ||
source_code: str, | ||
argtypes: Sequence[str], | ||
main_code: str, | ||
device_type: str = "cpu", | ||
num_outputs: int = -1, | ||
submit_fn: Any = None, | ||
extra_flags: Sequence[str] = (), | ||
kernel_code: Optional[str] = None, | ||
) -> Any: | ||
""" | ||
Wrap a C++ function in fast Python bindings. | ||
|
||
Args: | ||
argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"] | ||
source_code: C++ source code containing a ENTRY_FUNCTION() function | ||
main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at | ||
-O3 if kernel_code is None (to maximize performance in any kernels that | ||
are present), or -O1 otherwise (to minimize compile time). | ||
kernel_code: If present, C++ source code that will be built at -O3 and | ||
linked to main_code. | ||
|
||
Returns: | ||
A python version of ENTRY_FUNCTION() | ||
|
@@ -2541,10 +2618,11 @@ def load_pybinding_async( | |
extra_parse_arg=cls.extra_parse_arg.format(array_len=num_outputs), | ||
) | ||
get_result = cls.load_async( | ||
source_code + suffix, | ||
main_code + suffix, | ||
device_type, | ||
submit_fn=submit_fn, | ||
extra_flags=extra_flags, | ||
optimized_code=kernel_code, | ||
) | ||
result = None | ||
|
||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.