[AOTI] Add a SlimTensor representation · pytorch/pytorch@8f0e46b

Commit 8f0e46b

[AOTI] Add a SlimTensor representation

Summary: [ghstack-poisoned]

1 parent 09b3516 · commit 8f0e46b

37 files changed: +1756 -68 lines

caffe2/CMakeLists.txt (+3 -1)

@@ -1301,7 +1301,9 @@ target_include_directories(torch_cpu PRIVATE
 target_include_directories(torch_cpu PRIVATE
   ${TORCH_ROOT}/third_party/nlohmann/include)
 
-install(DIRECTORY "${TORCH_SRC_DIR}/csrc"
+install(DIRECTORY
+  "${TORCH_SRC_DIR}/csrc"
+  "${TORCH_SRC_DIR}/standalone"
   DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch
   FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp")
 install(FILES

test/inductor/test_aot_inductor.py (+12)

@@ -156,6 +156,18 @@ def forward(self, x, y):
             model, example_inputs, "AOTInductorModelRunMinimalArrayrefInterface(", 1
         )
 
+    def test_cos(self):
+        class Model(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+
+            def forward(self, x):
+                y = torch.cos(x)
+                return y
+
+        example_inputs = (torch.randn(16, 10, device=self.device),)
+        self.check_model(Model(), example_inputs)
+
     def test_small_constant(self):
         class Model(torch.nn.Module):
             def __init__(self) -> None:
New test file (+121)

@@ -0,0 +1,121 @@
+# Owner(s): ["module: inductor"]
+import copy
+import functools
+import sys
+import unittest
+
+from torch._inductor import config
+from torch._inductor.test_case import TestCase
+from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS
+from torch.testing._internal.inductor_utils import GPU_TYPE
+
+
+if IS_WINDOWS and IS_CI:
+    sys.stderr.write(
+        "Windows CI does not have necessary dependencies for test_torchinductor yet\n"
+    )
+    if __name__ == "__main__":
+        sys.exit(0)
+    raise unittest.SkipTest("requires sympy/functorch/filelock")
+
+try:
+    try:
+        from .test_aot_inductor import (
+            AOTInductorTestsTemplate,
+            check_model,
+            check_model_with_multiple_inputs,
+            code_check_count,
+        )
+    except ImportError:
+        from test_aot_inductor import (  # @manual
+            AOTInductorTestsTemplate,
+            check_model,
+            check_model_with_multiple_inputs,
+            code_check_count,
+        )
+except (unittest.SkipTest, ImportError):
+    if __name__ == "__main__":
+        sys.exit(0)
+    raise
+
+
+# Similar to copy_tests in test_torchinductor.py, but only takes a whitelist of tests
+def copy_tests(my_cls, other_cls, suffix, whitelist):  # noqa: B902
+    for name, value in my_cls.__dict__.items():
+        if name.startswith("test_") and name in whitelist:
+            # You cannot copy functions in Python, so we use closures here to
+            # create objects with different ids. Otherwise, unittest.skip
+            # would modify all methods sharing the same object id. Also, by
+            # using a default argument, we create a copy instead of a
+            # reference. Otherwise, we would lose access to the value.
+
+            @functools.wraps(value)
+            @config.patch(
+                {
+                    "aot_inductor.codegen_standalone": True,
+                    "max_autotune_gemm_backends": "TRITON",
+                    "max_autotune_conv_backends": "TRITON",
+                }
+            )
+            def new_test(self, value=value):
+                return value(self)
+
+            # Copy __dict__ which may contain test metadata
+            new_test.__dict__ = copy.deepcopy(value.__dict__)
+            setattr(other_cls, f"{name}_{suffix}", new_test)
+
+    # Special case convenience routine
+    if hasattr(my_cls, "is_dtype_supported"):
+        other_cls.is_dtype_supported = my_cls.is_dtype_supported
+
+
+test_list_cpu = {
+    # Need to sort out third-party library build issues, e.g. blas, sleef
+}
+
+
+class AOTInductorTestLibtorchFreeCpu(TestCase):
+    device = "cpu"
+    device_type = "cpu"
+    check_model = check_model
+    check_model_with_multiple_inputs = check_model_with_multiple_inputs
+    code_check_count = code_check_count
+    allow_stack_allocation = False
+    use_minimal_arrayref_interface = False
+
+
+copy_tests(
+    AOTInductorTestsTemplate,
+    AOTInductorTestLibtorchFreeCpu,
+    "cpu_standalone",
+    test_list_cpu,
+)
+
+test_list_gpu = {
+    "test_cos",
+}
+
+
+@unittest.skipIf(sys.platform == "darwin", "No CUDA on MacOS")
+class AOTInductorTestLibtorchFreeGpu(TestCase):
+    device = GPU_TYPE
+    device_type = GPU_TYPE
+    check_model = check_model
+    check_model_with_multiple_inputs = check_model_with_multiple_inputs
+    code_check_count = code_check_count
+    allow_stack_allocation = False
+    use_minimal_arrayref_interface = False
+
+
+copy_tests(
+    AOTInductorTestsTemplate,
+    AOTInductorTestLibtorchFreeGpu,
+    f"{GPU_TYPE}_standalone",
+    test_list_gpu,
+)
+
+
+if __name__ == "__main__":
+    from torch._inductor.test_case import run_tests
+
+    run_tests(needs="filelock")
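The comment inside copy_tests is doing real work: binding value as a default argument (value=value) is what keeps each generated test bound to its own source method. A minimal standalone sketch (illustrative only, not part of the commit) of the failure mode it avoids:

tests = {"test_a": lambda self: "a", "test_b": lambda self: "b"}

buggy = {}
for name, value in tests.items():
    def new_test(self):  # closes over the loop variable itself...
        return value(self)  # ...which refers to the last function once the loop ends
    buggy[name] = new_test

fixed = {}
for name, value in tests.items():
    def new_test(self, value=value):  # default argument snapshots value per iteration
        return value(self)
    fixed[name] = new_test

print(buggy["test_a"](None))  # "b" -- both copies ended up calling test_b
print(fixed["test_a"](None))  # "a" -- each copy kept its own function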

torch/_inductor/codecache.py (+49 -4)

@@ -1532,6 +1532,12 @@ def get_keys(cls) -> KeysView[str]:
 
 
 class AotCodeCompiler:
+    """
+    AotCodeCompiler handles the compilation of AOTInductor kernels. It is
+    responsible for generating the kernel and wrapper code, and for
+    compiling and packaging them.
+    """
+
     @classmethod
     def compile(
         cls,

@@ -1744,6 +1750,9 @@ def _compile_consts(consts: bytes, platform: str) -> str:
 
         metadata = config.aot_inductor.metadata
         metadata["AOTI_DEVICE_KEY"] = device_type
+        metadata["STANDALONE"] = (
+            "1" if config.aot_inductor.codegen_standalone else "0"
+        )
 
         # Save user provided metadata
         meta_json = str(

@@ -1878,6 +1887,27 @@ def _pad_to_alignment(raw_bytes: bytes) -> bytes:
 
         log.debug("aot wrapper compilation command: %s", wrapper_compile_cmd)
         log.debug("aot kernel compilation command: %s", kernel_compile_cmd)
+
+        cuda_utils_o: list[str] = []
+        if config.aot_inductor.codegen_standalone and device_type == "cuda":
+            # TODO: selectively add additional cuda files
+            cuda_util_files: list[str] = []
+            cuda_build_options = CppTorchDeviceOptions(
+                compiler="nvcc",
+                compile_only=True,
+                **compile_command,
+            )
+            for file in cuda_util_files:
+                cuda_builder = CppBuilder(
+                    name=file,
+                    sources=file,
+                    output_dir=str(wrapper_path_operator.parent),
+                    BuildOption=cuda_build_options,
+                )
+                if not config.aot_inductor.package_cpp_only:
+                    cuda_builder.build()
+                    cuda_utils_o.append(cuda_builder.get_target_file_path())
+
         if config.aot_inductor.package_cpp_only:
             # Not doing the actual compilation here
             compile_flags = str(

@@ -2001,7 +2031,14 @@ def _pad_to_alignment(raw_bytes: bytes) -> bytes:
             use_relative_path=use_relative_path,
         )
 
-        obj_srcs = [wrapper_o, kernel_o, consts_o, *gpu_kernels_o, *cubins_o]
+        obj_srcs = [
+            wrapper_o,
+            kernel_o,
+            consts_o,
+            *gpu_kernels_o,
+            *cubins_o,
+            *cuda_utils_o,
+        ]
         so_builder = CppBuilder(
             name=output_name,
             sources=obj_srcs,

@@ -2096,7 +2133,7 @@ def _pad_to_alignment(raw_bytes: bytes) -> bytes:
 @clear_on_fresh_inductor_cache
 @functools.lru_cache
 def cpp_prefix_path() -> str:
-    path = Path(__file__).parent / "codegen/cpp_prefix.h"
+    path = Path(__file__).parent / "codegen" / "cpp_prefix.h"
     with path.open() as f:
         content = f.read()
     _, filename = write(

@@ -2571,7 +2608,11 @@ class CppWrapperCodeCache(CppPythonBindingsCodeCache):
     call_entry_function = "return inductor_entry_cpp({});"
     extra_parse_arg = textwrap.dedent(
         """
+        #ifdef AOTI_STANDALONE
+        #include <torch/csrc/inductor/aoti_standalone/c/shim.h>
+        #else
         #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+        #endif // AOTI_STANDALONE
 
         static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {{
             std::vector<AtenTensorHandle> result;

@@ -3215,7 +3256,7 @@ def _nvcc_host_compiler_options() -> list[str]:
     ]
 
 
-def _nvcc_compiler_options() -> list[str]:
+def _nvcc_get_arch_option() -> str:
     arch = cuda_env.get_cuda_arch()
     if arch == "90":
         # Required by cutlass compilation.

@@ -3225,13 +3266,17 @@ def _nvcc_compiler_options() -> list[str]:
     code = [f"sm_{arch}", f"compute_{arch}"]
     if config.cuda.enable_cuda_lto:
         code += [f"lto_{arch}"]
+    return f"gencode=arch=compute_{arch},code=[{','.join(code)}]"
+
+
+def _nvcc_compiler_options() -> list[str]:
     options = [
         "-t=0",
         "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
         "-DCUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES=1",
         "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
         "-w",
-        f"-gencode=arch=compute_{arch},code=[{','.join(code)}]",
+        f"-{_nvcc_get_arch_option()}",
         config.cuda.compile_opt_level,
         "-std=c++17",
         "--expt-relaxed-constexpr",

torch/_inductor/codegen/cpp_prefix.h (+4)

@@ -29,7 +29,11 @@
 #include <c10/util/irange.h>
 #include <c10/util/Half.h>
 #include <c10/util/TypeCast.h>
+#ifdef AOTI_STANDALONE
+#include <torch/csrc/inductor/aoti_standalone/c/shim.h>
+#else
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+#endif // AOTI_STANDALONE
 
 #if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_SVE256)
 #define INDUCTOR_USE_VECTOR_TYPES() 1

torch/_inductor/codegen/cpp_template.py (+7 -2)

@@ -124,8 +124,13 @@ def header(self) -> IndentedBuffer:
         res = IndentedBuffer()
         res.writeline(codecache.cpp_prefix())
         # TODO: add c10::ForcedUnroll test to test_aoti_abi_check
-        res.splice("""#include <c10/util/Unroll.h>""")
-        res.splice("""#include <torch/csrc/inductor/aoti_torch/c/shim.h>""")
+        res.splice("""
+        #include <c10/util/Unroll.h>
+        #ifdef AOTI_STANDALONE
+        #include <torch/csrc/inductor/aoti_standalone/c/shim.h>
+        #else
+        #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+        #endif // AOTI_STANDALONE""")
         enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [
             "linux",
             "win32",

torch/_inductor/codegen/cpp_wrapper_cpu.py (+5 -1)

@@ -172,7 +172,9 @@ def add_device_include(self, device: str) -> None:
         # present.
         self.header.splice(self.get_device_include_path(device))
         extend_aoti_c_shim_include = (
-            f"torch/csrc/inductor/aoti_torch/generated/extend/c_shim_{self.device}.h"
+            f"torch/csrc/inductor/aoti_standalone/{self.device}/c_shim_{self.device}.h"
+            if config.aot_inductor.codegen_standalone
+            else f"torch/csrc/inductor/aoti_torch/generated/extend/c_shim_{self.device}.h"
         )
         extend_aoti_c_shim_path = os.path.join(
             os.path.dirname(torch.__file__),

@@ -942,6 +944,8 @@ def finalize_prefix(self):
             self.codegen_const_run_driver()
         aot_mode_decls.writeline("} // namespace torch::aot_inductor")
         aot_mode_decls.writeline("using namespace torch::aot_inductor;")
+        if config.aot_inductor.codegen_standalone:
+            aot_mode_decls.writeline("using namespace torch::standalone;")
 
         self.prefix = cache_decls = IndentedBuffer()
         for dtype in self.used_cached_dtypes:

torch/_inductor/config.py (+6 -1)

@@ -1250,7 +1250,7 @@ class aot_inductor:
     force_mmap_weights: bool = False
 
     package: bool = False
-    package_cpp_only: bool = False
+    package_cpp_only: bool = os.environ.get("AOT_INDUCTOR_PACKAGE_CPP_ONLY", "0") == "1"
 
     # Dictionary of metadata users might want to save to pass to the runtime.
     # TODO: Move this somewhere else, since it's no longer really a config

@@ -1295,6 +1295,11 @@ class aot_inductor:
     # Experimental. Controls automatic precompiling of common AOTI include files.
     precompile_headers: bool = False
 
+    # Experimental. Controls whether to generate model code in a standalone way.
+    codegen_standalone: bool = (
+        os.environ.get("AOT_INDUCTOR_CODEGEN_STANDALONE", "0") == "1"
+    )
+
     # Embed generated .cubin files into the .so
     embed_cubin: bool = False
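Taken together: the new codegen_standalone knob is what the standalone test file above toggles per-test via config.patch, and it can also be flipped process-wide with AOT_INDUCTOR_CODEGEN_STANDALONE=1. A hedged end-to-end sketch using the standard AOTI entry points (torch.export.export, aoti_compile_and_package, and aoti_load_package are pre-existing API, not part of this commit; treat get_metadata() availability as an assumption):

import torch
from torch._inductor import config


class Model(torch.nn.Module):
    def forward(self, x):
        return torch.cos(x)


example_inputs = (torch.randn(16, 10),)

# Compile with libtorch-free (standalone) code generation enabled.
with config.patch({"aot_inductor.codegen_standalone": True}):
    ep = torch.export.export(Model(), example_inputs)
    pkg = torch._inductor.aoti_compile_and_package(ep)

# The commit stamps "STANDALONE" into the saved metadata (see the
# codecache.py hunk above), so it should be readable after loading.
compiled = torch._inductor.aoti_load_package(pkg)
print(compiled.get_metadata().get("STANDALONE"))  # expected: "1"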

0 commit comments