[caffe2] Export operators to c10 without including ATen/Tensor.h · pytorch/pytorch@c40d87c · GitHub
[go: up one dir, main page]

Skip to content

Commit c40d87c

Browse files
committed
[caffe2] Export operators to c10 without including ATen/Tensor.h
With this change, the only caffe2 files that depend on `ATen/Tensor.h` are ones that directly use the ATen API. Specifically, ``` [ "caffe2/CMakeFiles/torch_cuda_cpp.dir/contrib/aten/aten_op_gpu.cc.o", "caffe2/CMakeFiles/torch_cpu.dir/core/tensor.cc.o", "caffe2/CMakeFiles/torch_cuda_cpp.dir/operators/layer_norm_op.cu.o", "caffe2/CMakeFiles/torch_cpu.dir/core/IValueInterface.cc.o", "caffe2/CMakeFiles/cuda_tensor_interop_test.dir/__/aten/src/ATen/test/cuda_tensor_interop_test.cpp.o", "caffe2/CMakeFiles/torch_cpu.dir/contrib/aten/aten_op.cc.o", "caffe2/CMakeFiles/caffe2_pybind11_state_gpu.dir/python/pybind_state.cc.o", "caffe2/CMakeFiles/torch_cpu.dir/operators/layer_norm_op.cc.o", "caffe2/CMakeFiles/torch_cpu.dir/core/export_c10_op_to_caffe2.cc.o", "caffe2/CMakeFiles/torch_cpu.dir/core/export_caffe2_op_to_c10.cc.o", "caffe2/CMakeFiles/torch_cpu.dir/operators/enforce_finite_op.cc.o", "caffe2/CMakeFiles/torch_cpu.dir/core/operator.cc.o", "caffe2/CMakeFiles/tensor_interop_test.dir/__/aten/src/ATen/test/tensor_interop_test.cpp.o", "caffe2/CMakeFiles/caffe2_pybind11_state.dir/python/pybind_state.cc.o" ] ``` ghstack-source-id: 3862e06 Pull Request resolved: #67096
1 parent 1a0e590 commit c40d87c

File tree

5 files changed

+201
-165
lines changed

5 files changed

+201
-165
lines changed

BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,7 @@ filegroup(
778778
"caffe2/core/db.cc",
779779
"caffe2/core/event.cc",
780780
"caffe2/core/export_c10_op_to_caffe2.cc",
781+
"caffe2/core/export_caffe2_op_to_c10.cc",
781782
"caffe2/core/graph.cc",
782783
"caffe2/core/init.cc",
783784
"caffe2/core/init_denormals.cc",
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#define TORCH_ASSERT_NO_OPERATORS
2+
#include <caffe2/core/export_caffe2_op_to_c10.h>
3+
#undef TORCH_ASSERT_NO_OPERATORS
4+
5+
#if defined(EXPOSE_C2_OPS) || \
6+
!defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
7+
8+
#include <ATen/core/function_schema.h>
9+
#include <ATen/core/dispatch/Dispatcher.h>
10+
#include <torch/csrc/jit/frontend/function_schema_parser.h>
11+
#include <torch/library.h>
12+
13+
namespace caffe2 {
14+
namespace detail {
15+
16+
// We only manipulate the stack inside of this function, so the header
17+
// does not depend on Tensor or IValue at all.
18+
void call_caffe2_op_from_c10(
19+
const OperatorHandle &opHandle,
20+
c10::Stack* stack,
21+
_CallCaffe2OpFunc* call_op) {
22+
// precondition: on the stack, there's one IValue for each argument of the
23+
// c10 schema. The last argument is an optional tensor list that
24+
// (if not ivalue::None) contains a preallocated output tensor for each
25+
// operator output.
26+
27+
// As an invariant, we don't want any autograd gradients to be tracked in
28+
// Caffe2 operators.
29+
at::NoGradGuard guard;
30+
31+
const auto &schema = opHandle.schema();
32+
AT_ASSERT(
33+
schema.arguments().size() != 0 &&
34+
schema.arguments().back().type()->isSubtypeOf(
35+
*OptionalType::create(ListType::ofTensors())));
36+
IValue preallocated_outputs = torch::jit::pop(*stack);
37+
38+
const size_t num_outputs = schema.returns().size();
39+
const size_t num_inputs = schema.arguments().size() -
40+
1; // -1 because the last argument is the list of preallocated tensors
41+
42+
c10::List<at::Tensor> outputs;
43+
if (preallocated_outputs.isNone()) {
44+
// either the schema doesn't support preallocated outputs or it does but
45+
// they haven't been passed in. Pass a list of uninitialized tensors to
46+
// the caffe2 operator as preallocated outputs.
47+
outputs.resize(num_outputs);
48+
} else {
49+
AT_ASSERT(preallocated_outputs.isTensorList());
50+
outputs = std::move(preallocated_outputs).toTensorList();
51+
}
52+
53+
// TODO Avoid vector allocation. One idea would be to keep the std::vector
54+
// instances in the cache.
55+
std::vector<IValue> inputs = torch::jit::pop(*stack, num_inputs);
56+
57+
// Convert outputs to caffe2::Tensor
58+
std::vector<caffe2::Tensor> outputs_c2(num_outputs);
59+
for (auto i : c10::irange(num_outputs)) {
60+
outputs_c2[i] = caffe2::Tensor(outputs.get(i));
61+
}
62+
63+
outputs_c2 = (*call_op)(schema, inputs, std::move(outputs_c2));
64+
65+
66+
bool return_tensor_list = false;
67+
if (schema.returns().size() == 1) {
68+
auto type = schema.returns()[0].type();
69+
if (c10::ListTypePtr list_type = type->cast<c10::ListType>()) {
70+
if (list_type->getElementType()->kind() == c10::TypeKind::TensorType) {
71+
return_tensor_list = true;
72+
}
73+
}
74+
}
75+
if (return_tensor_list) {
76+
for (auto i : c10::irange(num_outputs)) {
77+
outputs.set(i, at::Tensor(std::move(outputs_c2[i])));
78+
}
79+
torch::jit::push(*stack, outputs);
80+
} else {
81+
for (auto i : c10::irange(num_outputs)) {
82+
torch::jit::push(*stack, at::Tensor(std::move(outputs_c2[i])));
83+
}
84+
}
85+
86+
// postcondition: All inputs are cleared from the stack, there's now one
87+
// IValue for each output which holds the result. This
88+
// might reuse one of the preallocated tensors but doesn't have
89+
// to.
90+
}
91+
92+
static FunctionSchema make_function_schema_for_c10(const char* schema_str) {
93+
#if !defined(EXPOSE_C2_OPS) && \
94+
(defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE))
95+
throw std::logic_error(
96+
"We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
97+
#else
98+
c10::FunctionSchema parsed_schema = torch::jit::parseSchema(schema_str);
99+
std::vector<c10::Argument> arguments = parsed_schema.arguments();
100+
arguments.emplace_back(
101+
PREALLOCATED_OUTPUT_ARGNAME,
102+
c10::OptionalType::create(c10::ListType::ofTensors()),
103+
nullopt,
104+
IValue());
105+
106+
return FunctionSchema(
107+
parsed_schema.name(),
108+
parsed_schema.overload_name(),
109+
std::move(arguments),
110+
parsed_schema.returns(),
111+
parsed_schema.is_vararg(),
112+
parsed_schema.is_varret());
113+
#endif
114+
}
115+
116+
template <c10::DispatchKey key>
117+
RegisterDefinition<key>::RegisterDefinition(const char *name, c10::BoxedKernel kernel) {
118+
if (c10::impl::dispatch_key_allowlist_check(key)) {
119+
static torch::Library m(
120+
torch::Library::IMPL, "_caffe2", key, __FILE__, __LINE__);
121+
m.impl(name, torch::CppFunction::makeFromBoxedKernel(std::move(kernel)));
122+
}
123+
}
124+
125+
template class RegisterDefinition<c10::DispatchKey::CPU>;
126+
template class RegisterDefinition<c10::DispatchKey::CUDA>;
127+
template class RegisterDefinition<c10::DispatchKey::HIP>;
128+
129+
RegisterSchema::RegisterSchema(const char *schema_str) {
130+
static torch::Library m(
131+
torch::Library::FRAGMENT, "_caffe2", c10::nullopt,
132+
__FILE__, __LINE__);
133+
m.def(make_function_schema_for_c10(schema_str));
134+
}
135+
136+
} // namespace detail
137+
} // namespace caffe2
138+
139+
#endif

0 commit comments

Comments (0)