diff --git a/BUILD.bazel b/BUILD.bazel
index 6b73b899f19f94..d15c72c787eb18 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -511,6 +511,7 @@ filegroup(
         "caffe2/core/db.cc",
         "caffe2/core/event.cc",
         "caffe2/core/export_c10_op_to_caffe2.cc",
+        "caffe2/core/export_caffe2_op_to_c10.cc",
         "caffe2/core/graph.cc",
         "caffe2/core/init.cc",
         "caffe2/core/init_denormals.cc",
diff --git a/caffe2/core/export_caffe2_op_to_c10.cc b/caffe2/core/export_caffe2_op_to_c10.cc
new file mode 100644
index 00000000000000..dd260f29f7dd16
--- /dev/null
+++ b/caffe2/core/export_caffe2_op_to_c10.cc
@@ -0,0 +1,148 @@
+#define TORCH_ASSERT_NO_OPERATORS
+#include <caffe2/core/export_caffe2_op_to_c10.h>
+#undef TORCH_ASSERT_NO_OPERATORS
+
+#if defined(EXPOSE_C2_OPS) || \
+    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
+
+#include
+#include
+#include
+#include
+
+namespace caffe2 {
+namespace detail {
+
+// We only manipulate the stack inside of this function, so the header
+// does not depend on Tensor or IValue at all.
+void call_caffe2_op_from_c10(
+    const OperatorHandle& opHandle,
+    c10::Stack* stack,
+    _CallCaffe2OpFunc* call_op) {
+  // precondition: on the stack, there's one IValue for each argument of the
+  // c10 schema. The last argument is an optional tensor list that
+  // (if not ivalue::None) contains a preallocated output tensor for each
+  // operator output.
+
+  // As an invariant, we don't want any autograd gradients to be tracked in
+  // Caffe2 operators.
+  at::NoGradGuard guard;
+
+  const auto& schema = opHandle.schema();
+  AT_ASSERT(
+      schema.arguments().size() != 0 &&
+      schema.arguments().back().type()->isSubtypeOf(
+          *OptionalType::create(ListType::ofTensors())));
+  IValue preallocated_outputs = torch::jit::pop(*stack);
+
+  c10::List<at::Tensor> outputs;
+  if (preallocated_outputs.isNone()) {
+    // either the schema doesn't support preallocated outputs or it does but
+    // they haven't been passed in. Pass a list of uninitialized tensors to
+    // the caffe2 operator as preallocated outputs.
+    outputs.resize(schema.returns().size());
+  } else {
+    AT_ASSERT(preallocated_outputs.isTensorList());
+    outputs = std::move(preallocated_outputs).toTensorList();
+  }
+
+  // -1 because the last argument is the list of preallocated tensors
+  const size_t num_inputs = schema.arguments().size() - 1;
+
+  // TODO Avoid vector allocation. One idea would be to keep the std::vector
+  // instances in the cache.
+  std::vector<IValue> inputs = torch::jit::pop(*stack, num_inputs);
+
+  // Convert outputs to caffe2::Tensor
+  const size_t num_outputs = outputs.size();
+  std::vector<caffe2::Tensor> outputs_c2(num_outputs);
+  for (auto i : c10::irange(num_outputs)) {
+    outputs_c2[i] = caffe2::Tensor(outputs.extract(i));
+  }
+
+  outputs_c2 = (*call_op)(schema, inputs, std::move(outputs_c2));
+  TORCH_INTERNAL_ASSERT(num_outputs == outputs_c2.size());
+
+
+  bool return_tensor_list = false;
+  if (schema.returns().size() == 1) {
+    auto type = schema.returns()[0].type();
+    if (c10::ListTypePtr list_type = type->cast<ListType>()) {
+      if (list_type->getElementType()->kind() == c10::TypeKind::TensorType) {
+        return_tensor_list = true;
+      }
+    }
+  }
+  if (return_tensor_list) {
+    for (const auto i : c10::irange(num_outputs)) {
+      outputs.set(i, at::Tensor(std::move(outputs_c2[i])));
+    }
+    torch::jit::push(*stack, outputs);
+  } else {
+    for (const auto i : c10::irange(num_outputs)) {
+      torch::jit::push(*stack, at::Tensor(std::move(outputs_c2[i])));
+    }
+  }
+
+  // postcondition: All inputs are cleared from the stack, there's now one
+  // IValue for each output which holds the result. This
+  // might reuse one of the preallocated tensors but doesn't have
+  // to.
+}
+
+static FunctionSchema make_function_schema_for_c10(
+    const char* schema_str,
+    c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
+#if !defined(EXPOSE_C2_OPS) && \
+    (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE))
+  throw std::logic_error(
+      "We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
+#else
+  c10::FunctionSchema parsed_schema = torch::jit::parseSchema(schema_str);
+  std::vector<c10::Argument> arguments = parsed_schema.arguments();
+  arguments.emplace_back(
+      PREALLOCATED_OUTPUT_ARGNAME,
+      c10::OptionalType::create(c10::ListType::ofTensors()),
+      nullopt,
+      IValue());
+
+  auto schema = FunctionSchema(
+      parsed_schema.name(),
+      parsed_schema.overload_name(),
+      std::move(arguments),
+      parsed_schema.returns(),
+      parsed_schema.is_vararg(),
+      parsed_schema.is_varret());
+  if (optional_alias_analysis_kind) {
+    schema.setAliasAnalysis(*optional_alias_analysis_kind);
+  }
+  return schema;
+#endif
+}
+
+template <c10::DispatchKey key>
+RegisterDefinition<key>::RegisterDefinition(const char *name, c10::BoxedKernel kernel) {
+  if (c10::impl::dispatch_key_allowlist_check(key)) {
+    static torch::Library m(
+        torch::Library::IMPL, "_caffe2", key, __FILE__, __LINE__);
+    m.impl(name, torch::CppFunction::makeFromBoxedKernel(std::move(kernel)));
+  }
+}
+
+template struct RegisterDefinition<c10::DispatchKey::CPU>;
+template struct RegisterDefinition<c10::DispatchKey::CUDA>;
+template struct RegisterDefinition<c10::DispatchKey::HIP>;
+
+RegisterSchema::RegisterSchema(
+    const char *schema_str,
+    c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
+  static torch::Library m(
+      torch::Library::FRAGMENT, "_caffe2", c10::nullopt,
+      __FILE__, __LINE__);
+  m.def(make_function_schema_for_c10(schema_str, optional_alias_analysis_kind));
+}
+
+} // namespace detail
+} // namespace caffe2
+
+#endif
diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h
index 0dfd5e0366ddad..f02136b9e18945 100644
--- a/caffe2/core/export_caffe2_op_to_c10.h
+++ b/caffe2/core/export_caffe2_op_to_c10.h
@@ -5,16 +5,17 @@
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
 #include
 #include
 
+namespace c10 {
+struct FunctionSchema;
+struct IValue;
+class OperatorHandle;
+using Stack = std::vector<IValue>;
+}
+
 namespace caffe2 {
 namespace detail {
 
@@ -36,238 +37,75 @@ inline std::vector<caffe2::Tensor> _call_caffe2_op(
   return std::move(op).move_output_tensors();
 }
 
-// This function is inline in the hope that compilers optimizing for speed will
-// inline it into call_caffe2_op_from_c10, allowing call_op to be inlined and
-// avoiding the function pointer indirection, while compilers optimizing for
-// binary size will keep it a separate function instead of inlining it into
-// a template and will reuse the binary code of this function between ops.
-// We measured and confirmed that binary size off the instagram ios app is
-// reduced when having _call_caffe2_op_from_c10 separate from the templated
-// call_caffe2_op_from_c10.
-inline void _call_caffe2_op_from_c10(
-    c10::Stack* stack,
-    const c10::FunctionSchema& schema,
-    _CallCaffe2OpFunc* call_op) {
-  // precondition: on the stack, there's one IValue for each argument of the
-  // c10 schema. The last argument is an optional tensor list that
-  // (if not ivalue::None) contains a preallocated output tensor for each
-  // operator output.
-
-  // As an invariant, we don't want any autograd gradients to be tracked in
-  // Caffe2 operators.
-  at::NoGradGuard guard;
-
-  AT_ASSERT(
-      schema.arguments().size() != 0 &&
-      schema.arguments().back().type()->isSubtypeOf(
-          *OptionalType::create(ListType::ofTensors())));
-  IValue preallocated_outputs = torch::jit::pop(*stack);
-
-  const size_t num_inputs = schema.arguments().size() -
-      1; // -1 because the last argument is the list of preallocated tensors
-
-  c10::List<at::Tensor> outputs;
-  if (preallocated_outputs.isNone()) {
-    // either the schema doesn't support preallocated outputs or it does but
-    // they haven't been passed in. Pass a list of uninitialized tensors to
-    // the caffe2 operator as preallocated outputs.
-    outputs.resize(schema.returns().size());
-  } else {
-    AT_ASSERT(preallocated_outputs.isTensorList());
-    outputs = std::move(preallocated_outputs).toTensorList();
-  }
-
-  // TODO Avoid vector allocation. One idea would be to keep the std::vector
-  // instances in the cache.
-  std::vector<c10::IValue> inputs = torch::jit::pop(*stack, num_inputs);
-
-  // Convert outputs to caffe2::Tensor
-  const size_t num_outputs = outputs.size();
-  std::vector<caffe2::Tensor> outputs_c2(num_outputs);
-  for (auto i : c10::irange(num_outputs)) {
-    outputs_c2[i] = caffe2::Tensor(outputs.extract(i));
-  }
-
-  outputs_c2 = (*call_op)(schema, std::move(inputs), std::move(outputs_c2));
-  TORCH_INTERNAL_ASSERT(num_outputs == outputs_c2.size());
-
-  bool return_tensor_list = false;
-  if (schema.returns().size() == 1) {
-    auto type = schema.returns()[0].type();
-    if (c10::ListTypePtr list_type = type->cast<ListType>()) {
-      if (list_type->getElementType()->kind() == c10::TypeKind::TensorType) {
-        return_tensor_list = true;
-      }
-    }
-  }
-  if (return_tensor_list) {
-    for (const auto i : c10::irange(num_outputs)) {
-      outputs.set(i, at::Tensor(std::move(outputs_c2[i])));
-    }
-    torch::jit::push(*stack, outputs);
-  } else {
-    for (const auto i : c10::irange(num_outputs)) {
-      torch::jit::push(*stack, at::Tensor(std::move(outputs_c2[i])));
-    }
-  }
-
-  // postcondition: All inputs are cleared from the stack, there's now one
-  // IValue for each output which holds the result. This
-  // might reuse one of the preallocated tensors but doesn't have
-  // to.
+TORCH_API void call_caffe2_op_from_c10(
+    const c10::OperatorHandle &opHandle,
+    c10::Stack *stack,
+    _CallCaffe2OpFunc *call_op);
+
+template <class Caffe2Operator>
+void boxed_caffe2_operator(const OperatorHandle& opHandle, c10::Stack* stack) {
+  call_caffe2_op_from_c10(
+      opHandle,
+      stack,
+      &_call_caffe2_op<Caffe2Operator>);
 }
 
-template <const c10::FunctionSchema& (*Schema)(), class Caffe2Operator>
-void call_caffe2_op_from_c10(
-    const c10::OperatorHandle& /*opHandle*/,
-    c10::Stack* stack) {
-  _call_caffe2_op_from_c10(stack, Schema(), &_call_caffe2_op<Caffe2Operator>);
-}
+template <c10::DispatchKey key>
+struct TORCH_API RegisterDefinition {
+  RegisterDefinition(const char *name, c10::BoxedKernel kernel);
+};
 
-inline FunctionSchema make_function_schema_for_c10(
-    const char* schema_str,
-    c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
-#if !defined(EXPOSE_C2_OPS) && \
-    (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE))
-  throw std::logic_error(
-      "We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
-#else
-  c10::FunctionSchema parsed_schema = torch::jit::parseSchema(schema_str);
-  std::vector<c10::Argument> arguments = parsed_schema.arguments();
-  arguments.emplace_back(
-      PREALLOCATED_OUTPUT_ARGNAME,
-      c10::OptionalType::create(c10::ListType::ofTensors()),
-      nullopt,
-      IValue());
+extern template struct RegisterDefinition<c10::DispatchKey::CPU>;
+extern template struct RegisterDefinition<c10::DispatchKey::CUDA>;
+extern template struct RegisterDefinition<c10::DispatchKey::HIP>;
 
-  auto schema = FunctionSchema(
-      parsed_schema.name(),
-      parsed_schema.overload_name(),
-      std::move(arguments),
-      parsed_schema.returns(),
-      parsed_schema.is_vararg(),
-      parsed_schema.is_varret());
-  if (optional_alias_analysis_kind) {
-    schema.setAliasAnalysis(*optional_alias_analysis_kind);
-  }
-  return schema;
-#endif
-}
+struct TORCH_API RegisterSchema {
+  RegisterSchema(
+      const char *schema_str,
+      c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind);
+};
 
 } // namespace detail
 } // namespace caffe2
 
-/**
- * To register a caffe2 operator caffe2::MyOperator with the c10 dispatcher,
- * call:
- *
- * In caffe2/operators/MyOperator.h:
- *
- * > C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(C10MyOperator) // C10MyOperator is the
- * name
- * // used by c10 for this operator
- *
- * In caffe2/operators/MyOperator.cc
- *
- * > C10_EXPORT_CAFFE2_OP_TO_C10_CPU (
- * >    C10MyOperator,
- * >    "_caffe2::C10MyOperator(Tensor input1, int argument2, float argument3)
- * -> (Tensor output1, Tensor output2)" > caffe2::MyOperator
- * // This is the caffe2 operator > //
- * class template > )
- *
- * In caffe2/operators/MyOperator.cu
- *
- * > C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(C10MyOperator ,
- *   caffe2::MyOperator)
- *
- * Notes:
- * - all macros must be defined in the top level namespace, not in namespace
- *   caffe2.
- * - all operators must call C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 and
- *   C10_EXPORT_CAFFE2_OP_TO_C10_CPU .
- * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted if
- *   you don't want to expose the operator for CUDA operations.
- * - caffe2 arguments must come after caffe2 inputs, in other words, any tensor
- *   inputs must precede any non-tensor inputs.
- *
- * More complex use cases:
- * - If your operator has a variable number of input tensors, make the first (!)
- *   input an input of type TensorList. There must be no other tensor inputs.
- */
-#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)   \
-  namespace caffe2 {                                        \
-  namespace _c10_ops {                                      \
-  TORCH_API const FunctionSchema& schema_##OperatorName();  \
-  }                                                         \
-  }
-
-#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(                \
-    OperatorName, OperatorSchema, OptionalAliasAnalysisKind)    \
-  /* Register the op schema with the c10 dispatcher */          \
-  namespace caffe2 {                                            \
-  namespace _c10_ops {                                          \
-  C10_EXPORT const FunctionSchema& schema_##OperatorName() {    \
-    static const FunctionSchema schema =                        \
-        ::caffe2::detail::make_function_schema_for_c10(         \
-            OperatorSchema, OptionalAliasAnalysisKind);         \
-    return schema;                                              \
-  }                                                             \
-  TORCH_LIBRARY_FRAGMENT(_caffe2, m) {                          \
-    m.def(::caffe2::detail::make_function_schema_for_c10(       \
-        OperatorSchema, OptionalAliasAnalysisKind));             \
-  }                                                             \
-  }                                                             \
-  }
-
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(                         \
-    OperatorName, OperatorClass)                                             \
-  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-  TORCH_LIBRARY_IMPL(_caffe2, CPU, m) {                                      \
-    m.impl(                                                                  \
-        "_caffe2::" #OperatorName,                                           \
-        torch::CppFunction::makeFromBoxedFunction<                           \
-            ::caffe2::detail::call_caffe2_op_from_c10<                       \
-                ::caffe2::_c10_ops::schema_##OperatorName,                   \
-                OperatorClass>>());                                          \
-  }
 
+#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)
 
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(               \
-    OperatorName, OperatorSchema, OperatorClass)       \
-  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(             \
-      OperatorName, OperatorSchema, c10::nullopt)      \
+#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(                \
+    OperatorName, OperatorSchema, OptionalAliasAnalysisKind)    \
+  /* Register the op schema with the c10 dispatcher */          \
+  static const caffe2::detail::RegisterSchema                   \
+      C10_ANONYMOUS_VARIABLE(RegisterSchema_static_init_)(      \
+          OperatorSchema, OptionalAliasAnalysisKind);
+
+#define _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(                                    \
+    OperatorName, OperatorClass, Key)                                        \
+  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
+  static const caffe2::detail::RegisterDefinition<c10::DispatchKey::Key>     \
+      C10_ANONYMOUS_VARIABLE(Register##Key##Definition_static_init_)(        \
+          "_caffe2::" #OperatorName,                                         \
+          c10::BoxedKernel::makeFromFunction<                                \
+              &::caffe2::detail::boxed_caffe2_operator<OperatorClass>>());
+
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(    \
+    OperatorName, OperatorClass)                        \
+  _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(OperatorName, OperatorClass, CPU)
+
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(               \
+    OperatorName, OperatorSchema, OperatorClass)       \
+  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(             \
+      OperatorName, OperatorSchema, c10::nullopt)      \
   C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
 
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_WITH_ALIAS_ANALYSIS(                 \
-    OperatorName, OperatorSchema, OperatorClass, OptionalAliasAnalysisKind)  \
-  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(                                   \
-      OperatorName, OperatorSchema, OptionalAliasAnalysisKind)               \
-  C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass) \
+  _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(OperatorName, OperatorClass, CUDA)
 
-#define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass)        \
-  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-  TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) {                                     \
-    m.impl(                                                                  \
-        "_caffe2::" #OperatorName,                                           \
-        torch::CppFunction::makeFromBoxedFunction<                           \
-            ::caffe2::detail::call_caffe2_op_from_c10<                       \
-                ::caffe2::_c10_ops::schema_##OperatorName,                   \
-                OperatorClass>>());                                          \
-  }
 
 // You should never manually call the C10_EXPORT_CAFFE2_OP_TO_C10_HIP macro.
 // The C10_EXPORT_CAFFE2_OP_TO_C10_CUDA macro from above will be automatically
 // rewritten to C10_EXPORT_CAFFE2_OP_TO_C10_HIP by hipify.
-#define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass)         \
-  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-  TORCH_LIBRARY_IMPL(_caffe2, HIP, m) {                                      \
-    m.impl(                                                                  \
-        "_caffe2::" #OperatorName,                                           \
-        torch::CppFunction::makeFromBoxedFunction<                           \
-            ::caffe2::detail::call_caffe2_op_from_c10<                       \
-                ::caffe2::_c10_ops::schema_##OperatorName,                   \
-                OperatorClass>>());                                          \
-  }
+#define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass) \
+  _C10_EXPORT_CAFFE2_OP_TO_C10_KEY(OperatorName, OperatorClass, HIP)
+
 
 #else
 
 // Don't use c10 dispatcher on mobile because of binary size
diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu
index 434a9b797cf51b..bdf94e2b4193fe 100644
--- a/caffe2/sgd/adagrad_fused_op_gpu.cu
+++ b/caffe2/sgd/adagrad_fused_op_gpu.cu
@@ -1,4 +1,4 @@
-#include
+#define TORCH_ASSERT_NO_OPERATORS
 #include
 #include
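
Note on call sites: usage of the export macros is unchanged by this patch; only their expansion moves from header-expanded TORCH_LIBRARY_FRAGMENT/TORCH_LIBRARY_IMPL blocks to the out-of-line RegisterSchema and RegisterDefinition helpers, which register the schema and kernels during static initialization. As a rough sketch of what a call site still looks like (C10MyOperator, the schema string, and caffe2::MyOperator are placeholders carried over from the doc comment deleted above, not operators touched by this patch):

// In the operator's header (placeholder operator), at top-level namespace:
#include <caffe2/core/export_caffe2_op_to_c10.h>
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(C10MyOperator)

// In the operator's .cc file, also at top-level namespace:
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    C10MyOperator,
    "_caffe2::C10MyOperator(Tensor input1, int argument2, float argument3) "
    "-> (Tensor output1, Tensor output2)",
    caffe2::MyOperator)

// Optionally, in the .cu file, to expose the CUDA kernel as well:
C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(C10MyOperator, caffe2::MyOperator)

After this patch C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 expands to nothing but is kept so existing call sites compile unchanged.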