Enable qint8 and quint8 add for AArch64 using ACL directly · pytorch/pytorch@f3f8c72 · GitHub

Commit f3f8c72
Enable qint8 and quint8 add for AArch64 using ACL directly
This enables qint8 and quint8 add for AArch64 directly through the Arm Compute Library (ACL). It builds on the changes in PR #145942, which enable the use of ACL directly in ATen. The relative performance improvement is ~15x with OMP_NUM_THREADS=1 and ~5.4x with OMP_NUM_THREADS=32.
1 parent 1542c78 commit f3f8c72
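For context, the operator this commit accelerates is quantized::add. The following is a minimal C++ sketch of exercising it through the dispatcher; it is a sketch only: the shapes, scales and zero points are illustrative, and per-tensor qint8 inputs of equal size are assumed so the new ACL path can be taken.

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>

int main() {
  // Two per-tensor-quantized qint8 operands of the same size, which is the
  // case routed to ACL by this commit (scale/zero_point values are illustrative).
  at::Tensor qa = at::quantize_per_tensor(at::rand({8, 64}), /*scale=*/0.05, /*zero_point=*/0, at::kQInt8);
  at::Tensor qb = at::quantize_per_tensor(at::rand({8, 64}), /*scale=*/0.05, /*zero_point=*/0, at::kQInt8);

  // quantized::add requantizes the sum with the given output scale/zero_point.
  auto op = c10::Dispatcher::singleton()
                .findSchemaOrThrow("quantized::add", "")
                .typed<at::Tensor(at::Tensor, at::Tensor, double, int64_t)>();
  at::Tensor qc = op.call(qa, qb, /*scale=*/0.1, /*zero_point=*/0);
  return 0;
}

As the guard added to qadd below shows, the ReLU-fused variant and inputs with mismatched sizes still fall through to the existing paths.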

2 files changed: +106 -0 lines changed

aten/src/ATen/native/quantized/cpu/ACLUtils.h

Lines changed: 16 additions & 0 deletions
@@ -9,6 +9,7 @@
 #include <arm_compute/function_info/ActivationLayerInfo.h>
 #include <arm_compute/runtime/Allocator.h>
 #include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
 #include <arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h>
 #include <arm_compute/runtime/NEON/functions/NEQuantizationLayer.h>
 #include <arm_compute/runtime/Tensor.h>
@@ -63,6 +64,21 @@ struct ACLDynamicQuantMatmul {
   }
 };
 
+struct ACLInt8Add {
+  arm_compute::Tensor qa_acl_tensor;
+  arm_compute::Tensor qb_acl_tensor;
+  arm_compute::Tensor qdst_acl_tensor;
+  arm_compute::NEArithmeticAddition acl_add;
+
+  ~ACLInt8Add() {
+    // Release the ACL tensor allocators. The underlying buffers are imported
+    // from ATen tensors via import_memory(), so this only detaches them.
+    qa_acl_tensor.allocator()->free();
+    qb_acl_tensor.allocator()->free();
+    qdst_acl_tensor.allocator()->free();
+  }
+};
+
 } // namespace at::native::acl_utils
 
 struct PackedLinearWeightsACL : public PackedLinearWeightsOnednn {
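For readers unfamiliar with ACL's NEON runtime, ACLInt8Add simply bundles the objects one configured addition needs. The following standalone sketch (not part of the commit; shapes and quantization parameters are illustrative) shows the same API sequence with ACL-owned buffers, whereas the ATen path in the next file imports existing buffers instead of allocating:

#include <arm_compute/core/QuantizationInfo.h>
#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/TensorShape.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
#include <arm_compute/runtime/Tensor.h>

int main() {
  using namespace arm_compute;

  // Per-tensor quantized signed 8-bit operands; ACL shapes list the
  // fastest-varying dimension first.
  TensorShape shape(64, 8);
  QuantizationInfo qa_qinfo(0.05f, 0);
  QuantizationInfo qb_qinfo(0.05f, 0);
  QuantizationInfo qdst_qinfo(0.1f, 0);

  Tensor qa, qb, qdst;
  qa.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qa_qinfo));
  qb.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qb_qinfo));
  qdst.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qdst_qinfo));

  // Configure the saturating quantized addition, then back the tensors with
  // ACL-owned memory.
  NEArithmeticAddition add;
  add.configure(&qa, &qb, &qdst, ConvertPolicy::SATURATE);

  qa.allocator()->allocate();
  qb.allocator()->allocate();
  qdst.allocator()->allocate();

  // ... fill qa and qb with quantized int8 data, then run the kernel ...
  add.run();

  qa.allocator()->free();
  qb.allocator()->free();
  qdst.allocator()->free();
  return 0;
}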

aten/src/ATen/native/quantized/cpu/BinaryOps.cpp

Lines changed: 90 additions & 0 deletions
@@ -5,6 +5,7 @@
 #include <ATen/ExpandUtils.h>
 #include <torch/library.h>
 #include <ATen/quantized/Quantizer.h>
+#include <ATen/native/quantized/cpu/ACLUtils.h>
 #include <ATen/native/quantized/cpu/BinaryOps.h>
 #include <ATen/native/quantized/cpu/QuantizedOps.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
@@ -384,6 +385,83 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
 }
 #endif // USE_XNNPACK
 
+#if AT_MKLDNN_ACL_ENABLED()
+template <typename T>
+Tensor acl_qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
+  TORCH_CHECK(qa.ndimension() > 0, "acl_qadd(): Got empty input tensor.");
+  TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
+              qa.qscheme() == kPerTensorSymmetric,
+              "Only per tensor quantization is supported in ACL quantized add.");
+
+  auto qa_mem_format = qa.suggest_memory_format();
+  Tensor dst = at::native::empty_affine_quantized(
+      at::infer_size_dimvector(qa.sizes(), qb.sizes()),
+      qa.scalar_type(),
+      std::nullopt /* layout */,
+      kCPU,
+      std::nullopt /* pin_memory */,
+      scale,
+      zero_point,
+      qa_mem_format);
+
+  if (qb.size(0) == 0) {
+    return dst;
+  }
+
+  // acl_obj is created per call for now, so that it can be cached in the future.
+  std::shared_ptr<at::native::acl_utils::ACLInt8Add> acl_obj = std::make_shared<at::native::acl_utils::ACLInt8Add>();
+
+  auto input_dims = qa.sizes().vec();
+
+  arm_compute::TensorShape qa_acl_tensor_shape;
+  arm_compute::TensorShape qb_acl_tensor_shape;
+  arm_compute::TensorShape qdst_acl_tensor_shape;
+  for (size_t i = input_dims.size() - 1; i != (size_t)-1; i--) {
+    qa_acl_tensor_shape.set(i, input_dims[i], false, true);
+    qb_acl_tensor_shape.set(i, input_dims[i], false, true);
+    qdst_acl_tensor_shape.set(i, input_dims[i], false, true);
+  }
+
+  arm_compute::QuantizationInfo qa_qinfo = { static_cast<float>(qa.q_scale()), static_cast<int32_t>(qa.q_zero_point()), false };
+  arm_compute::QuantizationInfo qb_qinfo = { static_cast<float>(qb.q_scale()), static_cast<int32_t>(qb.q_zero_point()), false };
+  arm_compute::QuantizationInfo qdst_qinfo = { static_cast<float>(scale), static_cast<int32_t>(zero_point), false };
+
+  arm_compute::DataType acl_data_t;
+  if (typeid(T) == typeid(int8_t)) {
+    acl_data_t = arm_compute::DataType::QASYMM8_SIGNED;
+  }
+  else {
+    acl_data_t = arm_compute::DataType::QASYMM8; // unsigned
+  }
+
+  arm_compute::TensorInfo qa_acl_tensor_info(qa_acl_tensor_shape, 1, acl_data_t, qa_qinfo);
+  arm_compute::TensorInfo qb_acl_tensor_info(qb_acl_tensor_shape, 1, acl_data_t, qb_qinfo);
+  arm_compute::TensorInfo qdst_acl_tensor_info(qdst_acl_tensor_shape, 1, acl_data_t, qdst_qinfo);
+
+  acl_obj->qa_acl_tensor.allocator()->init(qa_acl_tensor_info);
+  acl_obj->qb_acl_tensor.allocator()->init(qb_acl_tensor_info);
+  acl_obj->qdst_acl_tensor.allocator()->init(qdst_acl_tensor_info);
+
+  arm_compute::ConvertPolicy policy = arm_compute::ConvertPolicy::SATURATE;
+  auto stat = acl_obj->acl_add.validate(&qa_acl_tensor_info, &qb_acl_tensor_info, &qdst_acl_tensor_info, policy);
+  TORCH_CHECK(stat.error_code() == arm_compute::ErrorCode::OK,
+              "arm_compute::NEArithmeticAddition config error.");
+  acl_obj->acl_add.configure(&(acl_obj->qa_acl_tensor), &(acl_obj->qb_acl_tensor), &(acl_obj->qdst_acl_tensor), policy);
+
+  acl_obj->qa_acl_tensor.allocator()->import_memory((T*)qa.data_ptr());
+  acl_obj->qb_acl_tensor.allocator()->import_memory((T*)qb.data_ptr());
+  acl_obj->qdst_acl_tensor.allocator()->import_memory((T*)dst.data_ptr());
+
+  // If acl_obj is created once and cached, the QuantizationInfo must be reset on reuse with:
+  // acl_obj->qa_acl_tensor.info()->set_quantization_info(qa_qinfo);
+  // acl_obj->qb_acl_tensor.info()->set_quantization_info(qb_qinfo);
+  // acl_obj->qdst_acl_tensor.info()->set_quantization_info(qdst_qinfo);
+  acl_obj->acl_add.run();
+
+  return dst;
+}
+#endif // AT_MKLDNN_ACL_ENABLED()
+
 template <bool ReLUFused = false>
 Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   check_inputs(qa, qb);
@@ -406,6 +484,18 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
     }
 #endif // USE_PYTORCH_QNNPACK
   }
+
+#if AT_MKLDNN_ACL_ENABLED()
+  if ((!ReLUFused) && (qa.sizes() == qb.sizes())) {
+    if (qa.scalar_type() == c10::ScalarType::QInt8) {
+      return acl_qadd<int8_t>(qa, qb, scale, zero_point);
+    }
+    else if (qa.scalar_type() == c10::ScalarType::QUInt8) {
+      return acl_qadd<uint8_t>(qa, qb, scale, zero_point);
+    }
+  }
+#endif // AT_MKLDNN_ACL_ENABLED()
+
   auto qc = at::_empty_affine_quantized(
       qa.sizes(),
       at::device(kCPU)
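The two inline comments about caching acl_obj point at a follow-up in which the configured ACLInt8Add is built once and reused. A hypothetical sketch of that reuse path is below; the function, its name, and the caller-managed cache are assumptions for illustration and are not part of this commit:

// Hypothetical reuse of an ACLInt8Add that was configured earlier for these
// shapes and dtype (illustrative only; the commit builds a fresh object per call).
#include <ATen/core/Tensor.h>
#include <ATen/native/quantized/cpu/ACLUtils.h>

static void run_cached_acl_qadd(
    at::native::acl_utils::ACLInt8Add& acl_obj,
    const at::Tensor& qa,
    const at::Tensor& qb,
    at::Tensor& dst,
    double scale,
    int64_t zero_point) {
  // Detach whatever buffers were imported last time, then point the ACL
  // tensors at the current ATen buffers.
  acl_obj.qa_acl_tensor.allocator()->free();
  acl_obj.qb_acl_tensor.allocator()->free();
  acl_obj.qdst_acl_tensor.allocator()->free();
  acl_obj.qa_acl_tensor.allocator()->import_memory(qa.data_ptr());
  acl_obj.qb_acl_tensor.allocator()->import_memory(qb.data_ptr());
  acl_obj.qdst_acl_tensor.allocator()->import_memory(dst.data_ptr());

  // As the diff's comment notes, the quantization parameters must be refreshed
  // when the object is reused across calls.
  acl_obj.qa_acl_tensor.info()->set_quantization_info(arm_compute::QuantizationInfo(
      static_cast<float>(qa.q_scale()), static_cast<int32_t>(qa.q_zero_point())));
  acl_obj.qb_acl_tensor.info()->set_quantization_info(arm_compute::QuantizationInfo(
      static_cast<float>(qb.q_scale()), static_cast<int32_t>(qb.q_zero_point())));
  acl_obj.qdst_acl_tensor.info()->set_quantization_info(arm_compute::QuantizationInfo(
      static_cast<float>(scale), static_cast<int32_t>(zero_point)));

  acl_obj.acl_add.run();
}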
