Enable qint8 and quint8 add for AArch64 using ACL directly (#148653) · pytorch/pytorch@6c2db8f · GitHub
Commit 6c2db8f

fadara01 authored and pytorchmergebot committed
Enable qint8 and quint8 add for AArch64 using ACL directly (#148653)
This enables qint8 and quint8 add for AArch64 through Arm Compute Library (ACL) directly. The relative performance improvement is ~15x with OMP_NUM_THREADS=1 and ~5.4x with OMP_NUM_THREADS=32.

Co-authored-by: David Svantesson <david.svantesson-yeung@arm.com>
Pull Request resolved: #148653
Approved by: https://github.com/malfet
ghstack dependencies: #148585
1 parent 2e0c98f commit 6c2db8f
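
For context, the operator this change accelerates is eager-mode quantized add. Below is a minimal Python sketch of exercising it; the shapes, scales, and zero points are arbitrary illustrative values. Per the dispatch logic added in BinaryOps.cpp, the ACL path is taken only on AArch64 builds with ACL enabled, when both inputs share the same shape and dtype (qint8 or quint8), use per-tensor quantization, and no ReLU is fused.

import torch

# Arbitrary example values; per-tensor affine quantization only.
a = torch.randn(64, 256)
b = torch.randn(64, 256)
qa = torch.quantize_per_tensor(a, scale=0.05, zero_point=0, dtype=torch.qint8)
qb = torch.quantize_per_tensor(b, scale=0.05, zero_point=0, dtype=torch.qint8)

# Quantized add with an explicit output scale/zero point. On AArch64 builds
# with ACL enabled this routes to acl_qadd (NEArithmeticAddition under the hood).
qc = torch.ops.quantized.add(qa, qb, 0.1, 0)
print(qc.int_repr()[:2, :8])
print(qc.dequantize()[:2, :8])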

File tree

3 files changed: +140 -0 lines changed

aten/src/ATen/native/quantized/cpu/ACLUtils.cpp
Lines changed: 45 additions & 0 deletions

@@ -288,6 +288,51 @@ void StaticQuantMatmul::configure() {
       gemm_info_);
 }
 
+QuantAdd::QuantAdd(
+    arm_compute::DataType dtype,
+    const std::vector<int64_t>& input_dims,
+    double qa_scale,
+    int64_t qa_offset,
+    double qb_scale,
+    int64_t qb_offset,
+    double dst_scale,
+    int64_t dst_offset) {
+  arm_compute::QuantizationInfo qa_qinfo = {
+      static_cast<float>(qa_scale), static_cast<int32_t>(qa_offset), false};
+  arm_compute::QuantizationInfo qb_qinfo = {
+      static_cast<float>(qb_scale), static_cast<int32_t>(qb_offset), false};
+  arm_compute::QuantizationInfo qdst_qinfo = {
+      static_cast<float>(dst_scale), static_cast<int32_t>(dst_offset), false};
+
+  arm_compute::TensorShape qa_acl_tensor_shape;
+  arm_compute::TensorShape qb_acl_tensor_shape;
+  arm_compute::TensorShape qdst_acl_tensor_shape;
+  for (int i = input_dims.size() - 1; i >= 0; i--) {
+    qa_acl_tensor_shape.set(i, input_dims[i], false, true);
+    qb_acl_tensor_shape.set(i, input_dims[i], false, true);
+    qdst_acl_tensor_shape.set(i, input_dims[i], false, true);
+  }
+  arm_compute::TensorInfo qa_acl_tensor_info(
+      qa_acl_tensor_shape, 1, dtype, qa_qinfo);
+  arm_compute::TensorInfo qb_acl_tensor_info(
+      qb_acl_tensor_shape, 1, dtype, qb_qinfo);
+  arm_compute::TensorInfo qdst_acl_tensor_info(
+      qdst_acl_tensor_shape, 1, dtype, qdst_qinfo);
+
+  qa_tensor.allocator()->init(qa_acl_tensor_info);
+  qb_tensor.allocator()->init(qb_acl_tensor_info);
+  qdst_tensor.allocator()->init(qdst_acl_tensor_info);
+}
+
+arm_compute::Status QuantAdd::validate() {
+  return q_add.validate(
+      qa_tensor.info(), qb_tensor.info(), qdst_tensor.info(), policy);
+}
+
+void QuantAdd::configure() {
+  q_add.configure(&qa_tensor, &qb_tensor, &qdst_tensor, policy);
+}
+
 } // namespace at::native::acl_utils
 
 PackedLinearWeightsACL::PackedLinearWeightsACL(

aten/src/ATen/native/quantized/cpu/ACLUtils.h
Lines changed: 24 additions & 0 deletions

@@ -8,6 +8,7 @@
 #include <arm_compute/core/TensorInfo.h>
 #include <arm_compute/function_info/ActivationLayerInfo.h>
 #include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
 #include <arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h>
 #include <arm_compute/runtime/NEON/functions/NEQuantizationLayer.h>
 #include <arm_compute/runtime/Tensor.h>
@@ -121,6 +122,29 @@ struct StaticQuantMatmul : public QuantMatmul {
   std::optional<at::Tensor> bia_q_tensor_orig_;
 };
 
+struct QuantAdd {
+  arm_compute::Tensor qa_tensor;
+  arm_compute::Tensor qb_tensor;
+  arm_compute::Tensor qdst_tensor;
+  arm_compute::NEArithmeticAddition q_add;
+
+  QuantAdd(
+      arm_compute::DataType dtype,
+      const std::vector<int64_t>& input_dims,
+      double qa_scale,
+      int64_t qa_offset,
+      double qb_scale,
+      int64_t qb_offset,
+      double dst_scale,
+      int64_t dst_offset);
+
+  arm_compute::Status validate();
+  void configure();
+
+ private:
+  arm_compute::ConvertPolicy policy{arm_compute::ConvertPolicy::SATURATE};
+};
+
 } // namespace at::native::acl_utils
 struct PackedLinearWeightsACL : public PackedLinearWeightsOnednn {
   using ACLQuantMatmul = at::native::acl_utils::QuantMatmul;

aten/src/ATen/native/quantized/cpu/BinaryOps.cpp
Lines changed: 71 additions & 0 deletions

@@ -5,6 +5,7 @@
 #include <ATen/ExpandUtils.h>
 #include <torch/library.h>
 #include <ATen/quantized/Quantizer.h>
+#include <ATen/native/quantized/cpu/ACLUtils.h>
 #include <ATen/native/quantized/cpu/BinaryOps.h>
 #include <ATen/native/quantized/cpu/QuantizedOps.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
@@ -384,6 +385,67 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
 }
 #endif // USE_XNNPACK
 
+#if AT_MKLDNN_ACL_ENABLED()
+Tensor acl_qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
+  TORCH_CHECK(
+      qa.qscheme() == kPerTensorAffine || qa.qscheme() == kPerTensorSymmetric,
+      "Only per tensor quantization is supported in ACL quantized add.");
+
+  Tensor qa_contig = qa.contiguous(qa.suggest_memory_format());
+  Tensor qb_contig = qb.contiguous(qa.suggest_memory_format());
+  auto qa_mem_format = qa_contig.suggest_memory_format();
+  Tensor dst = at::native::empty_affine_quantized(
+      at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
+      qa_contig.scalar_type(),
+      std::nullopt /* layout */,
+      kCPU,
+      std::nullopt /* pin_memory */,
+      scale,
+      zero_point,
+      qa_mem_format);
+
+  if (qb_contig.size(0) == 0) {
+    return dst;
+  }
+
+  auto input_dims = qa_contig.sizes().vec();
+  auto acl_dtype = dst.scalar_type() == kQInt8
+      ? arm_compute::DataType::QASYMM8_SIGNED
+      : arm_compute::DataType::QASYMM8;
+  auto acl_add = std::make_shared<acl_utils::QuantAdd>(
+      acl_dtype,
+      input_dims,
+      qa_contig.q_scale(),
+      qa_contig.q_zero_point(),
+      qb_contig.q_scale(),
+      qb_contig.q_zero_point(),
+      dst.q_scale(),
+      dst.q_zero_point());
+
+  auto status = acl_add->validate();
+  TORCH_CHECK(
+      status.error_code() == arm_compute::ErrorCode::OK,
+      "Arm Compute Library's Quantized Matmul Validation Failed: " +
+          status.error_description());
+
+  acl_add->configure();
+
+  acl_add->qa_tensor.allocator()->import_memory(qa_contig.data_ptr());
+  acl_add->qb_tensor.allocator()->import_memory(qb_contig.data_ptr());
+  acl_add->qdst_tensor.allocator()->import_memory(dst.data_ptr());
+
+  acl_add->q_add.run();
+
+  // this will not free memory, it will just tell ACL that we're no longer
+  // using the pointer
+  acl_add->qa_tensor.allocator()->free();
+  acl_add->qb_tensor.allocator()->free();
+  acl_add->qdst_tensor.allocator()->free();
+
+  return dst;
+}
+#endif // AT_MKLDNN_ACL_ENABLED()
+
 template <bool ReLUFused = false>
 Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   check_inputs(qa, qb);
@@ -406,6 +468,15 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
     }
 #endif // USE_PYTORCH_QNNPACK
   }
+
+#if AT_MKLDNN_ACL_ENABLED()
+  if (!ReLUFused && qa.ndimension() > 0 && qa.sizes() == qb.sizes() &&
+      qa.scalar_type() == qb.scalar_type() &&
+      (qa.scalar_type() == kQInt8 || qa.scalar_type() == kQUInt8)) {
+    return acl_qadd(qa, qb, scale, zero_point);
+  }
+#endif // AT_MKLDNN_ACL_ENABLED()
+
   auto qc = at::_empty_affine_quantized(
       qa.sizes(),
       at::device(kCPU)
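
The speedup figures quoted in the commit message compare against the previous fallback path. A rough timing sketch is below, assuming an AArch64 PyTorch build with ACL enabled (AT_MKLDNN_ACL_ENABLED); the shapes and quantization parameters are illustrative, and the thread count is controlled by setting OMP_NUM_THREADS before launching, so the single-thread and 32-thread cases are separate runs compared against a build without this change.

import torch
from torch.utils import benchmark

torch.manual_seed(0)
a = torch.randn(1, 64, 56, 56)
b = torch.randn(1, 64, 56, 56)
qa = torch.quantize_per_tensor(a, scale=0.04, zero_point=2, dtype=torch.quint8)
qb = torch.quantize_per_tensor(b, scale=0.04, zero_point=2, dtype=torch.quint8)

timer = benchmark.Timer(
    stmt="torch.ops.quantized.add(qa, qb, 0.08, 2)",
    globals={"torch": torch, "qa": qa, "qb": qb},
)
# Median seconds per call for the quantized add under the current thread count.
print(timer.blocked_autorange(min_run_time=1.0).median)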

0 commit comments

Comments
 (0)
0