#include <ATen/ExpandUtils.h>
#include <torch/library.h>
#include <ATen/quantized/Quantizer.h>
+#include <ATen/native/quantized/cpu/ACLUtils.h>
#include <ATen/native/quantized/cpu/BinaryOps.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
@@ -384,6 +385,67 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
}
#endif // USE_XNNPACK

+#if AT_MKLDNN_ACL_ENABLED()
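+// Quantized elementwise add implemented with Arm Compute Library (ACL)
+// kernels; the result is requantized to the requested scale / zero_point.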
+Tensor acl_qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
+  TORCH_CHECK(
+      qa.qscheme() == kPerTensorAffine || qa.qscheme() == kPerTensorSymmetric,
+      "Only per tensor quantization is supported in ACL quantized add.");
+
+  Tensor qa_contig = qa.contiguous(qa.suggest_memory_format());
+  Tensor qb_contig = qb.contiguous(qa.suggest_memory_format());
+  auto qa_mem_format = qa_contig.suggest_memory_format();
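+  // Allocate the destination with the broadcast of the two input shapes and
+  // the requested output quantization parameters.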
+  Tensor dst = at::native::empty_affine_quantized(
+      at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
+      qa_contig.scalar_type(),
+      std::nullopt /* layout */,
+      kCPU,
+      std::nullopt /* pin_memory */,
+      scale,
+      zero_point,
+      qa_mem_format);
+
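+  // Nothing to add for empty inputs; return the (empty) destination as-is.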
+  if (qb_contig.size(0) == 0) {
+    return dst;
+  }
+
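+  // Map the quantized dtype to its ACL equivalent and describe the addition
+  // (shapes, scales, zero points) to ACL.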
+  auto input_dims = qa_contig.sizes().vec();
+  auto acl_dtype = dst.scalar_type() == kQInt8
+      ? arm_compute::DataType::QASYMM8_SIGNED
+      : arm_compute::DataType::QASYMM8;
+  auto acl_add = std::make_shared<acl_utils::QuantAdd>(
+      acl_dtype,
+      input_dims,
+      qa_contig.q_scale(),
+      qa_contig.q_zero_point(),
+      qb_contig.q_scale(),
+      qb_contig.q_zero_point(),
+      dst.q_scale(),
+      dst.q_zero_point());
+
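+  // Let ACL verify that it supports this combination of shapes, dtypes and
+  // quantization parameters before configuring and running the kernel.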
+  auto status = acl_add->validate();
+  TORCH_CHECK(
+      status.error_code() == arm_compute::ErrorCode::OK,
+ " Arm Compute Library's Quantized Matmul Validation Failed: " +
+          status.error_description());
+
+  acl_add->configure();
+
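+  // Wrap the existing PyTorch buffers in the ACL tensors; import_memory()
+  // does not copy the data.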
+  acl_add->qa_tensor.allocator()->import_memory(qa_contig.data_ptr());
+  acl_add->qb_tensor.allocator()->import_memory(qb_contig.data_ptr());
+  acl_add->qdst_tensor.allocator()->import_memory(dst.data_ptr());
+
+  acl_add->q_add.run();
+
+  // free() does not release the imported buffers; it just tells ACL that we
+  // are no longer using these pointers.
+  acl_add->qa_tensor.allocator()->free();
+  acl_add->qb_tensor.allocator()->free();
+  acl_add->qdst_tensor.allocator()->free();
+
+  return dst;
+}
+#endif // AT_MKLDNN_ACL_ENABLED()
+
template <bool ReLUFused = false>
Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
  check_inputs(qa, qb);
@@ -406,6 +468,15 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
    }
#endif // USE_PYTORCH_QNNPACK
  }
+
+#if AT_MKLDNN_ACL_ENABLED()
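+  // Use ACL only for the plain (non-ReLU-fused) add of same-shape, same-dtype
+  // qint8 / quint8 tensors; everything else falls through to the generic
+  // path below.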
+  if (!ReLUFused && qa.ndimension() > 0 && qa.sizes() == qb.sizes() &&
+      qa.scalar_type() == qb.scalar_type() &&
+      (qa.scalar_type() == kQInt8 || qa.scalar_type() == kQUInt8)) {
+    return acl_qadd(qa, qb, scale, zero_point);
+  }
+#endif // AT_MKLDNN_ACL_ENABLED()
+
  auto qc = at::_empty_affine_quantized(
      qa.sizes(),
      at::device(kCPU)