#include <ATen/ExpandUtils.h>
#include <torch/library.h>
#include <ATen/quantized/Quantizer.h>
+#include <ATen/native/quantized/cpu/ACLUtils.h>
#include <ATen/native/quantized/cpu/BinaryOps.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
@@ -384,6 +385,83 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
}
#endif // USE_XNNPACK

+#if AT_MKLDNN_ACL_ENABLED()
+template <typename T>
+Tensor acl_qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
+  TORCH_CHECK(qa.ndimension() > 0, "acl_qadd(): Got empty input tensor.");
+  TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
+      qa.qscheme() == kPerTensorSymmetric,
+      "Only per tensor quantization is supported in ACL quantized add.");
+
+  auto qa_mem_format = qa.suggest_memory_format();
+  Tensor dst = at::native::empty_affine_quantized(
+      at::infer_size_dimvector(qa.sizes(), qb.sizes()),
+      qa.scalar_type(),
+      std::nullopt /* layout */,
+      kCPU,
+      std::nullopt /* pin_memory */,
+      scale,
+      zero_point,
+      qa_mem_format);
+
+  if (qb.size(0) == 0) {
+    return dst;
+  }
+
+  // acl_obj is created here so that it can be cached in the future
+  std::shared_ptr<at::native::acl_utils::ACLInt8Add> acl_obj = std::make_shared<at::native::acl_utils::ACLInt8Add>();
+
+  auto input_dims = qa.sizes().vec();
+
+  arm_compute::TensorShape qa_acl_tensor_shape;
+  arm_compute::TensorShape qb_acl_tensor_shape;
+  arm_compute::TensorShape qdst_acl_tensor_shape;
+  for (size_t i = input_dims.size() - 1; i != (size_t)-1; i--) {
+    qa_acl_tensor_shape.set(i, input_dims[i], false, true);
+    qb_acl_tensor_shape.set(i, input_dims[i], false, true);
+    qdst_acl_tensor_shape.set(i, input_dims[i], false, true);
+  }
+
+  arm_compute::QuantizationInfo qa_qinfo = {static_cast<float>(qa.q_scale()), static_cast<int32_t>(qa.q_zero_point()), false};
+  arm_compute::QuantizationInfo qb_qinfo = {static_cast<float>(qb.q_scale()), static_cast<int32_t>(qb.q_zero_point()), false};
+  arm_compute::QuantizationInfo qdst_qinfo = {static_cast<float>(scale), static_cast<int32_t>(zero_point), false};
+
+  arm_compute::DataType acl_data_t;
+  if (typeid(T) == typeid(int8_t)) {
+    acl_data_t = arm_compute::DataType::QASYMM8_SIGNED;
+  }
+  else {
+    acl_data_t = arm_compute::DataType::QASYMM8; // unsigned
+  }
+
+  arm_compute::TensorInfo qa_acl_tensor_info(qa_acl_tensor_shape, 1, acl_data_t, qa_qinfo);
+  arm_compute::TensorInfo qb_acl_tensor_info(qb_acl_tensor_shape, 1, acl_data_t, qb_qinfo);
+  arm_compute::TensorInfo qdst_acl_tensor_info(qdst_acl_tensor_shape, 1, acl_data_t, qdst_qinfo);
+
+  acl_obj->qa_acl_tensor.allocator()->init(qa_acl_tensor_info);
+  acl_obj->qb_acl_tensor.allocator()->init(qb_acl_tensor_info);
+  acl_obj->qdst_acl_tensor.allocator()->init(qdst_acl_tensor_info);
+
+  arm_compute::ConvertPolicy policy = arm_compute::ConvertPolicy::SATURATE;
+  auto stat = acl_obj->acl_add.validate(&qa_acl_tensor_info, &qb_acl_tensor_info, &qdst_acl_tensor_info, policy);
+  TORCH_CHECK(stat.error_code() == arm_compute::ErrorCode::OK,
+      "arm_compute::NEArithmeticAddition config error.");
+  acl_obj->acl_add.configure(&(acl_obj->qa_acl_tensor), &(acl_obj->qb_acl_tensor), &(acl_obj->qdst_acl_tensor), policy);
+
+  acl_obj->qa_acl_tensor.allocator()->import_memory((T*)qa.data_ptr());
+  acl_obj->qb_acl_tensor.allocator()->import_memory((T*)qb.data_ptr());
+  acl_obj->qdst_acl_tensor.allocator()->import_memory((T*)dst.data_ptr());
+
+  // If acl_obj is created once and cached, the QuantizationInfo needs to be reset with:
+  // acl_obj->qa_acl_tensor.info()->set_quantization_info(qa_qinfo);
+  // acl_obj->qb_acl_tensor.info()->set_quantization_info(qb_qinfo);
+  // acl_obj->qdst_acl_tensor.info()->set_quantization_info(qdst_qinfo);
+  acl_obj->acl_add.run();
+
+  return dst;
+}
+#endif // AT_MKLDNN_ACL_ENABLED()
+
template <bool ReLUFused = false>
Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
  check_inputs(qa, qb);
@@ -406,6 +484,18 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
    }
#endif // USE_PYTORCH_QNNPACK
  }
+
+#if AT_MKLDNN_ACL_ENABLED()
+  if ((!ReLUFused) && (qa.sizes() == qb.sizes())) {
+    if (qa.scalar_type() == c10::ScalarType::QInt8) {
+      return acl_qadd<int8_t>(qa, qb, scale, zero_point);
+    }
+    else if (qa.scalar_type() == c10::ScalarType::QUInt8) {
+      return acl_qadd<uint8_t>(qa, qb, scale, zero_point);
+    }
+  }
+#endif // AT_MKLDNN_ACL_ENABLED()
+
  auto qc = at::_empty_affine_quantized(
      qa.sizes(),
      at::device(kCPU)
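A minimal usage sketch, not part of the patch: it quantizes two tensors per-tensor and calls the quantized::add op through the dispatcher, which on an aarch64 build with AT_MKLDNN_ACL_ENABLED() can reach the acl_qadd path added above when the input shapes match and ReLU fusion is not requested. The shapes, scales, and zero points below are arbitrary illustrative values.

// Usage sketch only; values and shapes are assumptions for illustration.
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>

int main() {
  at::Tensor a = at::rand({2, 3});
  at::Tensor b = at::rand({2, 3});

  // Per-tensor affine quantization, as required by the ACL path above.
  at::Tensor qa = at::quantize_per_tensor(a, /*scale=*/0.1, /*zero_point=*/0, at::kQInt8);
  at::Tensor qb = at::quantize_per_tensor(b, /*scale=*/0.1, /*zero_point=*/0, at::kQInt8);

  // Call quantized::add via the dispatcher with the requested output qparams.
  static const auto op = c10::Dispatcher::singleton()
      .findSchemaOrThrow("quantized::add", "")
      .typed<at::Tensor(at::Tensor, at::Tensor, double, int64_t)>();
  at::Tensor qc = op.call(qa, qb, /*scale=*/0.2, /*zero_point=*/0);

  return 0;
}

Whether this call actually dispatches to ACL still depends on the build configuration and on the earlier XNNPACK/QNNPACK branches in qadd not returning first.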