Merge pull request numpy#40 from howjmay/vpadd_s8

howjmay · web-flow · commit f3a3dac1b1ec · 2023-11-05T16:26:47.000+01:00
feat: Add vpadd_s8
diff --git a/neon2rvv.h b/neon2rvv.h
@@ -999,7 +999,17 @@ FORCE_INLINE int8x8_t vmax_s8(int8x8_t __a, int8x8_t __b) { return __riscv_vmax_
 
 // FORCE_INLINE uint32x4_t vminq_u32(uint32x4_t __a, uint32x4_t __b);
 
-// FORCE_INLINE int8x8_t vpadd_s8(int8x8_t __a, int8x8_t __b);
+// Pairwise add: lanes 0-3 hold the adjacent-pair sums of __a, lanes 4-7 those of __b.
+FORCE_INLINE int8x8_t vpadd_s8(int8x8_t __a, int8x8_t __b) {
+  uint8_t even_lanes[] = {85};  // 0b01010101: one mask bit per even lane (0, 2, 4, 6)
+  vbool16_t even_mask = __riscv_vlm_v_b16(even_lanes, 8);
+  // Lane i of each sum is x[i] + x[i + 1]; only the even lanes are kept below.
+  vint8mf2_t a_sums = __riscv_vadd_vv_i8mf2(__a, __riscv_vslidedown_vx_i8mf2(__a, 1, 8), 8);
+  vint8mf2_t b_sums = __riscv_vadd_vv_i8mf2(__b, __riscv_vslidedown_vx_i8mf2(__b, 1, 8), 8);
+  vint8mf2_t lo = __riscv_vcompress_vm_i8mf2(a_sums, even_mask, 8);
+  vint8mf2_t hi = __riscv_vcompress_vm_i8mf2(b_sums, even_mask, 8);
+  return __riscv_vslideup_vx_i8mf2(lo, hi, 4, 8);
+}
 
 // FORCE_INLINE int16x4_t vpadd_s16(int16x4_t __a, int16x4_t __b);
 
diff --git a/tests/impl.cpp b/tests/impl.cpp
@@ -1600,7 +1600,24 @@ result_t test_vminq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
 
 result_t test_vminq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
-result_t test_vpadd_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+// Checks vpadd_s8 against a scalar reference built from the two test input buffers.
+result_t test_vpadd_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+  const int8_t *lhs = (int8_t *)impl.test_cases_int_pointer1;
+  const int8_t *rhs = (int8_t *)impl.test_cases_int_pointer2;
+
+  // want[0..3] are the adjacent-pair sums of lhs, want[4..7] those of rhs.
+  int8_t want[8];
+  for (int i = 0; i < 4; i++) {
+    want[i] = lhs[2 * i] + lhs[2 * i + 1];
+    want[i + 4] = rhs[2 * i] + rhs[2 * i + 1];
+  }
+
+  // Run the intrinsic under test on the same inputs.
+  int8x8_t a = vld1_s8(lhs);
+  int8x8_t b = vld1_s8(rhs);
+  int8x8_t got = vpadd_s8(a, b);
+  return validate_int8(got, want[0], want[1], want[2], want[3], want[4], want[5], want[6], want[7]);
+}
 
 result_t test_vpadd_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }