Merge pull request numpy#59 from howjmay/vmlsl_s8

howjmay · web-flow · commit ed7594d9ab33 · 2023-11-09T19:19:20.000+01:00
feat: Add vmlsl_s8
diff --git a/neon2rvv.h b/neon2rvv.h
@@ -476,7 +476,9 @@ FORCE_INLINE int8x8_t vmls_s8(int8x8_t __a, int8x8_t __b, int8x8_t __c) {
 
 // FORCE_INLINE uint32x4_t vmlsq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c);
 
-// FORCE_INLINE int16x8_t vmlsl_s8(int16x8_t __a, int8x8_t __b, int8x8_t __c);
+FORCE_INLINE int16x8_t vmlsl_s8(int16x8_t __a, int8x8_t __b, int8x8_t __c) {  // returns __a - (__b * __c), with the product formed by a widening 8-bit -> 16-bit multiply
+  return __riscv_vsub_vv_i16m1(__a, __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vwmul_vv_i16m2(__b, __c, 8)), 8);  // vl = 8 for both ops; the i16m2 product is truncated to i16m1 before the subtract
+}
 
 // FORCE_INLINE int32x4_t vmlsl_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c);
 
diff --git a/tests/impl.cpp b/tests/impl.cpp
@@ -1106,7 +1106,25 @@ result_t test_vmlsq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
 
 result_t test_vmlsq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
-result_t test_vmlsl_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmlsl_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // The accumulator and the first multiplicand both come from buffer 1 (the latter at byte offset 8).
+  const int16_t *acc = (int16_t *)impl.test_cases_int_pointer1;
+  const int8_t *lhs = (int8_t *)impl.test_cases_int_pointer1 + 8;
+  const int8_t *rhs = (int8_t *)impl.test_cases_int_pointer2;
+  // Scalar reference: expected[i] = acc[i] - lhs[i] * rhs[i], stored back as int16_t.
+  int16_t expected[8];
+  for (int i = 0; i < 8; i++) {
+    expected[i] = acc[i] - (int16_t)lhs[i] * (int16_t)rhs[i];
+  }
+
+  int16x8_t a = vld1q_s16(acc);
+  int8x8_t b = vld1_s8(lhs);
+  int8x8_t c = vld1_s8(rhs);
+  int16x8_t d = vmlsl_s8(a, b, c);
+  // Hand the vector result and the eight scalar reference values to validate_int16.
+  return validate_int16(d, expected[0], expected[1], expected[2], expected[3],
+                        expected[4], expected[5], expected[6], expected[7]);
+}
 
 result_t test_vmlsl_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }