Merge pull request #59 from howjmay/vmlsl_s8 · plctlab/numpy@ed7594d · GitHub
[go: up one dir, main page]

Skip to content

Commit ed7594d

Browse files
authored
Merge pull request numpy#59 from howjmay/vmlsl_s8
feat: Add vmlsl_s8
2 parents 74d36ce + 31043fb commit ed7594d

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

neon2rvv.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,9 @@ FORCE_INLINE int8x8_t vmls_s8(int8x8_t __a, int8x8_t __b, int8x8_t __c) {
476476

477477
// FORCE_INLINE uint32x4_t vmlsq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c);
478478

479-
// FORCE_INLINE int16x8_t vmlsl_s8(int16x8_t __a, int8x8_t __b, int8x8_t __c);
479+
// vmlsl_s8: widening multiply-subtract, returning __a - (__b * __c).
// __b and __c are multiplied with the RVV widening multiply, which yields an
// i16m2 register group; that group is truncated to i16m1 and then subtracted
// from the accumulator __a. The literal 8 passed to the multiply and subtract
// intrinsics is their vector-length (vl) argument, i.e. 8 lanes are processed.
// NOTE(review): the LMUL truncation presumably relies on all 8 int16 lanes
// fitting in a single m1 register — confirm against the project's vector
// type definitions.
FORCE_INLINE int16x8_t vmlsl_s8(int16x8_t __a, int8x8_t __b, int8x8_t __c) {
480+
return __riscv_vsub_vv_i16m1(__a, __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vwmul_vv_i16m2(__b, __c, 8)), 8);
481+
}
480482

481483
// FORCE_INLINE int32x4_t vmlsl_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c);
482484

tests/impl.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1106,7 +1106,25 @@ result_t test_vmlsq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
11061106

11071107
// Stub: returns TEST_UNIMPL without exercising vmlsq_u32.
result_t test_vmlsq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
11081108

1109-
result_t test_vmlsl_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
1109+
// Test for vmlsl_s8: compares the intrinsic's result against a scalar
// reference computed lane by lane as a[i] - b[i] * c[i].
//   impl - supplies the input buffers (test_cases_int_pointer1/2).
//   iter - iteration index; not used by this test.
// Returns whatever validate_int16 reports for the eight lanes.
result_t test_vmlsl_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
1110+
const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
1111+
// NOTE(review): _b starts 8 bytes into the same buffer that _a reads 16 bytes
// from, so the two inputs overlap — confirm this is intended.
const int8_t *_b = (int8_t *)impl.test_cases_int_pointer1 + 8;
1112+
const int8_t *_c = (int8_t *)impl.test_cases_int_pointer2;
1113+
// Scalar reference: the int8 operands are widened before multiplying, and the
// difference is narrowed back to int16_t on assignment.
int16_t d0 = _a[0] - (int16_t)_b[0] * (int16_t)_c[0];
1114+
int16_t d1 = _a[1] - (int16_t)_b[1] * (int16_t)_c[1];
1115+
int16_t d2 = _a[2] - (int16_t)_b[2] * (int16_t)_c[2];
1116+
int16_t d3 = _a[3] - (int16_t)_b[3] * (int16_t)_c[3];
1117+
int16_t d4 = _a[4] - (int16_t)_b[4] * (int16_t)_c[4];
1118+
int16_t d5 = _a[5] - (int16_t)_b[5] * (int16_t)_c[5];
1119+
int16_t d6 = _a[6] - (int16_t)_b[6] * (int16_t)_c[6];
1120+
int16_t d7 = _a[7] - (int16_t)_b[7] * (int16_t)_c[7];
1121+
1122+
// Load the same inputs into vector registers and run the intrinsic under test.
int16x8_t a = vld1q_s16(_a);
1123+
int8x8_t b = vld1_s8(_b);
1124+
int8x8_t c = vld1_s8(_c);
1125+
int16x8_t d = vmlsl_s8(a, b, c);
1126+
return validate_int16(d, d0, d1, d2, d3, d4, d5, d6, d7);
1127+
}
11101128

11111129
// Stub: returns TEST_UNIMPL without exercising vmlsl_s16.
result_t test_vmlsl_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
11121130

0 commit comments

Comments
 (0)
0