Merge pull request #114 from howjmay/vrsubhn_s16 · plctlab/numpy@41613a9 · GitHub
[go: up one dir, main page]

Skip to content

Commit 41613a9

Browse files
authored
Merge pull request numpy#114 from howjmay/vrsubhn_s16
feat: Add vrsubhn_s16
2 parents 0e58fd8 + ae61621 commit 41613a9

File tree

2 files changed

+22
-7
lines changed

2 files changed

+22
-7
lines changed

neon2rvv.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -340,9 +340,8 @@ FORCE_INLINE int8x8_t vaddhn_s16(int16x8_t __a, int16x8_t __b) {
340 340
// FORCE_INLINE uint32x2_t vaddhn_u64(uint64x2_t __a, uint64x2_t __b);
341 341

342 342
FORCE_INLINE int8x8_t vraddhn_s16(int16x8_t __a, int16x8_t __b) {
343-
uint8_t mask_arr[] = {0xaa, 0xaa};
344-
vbool8_t mask = __riscv_vlm_v_b8(mask_arr, 16);
345-
int16x8_t add_ab = (__riscv_vadd_vv_i16m1(__a, __b, 8));
343+
vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8(0xaa));
344+
int16x8_t add_ab = __riscv_vadd_vv_i16m1(__a, __b, 8);
346 345
int16x8_t add_round = __riscv_vadd_vx_i16m1(add_ab, 1 << 7, 8);
347 346
return __riscv_vcompress_vm_i8m1(__riscv_vreinterpret_v_i16m1_i8m1(add_round), mask, 16);
348 347
}
@@ -738,7 +737,12 @@ FORCE_INLINE int8x8_t vsubhn_s16(int16x8_t __a, int16x8_t __b) {
738 737

739 738
// FORCE_INLINE uint32x2_t vsubhn_u64(uint64x2_t __a, uint64x2_t __b);
740 739

741-
// FORCE_INLINE int8x8_t vrsubhn_s16(int16x8_t __a, int16x8_t __b);
740+
FORCE_INLINE int8x8_t vrsubhn_s16(int16x8_t __a, int16x8_t __b) {
741+
vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8(0xaa));
742+
int16x8_t ab_sub = __riscv_vsub_vv_i16m1(__a, __b, 8);
743+
int16x8_t sub_round = __riscv_vadd_vx_i16m1(ab_sub, 1 << 7, 8);
744+
return __riscv_vcompress_vm_i8m1(__riscv_vreinterpret_v_i16m1_i8m1(sub_round), mask, 16);
745+
}
742 746

743 747
// FORCE_INLINE int16x4_t vrsubhn_s32(int32x4_t __a, int32x4_t __b);
744 748

tests/impl.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -847,7 +847,6 @@ result_t test_vraddhn_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
847 847
const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
848 848
const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
849 849
int8_t _c[8];
850-
851 850
const int16_t round = 1 << 7;
852 851
for (int i = 0; i < 8; i++) {
853 852
_c[i] = ((_a[i] + _b[i] + round) >> 8) & 0xff;
@@ -856,7 +855,6 @@ result_t test_vraddhn_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
856 855
int16x8_t a = vld1q_s16(_a);
857 856
int16x8_t b = vld1q_s16(_b);
858 857
int8x8_t c = vraddhn_s16(a, b);
859-
860 858
return validate_int8(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7]);
861 859
}
862 860

@@ -1566,7 +1564,20 @@ result_t test_vsubhn_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
1566 1564

1567 1565
result_t test_vsubhn_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
1568 1566

1569-
result_t test_vrsubhn_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
1567+
result_t test_vrsubhn_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
1568+
const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
1569+
const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
1570+
int8_t _c[8];
1571+
const int16_t round = 1 << 7;
1572+
for (int i = 0; i < 8; i++) {
1573+
_c[i] = ((_a[i] - _b[i] + round) >> 8) & 0xff;
1574+
}
1575+
1576+
int16x8_t a = vld1q_s16(_a);
1577+
int16x8_t b = vld1q_s16(_b);
1578+
int8x8_t c = vrsubhn_s16(a, b);
1579+
return validate_int8(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7]);
1580+
}
1570 1581

1571 1582
result_t test_vrsubhn_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
1572 1583

0 commit comments

Comments
 (0)
0