Merge pull request #40 from howjmay/vpadd_s8 · plctlab/numpy@f3a3dac · GitHub
[go: up one dir, main page]

Skip to content

Commit f3a3dac

Browse files
authored
Merge pull request numpy#40 from howjmay/vpadd_s8
feat: Add vpadd_s8
2 parents 926bc06 + b855c91 commit f3a3dac

File tree

2 files changed

+29
-2
lines changed

2 files changed

+29
-2
lines changed

neon2rvv.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -999,7 +999,17 @@ FORCE_INLINE int8x8_t vmax_s8(int8x8_t __a, int8x8_t __b) { return __riscv_vmax_
999999

10001000
// FORCE_INLINE uint32x4_t vminq_u32(uint32x4_t __a, uint32x4_t __b);
10011001

1002-
// FORCE_INLINE int8x8_t vpadd_s8(int8x8_t __a, int8x8_t __b);
1002+
/**
 * Pairwise add of adjacent signed 8-bit lanes (NEON vpadd_s8 on RVV).
 *
 * Lanes 0..3 of the result are __a[0]+__a[1], __a[2]+__a[3], __a[4]+__a[5],
 * __a[6]+__a[7]; lanes 4..7 are the same pairwise sums taken from __b.
 */
FORCE_INLINE int8x8_t vpadd_s8(int8x8_t __a, int8x8_t __b) {
  // 0x55 == 0b01010101: one mask bit per lane, set for even lanes 0, 2, 4, 6.
  const uint8_t even_lane_bits[] = {0x55};
  vbool16_t even_lanes = __riscv_vlm_v_b16(even_lane_bits, 8);

  // Lane i of x + slidedown(x, 1) is x[i] + x[i + 1]. Only the even lanes
  // carry the wanted pair sums, so vcompress packs those to the front.
  vint8mf2_t a_shifted = __riscv_vslidedown_vx_i8mf2(__a, 1, 8);
  vint8mf2_t a_pair_sums = __riscv_vadd_vv_i8mf2(__a, a_shifted, 8);
  vint8mf2_t a_packed = __riscv_vcompress_vm_i8mf2(a_pair_sums, even_lanes, 8);

  vint8mf2_t b_shifted = __riscv_vslidedown_vx_i8mf2(__b, 1, 8);
  vint8mf2_t b_pair_sums = __riscv_vadd_vv_i8mf2(__b, b_shifted, 8);
  vint8mf2_t b_packed = __riscv_vcompress_vm_i8mf2(b_pair_sums, even_lanes, 8);

  // Keep the four sums from __a in lanes 0..3 and place the four sums from
  // __b into lanes 4..7.
  return __riscv_vslideup_vx_i8mf2(a_packed, b_packed, 4, 8);
}
10031013

10041014
// FORCE_INLINE int16x4_t vpadd_s16(int16x4_t __a, int16x4_t __b);
10051015

tests/impl.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1600,7 +1600,24 @@ result_t test_vminq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
16001600

16011601
// Stub: always returns TEST_UNIMPL; neither argument is used.
result_t test_vminq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  return TEST_UNIMPL;
}
16021602

1603-
result_t test_vpadd_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
1603+
// Checks vpadd_s8 against a scalar reference: expected lanes 0..3 are the sums
// of adjacent pairs from the first input buffer, lanes 4..7 the sums of
// adjacent pairs from the second.
result_t test_vpadd_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  const int8_t *src_a = (int8_t *)impl.test_cases_int_pointer1;
  const int8_t *src_b = (int8_t *)impl.test_cases_int_pointer2;

  // Scalar reference; each sum is narrowed back to int8_t on assignment.
  int8_t expected[8];
  for (int pair = 0; pair < 4; pair++) {
    expected[pair] = src_a[2 * pair] + src_a[2 * pair + 1];
    expected[pair + 4] = src_b[2 * pair] + src_b[2 * pair + 1];
  }

  int8x8_t actual = vpadd_s8(vld1_s8(src_a), vld1_s8(src_b));
  return validate_int8(actual, expected[0], expected[1], expected[2], expected[3],
                       expected[4], expected[5], expected[6], expected[7]);
}
16041621

16051622
// Stub: always returns TEST_UNIMPL; neither argument is used.
result_t test_vpadd_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  return TEST_UNIMPL;
}
16061623

0 commit comments

Comments
 (0)
0