10000 ENH: [3/4] implement the NumPy C SIMD vectorization interface · numpy/numpy@5eab8e0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5eab8e0

Browse files
committed
ENH: [3/4] implement the NumPy C SIMD vectorization interface
implement the same intrinsics as X86 for NEON
1 parent 6539973 commit 5eab8e0

File tree

7 files changed

+748
-0
lines changed

7 files changed

+748
-0
lines changed
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#ifndef NPY_SIMD
2+
#error "Not a standalone header"
3+
#endif
4+
5+
#ifndef _NPY_SIMD_NEON_ARITHMETIC_H
6+
#define _NPY_SIMD_NEON_ARITHMETIC_H
7+
8+
/***************************
9+
* Addition
10+
***************************/
11+
// non-saturated: each npyv_add_<sfx> is a direct alias of the NEON vaddq_<sfx> intrinsic
// NOTE(review): the f64 alias is not guarded by __aarch64__ here; being a macro it only expands at the point of use - confirm callers guard f64 usage
12+
#define npyv_add_u8 vaddq_u8
13+
#define npyv_add_s8 vaddq_s8
14+
#define npyv_add_u16 vaddq_u16
15+
#define npyv_add_s16 vaddq_s16
16+
#define npyv_add_u32 vaddq_u32
17+
#define npyv_add_s32 vaddq_s32
18+
#define npyv_add_u64 vaddq_u64
19+
#define npyv_add_s64 vaddq_s64
20+
#define npyv_add_f32 vaddq_f32
21+
#define npyv_add_f64 vaddq_f64
22+
23+
// saturated: aliases of the NEON vqaddq_<sfx> intrinsics; only 8-bit and 16-bit lanes are defined here
24+
#define npyv_adds_u8 vqaddq_u8
25+
#define npyv_adds_s8 vqaddq_s8
26+
#define npyv_adds_u16 vqaddq_u16
27+
#define npyv_adds_s16 vqaddq_s16
28+
29+
/***************************
30+
* Subtraction
31+
***************************/
32+
// non-saturated: each npyv_sub_<sfx> is a direct alias of the NEON vsubq_<sfx> intrinsic
33+
#define npyv_sub_u8 vsubq_u8
34+
#define npyv_sub_s8 vsubq_s8
35+
#define npyv_sub_u16 vsubq_u16
36+
#define npyv_sub_s16 vsubq_s16
37+
#define npyv_sub_u32 vsubq_u32
38+
#define npyv_sub_s32 vsubq_s32
39+
#define npyv_sub_u64 vsubq_u64
40+
#define npyv_sub_s64 vsubq_s64
41+
#define npyv_sub_f32 vsubq_f32
42+
#define npyv_sub_f64 vsubq_f64
43+
44+
// saturated: aliases of the NEON vqsubq_<sfx> intrinsics; only 8-bit and 16-bit lanes are defined here
45+
#define npyv_subs_u8 vqsubq_u8
46+
#define npyv_subs_s8 vqsubq_s8
47+
#define npyv_subs_u16 vqsubq_u16
48+
#define npyv_subs_s16 vqsubq_s16
49+
50+
/***************************
51+
* Multiplication
52+
***************************/
53+
// non-saturated: each npyv_mul_<sfx> is a direct alias of the NEON vmulq_<sfx> intrinsic
// (no u64/s64 aliases are defined in this section)
54+
#define npyv_mul_u8 vmulq_u8
55+
#define npyv_mul_s8 vmulq_s8
56+
#define npyv_mul_u16 vmulq_u16
57+
#define npyv_mul_s16 vmulq_s16
58+
#define npyv_mul_u32 vmulq_u32
59+
#define npyv_mul_s32 vmulq_s32
60+
#define npyv_mul_f32 vmulq_f32
61+
#define npyv_mul_f64 vmulq_f64
62+
63+
/***************************
64+
* Division
65+
***************************/
66+
#ifdef __aarch64__
67+
#define npyv_div_f32 vdivq_f32
68+
#else
69+
/**
 * Lane-wise single-precision division a / b for targets where this fallback
 * is compiled (the enclosing #else branch of the __aarch64__ check).
 *
 * The quotient is formed as a * (1/b). The reciprocal starts from the
 * vrecpeq_f32 estimate and is refined with Newton-Raphson steps
 *     x[n+1] = x[n] * (2 - b * x[n])
 * where vrecpsq_f32(b, x) supplies the (2 - b * x) factor.
 *
 * A single refinement step leaves the reciprocal well short of float32
 * precision, so three steps are applied.
 *
 * NOTE(review): the result is still a reciprocal-multiply approximation and
 * may differ from a correctly rounded divide in the last bits - confirm that
 * is acceptable for callers.
 *
 * @param a  dividend lanes
 * @param b  divisor lanes
 * @return   per-lane approximation of a / b
 */
NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
{
    // initial estimate of 1/b
    float32x4_t recip = vrecpeq_f32(b);
    // Newton-Raphson refinement of the reciprocal estimate
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    // a/b = a * (1/b)
    return vmulq_f32(a, recip);
}
75+
#endif
76+
#define npyv_div_f64 vdivq_f64
77+
78+
#endif // _NPY_SIMD_NEON_ARITHMETIC_H
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#ifndef NPY_SIMD
2+
#error "Not a standalone header"
3+
#endif
4+
5+
#ifndef _NPY_SIMD_NEON_CVT_H
6+
#define _NPY_SIMD_NEON_CVT_H
7+
8+
// Convert boolean vectors to integer and float vectors.
// The signed and float forms reinterpret from the unsigned NEON vector type of
// the same lane width; the unsigned forms simply yield their argument.
// The pass-through argument is parenthesized so that an expression argument
// keeps its grouping when the macro is used inside a larger expression.
#define npyv_cvt_u8_b8(A) (A)
#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
#define npyv_cvt_u16_b16(A) (A)
#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
#define npyv_cvt_u32_b32(A) (A)
#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
#define npyv_cvt_u64_b64(A) (A)
#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)
19+
20+
// Convert integer and float vectors to boolean vectors.
// The signed and float forms reinterpret to the unsigned NEON vector type of
// the same lane width; the unsigned forms simply yield their argument.
// The pass-through argument is parenthesized so that an expression argument
// keeps its grouping when the macro is used inside a larger expression.
#define npyv_cvt_b8_u8(BL) (BL)
#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
#define npyv_cvt_b16_u16(BL) (BL)
#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
#define npyv_cvt_b32_u32(BL) (BL)
#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
#define npyv_cvt_b64_u64(BL) (BL)
#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)
31+
32+
#endif // _NPY_SIMD_NEON_CVT_H
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#ifndef NPY_SIMD
2+
#error "Not a standalone header"
3+
#endif
4+
5+
#ifndef _NPY_SIMD_NEON_MEMORY_H
6+
#define _NPY_SIMD_NEON_MEMORY_H
7+
8+
/***************************
9+
* load/store
10+
***************************/
11+
// GCC requires literal type definitions for pointer types; otherwise it reports ambiguity errors
12+
/*
 * NPYV_IMPL_NEON_MEM(SFX, CTYPE)
 * Emits the load/store helpers for one lane suffix:
 *   npyv_load_ / npyv_loada_ / npyv_loads_    -> vld1q_<SFX>
 *   npyv_loadl_   -> vcombine_<SFX>(vld1_<SFX>(ptr), vdup_n_<SFX>(0))
 *   npyv_store_ / npyv_storea_ / npyv_stores_ -> vst1q_<SFX>
 *   npyv_storel_  -> vst1_<SFX> of vget_low_<SFX>(vec)
 *   npyv_storeh_  -> vst1_<SFX> of vget_high_<SFX>(vec)
 * CTYPE is the literal element type the pointer is cast to before each call.
 */
#define NPYV_IMPL_NEON_MEM(SFX, CTYPE)                                          \
    NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr)      \
    {                                                                           \
        const CTYPE *src = (const CTYPE*)ptr;                                   \
        return vld1q_##SFX(src);                                                \
    }                                                                           \
    NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr)     \
    {                                                                           \
        const CTYPE *src = (const CTYPE*)ptr;                                   \
        return vld1q_##SFX(src);                                                \
    }                                                                           \
    NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr)     \
    {                                                                           \
        const CTYPE *src = (const CTYPE*)ptr;                                   \
        return vld1q_##SFX(src);                                                \
    }                                                                           \
    NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr)     \
    {                                                                           \
        const CTYPE *src = (const CTYPE*)ptr;                                   \
        return vcombine_##SFX(vld1_##SFX(src), vdup_n_##SFX(0));                \
    }                                                                           \
    NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
    {                                                                           \
        CTYPE *dst = (CTYPE*)ptr;                                               \
        vst1q_##SFX(dst, vec);                                                  \
    }                                                                           \
    NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
    {                                                                           \
        CTYPE *dst = (CTYPE*)ptr;                                               \
        vst1q_##SFX(dst, vec);                                                  \
    }                                                                           \
    NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
    {                                                                           \
        CTYPE *dst = (CTYPE*)ptr;                                               \
        vst1q_##SFX(dst, vec);                                                  \
    }                                                                           \
    NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
    {                                                                           \
        CTYPE *dst = (CTYPE*)ptr;                                               \
        vst1_##SFX(dst, vget_low_##SFX(vec));                                   \
    }                                                                           \
    NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
    {                                                                           \
        CTYPE *dst = (CTYPE*)ptr;                                               \
        vst1_##SFX(dst, vget_high_##SFX(vec));                                  \
    }
35+
36+
// Instantiate the load/store helpers for each lane suffix with its C element type.
// The f64 set is emitted only when NPY_SIMD_F64 evaluates to non-zero.
NPYV_IMPL_NEON_MEM(u8, uint8_t)
37+
NPYV_IMPL_NEON_MEM(s8, int8_t)
38+
NPYV_IMPL_NEON_MEM(u16, uint16_t)
39+
NPYV_IMPL_NEON_MEM(s16, int16_t)
40+
NPYV_IMPL_NEON_MEM(u32, uint32_t)
41+
NPYV_IMPL_NEON_MEM(s32, int32_t)
42+
NPYV_IMPL_NEON_MEM(u64, uint64_t)
43+
NPYV_IMPL_NEON_MEM(s64, int64_t)
44+
NPYV_IMPL_NEON_MEM(f32, float)
45+
#if NPY_SIMD_F64
46+
NPYV_IMPL_NEON_MEM(f64, double)
47+
#endif
48+
49+
#endif // _NPY_SIMD_NEON_MEMORY_H

0 commit comments

Comments
 (0)
0