ENH: [4/4] implement the NumPy C SIMD vectorization interface · numpy/numpy@791cae4 · GitHub

Commit 791cae4

ENH: [4/4] implement the NumPy C SIMD vectorization interface

Implement the same intrinsics as the x86 backend for Power/VSX, little-endian mode.

1 parent 5eab8e0 · commit 791cae4
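
For orientation, here is a hedged sketch of how a kernel consumes these universal intrinsics. It is illustrative only: the umbrella header path and the lane-count constant npyv_nlanes_f32 are assumptions from elsewhere in this 4-part series, not part of this diff. On VSX, the npyv_* macros below expand directly to vec_add and friends, as the first file shows.

#include "simd/simd.h"  // assumed umbrella header from earlier parts of the series

// element-wise float32 addition: dst[i] = src0[i] + src1[i]
static void add_f32(const float *src0, const float *src1, float *dst, int len)
{
    int i = 0;
    // main loop: one full vector (npyv_nlanes_f32 lanes, 4 on VSX) per step
    for (; i <= len - npyv_nlanes_f32; i += npyv_nlanes_f32) {
        npyv_f32 a = npyv_load_f32(src0 + i);          // unaligned load
        npyv_f32 b = npyv_load_f32(src1 + i);
        npyv_store_f32(dst + i, npyv_add_f32(a, b));   // unaligned store
    }
    for (; i < len; ++i)  // scalar tail for the remainder
        dst[i] = src0[i] + src1[i];
}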

File tree

7 files changed: +763 −0 lines changed
Lines changed: 103 additions & 0 deletions (new file; its include guard identifies it as the VSX arithmetic header)

@@ -0,0 +1,103 @@
#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_VSX_ARITHMETIC_H
#define _NPY_SIMD_VSX_ARITHMETIC_H

/***************************
 * Addition
 ***************************/
// non-saturated
#define npyv_add_u8  vec_add
#define npyv_add_s8  vec_add
#define npyv_add_u16 vec_add
#define npyv_add_s16 vec_add
#define npyv_add_u32 vec_add
#define npyv_add_s32 vec_add
#define npyv_add_u64 vec_add
#define npyv_add_s64 vec_add
#define npyv_add_f32 vec_add
#define npyv_add_f64 vec_add

// saturated
#define npyv_adds_u8  vec_adds
#define npyv_adds_s8  vec_adds
#define npyv_adds_u16 vec_adds
#define npyv_adds_s16 vec_adds

/***************************
 * Subtraction
 ***************************/
// non-saturated
#define npyv_sub_u8  vec_sub
#define npyv_sub_s8  vec_sub
#define npyv_sub_u16 vec_sub
#define npyv_sub_s16 vec_sub
#define npyv_sub_u32 vec_sub
#define npyv_sub_s32 vec_sub
#define npyv_sub_u64 vec_sub
#define npyv_sub_s64 vec_sub
#define npyv_sub_f32 vec_sub
#define npyv_sub_f64 vec_sub

// saturated
#define npyv_subs_u8  vec_subs
#define npyv_subs_s8  vec_subs
#define npyv_subs_u16 vec_subs
#define npyv_subs_s16 vec_subs

/***************************
 * Multiplication
 ***************************/
// non-saturated
// up to GCC 6, vec_mul only supports the float precisions and long long
#if defined(__GNUC__) && __GNUC__ < 7
    #define NPYV_IMPL_VSX_MUL(T_VEC, SFX, ...)             \
        NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b) \
        {                                                  \
            const npyv_u8 ev_od = {__VA_ARGS__};           \
            return vec_perm(                               \
                (T_VEC)vec_mule(a, b),                     \
                (T_VEC)vec_mulo(a, b), ev_od               \
            );                                             \
        }

    NPYV_IMPL_VSX_MUL(npyv_u8,  u8,  0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30)
    NPYV_IMPL_VSX_MUL(npyv_s8,  s8,  0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30)
    NPYV_IMPL_VSX_MUL(npyv_u16, u16, 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29)
    NPYV_IMPL_VSX_MUL(npyv_s16, s16, 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29)

    // vmuluwm can be used for unsigned or signed 32-bit integers
    #define NPYV_IMPL_VSX_MUL_32(T_VEC, SFX)               \
        NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b) \
        {                                                  \
            T_VEC ret;                                     \
            __asm__ __volatile__(                          \
                "vmuluwm %0,%1,%2" :                       \
                "=v" (ret) : "v" (a), "v" (b)              \
            );                                             \
            return ret;                                    \
        }

    NPYV_IMPL_VSX_MUL_32(npyv_u32, u32)
    NPYV_IMPL_VSX_MUL_32(npyv_s32, s32)

#else
    #define npyv_mul_u8  vec_mul
    #define npyv_mul_s8  vec_mul
    #define npyv_mul_u16 vec_mul
    #define npyv_mul_s16 vec_mul
    #define npyv_mul_u32 vec_mul
    #define npyv_mul_s32 vec_mul
#endif
#define npyv_mul_f32 vec_mul
#define npyv_mul_f64 vec_mul

/***************************
 * Division
 ***************************/
#define npyv_div_f32 vec_div
#define npyv_div_f64 vec_div

#endif // _NPY_SIMD_VSX_ARITHMETIC_H
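
The GCC < 7 multiply workaround merits a note: vec_mule/vec_mulo produce double-width products of the even/odd lanes, and vec_perm gathers their low halves back into element order (the index vectors select bytes from the concatenation of the two results). A minimal scalar model of that merge, plain C and runnable anywhere, makes the data movement concrete; it is illustrative only, not part of the commit.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint16_t b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    uint32_t mule[4], mulo[4];                     // double-width partial products
    for (int i = 0; i < 4; i++) {
        mule[i] = (uint32_t)a[2*i]   * b[2*i];     // even lanes (vec_mule)
        mulo[i] = (uint32_t)a[2*i+1] * b[2*i+1];   // odd lanes  (vec_mulo)
    }
    uint16_t prod[8];                              // merged back in order (vec_perm)
    for (int i = 0; i < 4; i++) {
        prod[2*i]   = (uint16_t)mule[i];           // keep the low 16 bits, as the
        prod[2*i+1] = (uint16_t)mulo[i];           // little-endian byte indices do
    }
    for (int i = 0; i < 8; i++)
        printf("%u ", prod[i]);                    // prints: 8 14 18 20 20 18 14 8
    putchar('\n');
    return 0;
}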
Lines changed: 32 additions & 0 deletions (new file; its include guard identifies it as the VSX conversion header)

@@ -0,0 +1,32 @@
#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_VSX_CVT_H
#define _NPY_SIMD_VSX_CVT_H

// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(BL)   ((npyv_u8)  BL)
#define npyv_cvt_s8_b8(BL)   ((npyv_s8)  BL)
#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL)
#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL)
#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL)
#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL)
#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL)
#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL)
#define npyv_cvt_f32_b32(BL) ((npyv_f32) BL)
#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL)

// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(A)   ((npyv_b8)  A)
#define npyv_cvt_b8_s8(A)   ((npyv_b8)  A)
#define npyv_cvt_b16_u16(A) ((npyv_b16) A)
#define npyv_cvt_b16_s16(A) ((npyv_b16) A)
#define npyv_cvt_b32_u32(A) ((npyv_b32) A)
#define npyv_cvt_b32_s32(A) ((npyv_b32) A)
#define npyv_cvt_b64_u64(A) ((npyv_b64) A)
#define npyv_cvt_b64_s64(A) ((npyv_b64) A)
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)

#endif // _NPY_SIMD_VSX_CVT_H
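
Since VSX boolean vectors are lanes of all-ones or all-zeros, these casts are free reinterpretations, which is exactly what mask-driven arithmetic needs. A hedged sketch follows; the comparison intrinsic npyv_cmpeq_u32 is assumed from the operators header elsewhere in this commit, not from this diff.

// count matching lanes across many vector pairs by exploiting the fact
// that an all-ones lane reads as -1: acc - (-1) == acc + 1 per match
static npyv_u32 count_matches(npyv_u32 a, npyv_u32 b, npyv_u32 acc)
{
    npyv_b32 mask  = npyv_cmpeq_u32(a, b);    // lanes: 0xFFFFFFFF or 0x0 (assumed intrinsic)
    npyv_u32 m_u32 = npyv_cvt_u32_b32(mask);  // pure reinterpret, no conversion cost
    return npyv_sub_u32(acc, m_u32);          // subtracting -1 increments the matched lanes
}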
Lines changed: 150 additions & 0 deletions (new file; its include guard identifies it as the VSX memory header)

@@ -0,0 +1,150 @@
#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_VSX_MEMORY_H
#define _NPY_SIMD_VSX_MEMORY_H
/****************************
 * load/store
 ****************************/
// TODO: test load by cast
#define VSX__CAST_LOAD 0
#if VSX__CAST_LOAD
    #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR)))
#else
    /**
     * CLANG fails to load unaligned addresses via vec_xl, vec_xst,
     * so we fall back to vec_vsx_ld, vec_vsx_st
     */
    #if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
        #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR)
    #else
        #define npyv__load(PTR, T_VEC) vec_xl(0, PTR)
    #endif
#endif
// unaligned load
#define npyv_load_u8(PTR)  npyv__load(PTR, npyv_u8)
#define npyv_load_s8(PTR)  npyv__load(PTR, npyv_s8)
#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16)
#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16)
#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32)
#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32)
#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32)
#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64)
#if VSX__CAST_LOAD
    #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64)
    #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64)
#else
    #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR))
    #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const unsigned int*)PTR))
#endif
// aligned load
#define npyv_loada_u8(PTR) vec_ld(0, PTR)
#define npyv_loada_s8  npyv_loada_u8
#define npyv_loada_u16 npyv_loada_u8
#define npyv_loada_s16 npyv_loada_u8
#define npyv_loada_u32 npyv_loada_u8
#define npyv_loada_s32 npyv_loada_u8
#define npyv_loada_u64 npyv_load_u64
#define npyv_loada_s64 npyv_load_s64
#define npyv_loada_f32 npyv_loada_u8
#define npyv_loada_f64 npyv_load_f64
// stream load
#define npyv_loads_u8  npyv_loada_u8
#define npyv_loads_s8  npyv_loada_s8
#define npyv_loads_u16 npyv_loada_u16
#define npyv_loads_s16 npyv_loada_s16
#define npyv_loads_u32 npyv_loada_u32
#define npyv_loads_s32 npyv_loada_s32
#define npyv_loads_u64 npyv_loada_u64
#define npyv_loads_s64 npyv_loada_s64
#define npyv_loads_f32 npyv_loada_f32
#define npyv_loads_f64 npyv_loada_f64
// load lower part
// go through an npy_uint64 pointer to avoid aliasing rules
#ifdef __cplusplus
    template<typename T_PTR>
    NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr)
    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#else
    NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr)
    { npy_uint64 *ptr64 = ptr; return ptr64; }
#endif // __cplusplus
#if defined(__clang__) && !defined(__IBMC__)
    // vec_promote doesn't support doubleword on clang
    #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR))
#else
    #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0)
#endif
#define npyv_loadl_u8(PTR)  ((npyv_u8)npyv_loadl_u64(PTR))
#define npyv_loadl_s8(PTR)  ((npyv_s8)npyv_loadl_u64(PTR))
#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR))
#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR))
#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR))
#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR))
#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR))
#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR))
#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR))
// unaligned store
#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
    #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
#else
    #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR)
#endif
#define npyv_store_s8  npyv_store_u8
#define npyv_store_u16 npyv_store_u8
#define npyv_store_s16 npyv_store_u8
#define npyv_store_u32 npyv_store_u8
#define npyv_store_s32 npyv_store_u8
#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
#define npyv_store_f32 npyv_store_u8
#define npyv_store_f64 npyv_store_u8
// aligned store
#define npyv_storea_u8(PTR, VEC) vec_st(VEC, 0, PTR)
#define npyv_storea_s8  npyv_storea_u8
#define npyv_storea_u16 npyv_storea_u8
#define npyv_storea_s16 npyv_storea_u8
#define npyv_storea_u32 npyv_storea_u8
#define npyv_storea_s32 npyv_storea_u8
#define npyv_storea_u64 npyv_store_u64
#define npyv_storea_s64 npyv_store_s64
#define npyv_storea_f32 npyv_storea_u8
#define npyv_storea_f64 npyv_store_f64
// stream store
#define npyv_stores_u8  npyv_storea_u8
#define npyv_stores_s8  npyv_storea_s8
#define npyv_stores_u16 npyv_storea_u16
#define npyv_stores_s16 npyv_storea_s16
#define npyv_stores_u32 npyv_storea_u32
#define npyv_stores_s32 npyv_storea_s32
#define npyv_stores_u64 npyv_storea_u64
#define npyv_stores_s64 npyv_storea_s64
#define npyv_stores_f32 npyv_storea_f32
#define npyv_stores_f64 npyv_storea_f64
// store lower part
#define npyv_storel_u8(PTR, VEC) \
    *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0)
#define npyv_storel_s8  npyv_storel_u8
#define npyv_storel_u16 npyv_storel_u8
#define npyv_storel_s16 npyv_storel_u8
#define npyv_storel_u32 npyv_storel_u8
#define npyv_storel_s32 npyv_storel_u8
#define npyv_storel_s64 npyv_storel_u8
#define npyv_storel_u64 npyv_storel_u8
#define npyv_storel_f32 npyv_storel_u8
#define npyv_storel_f64 npyv_storel_u8
// store higher part
#define npyv_storeh_u8(PTR, VEC) \
    *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1)
#define npyv_storeh_s8  npyv_storeh_u8
#define npyv_storeh_u16 npyv_storeh_u8
#define npyv_storeh_s16 npyv_storeh_u8
#define npyv_storeh_u32 npyv_storeh_u8
#define npyv_storeh_s32 npyv_storeh_u8
#define npyv_storeh_s64 npyv_storeh_u8
#define npyv_storeh_u64 npyv_storeh_u8
#define npyv_storeh_f32 npyv_storeh_u8
#define npyv_storeh_f64 npyv_storeh_u8

#endif // _NPY_SIMD_VSX_MEMORY_H
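
The lower/higher-part operations round out tail handling: npyv_loadl_* fills (at least) the low doubleword of a register, while npyv_storel_*/npyv_storeh_* write back exactly 8 bytes, routing the pointer through npyv__ptr2u64 to sidestep strict-aliasing issues. A hedged usage sketch (only the npyv_* names come from this header; the kernel shape is illustrative):

// copy a two-lane float32 tail (8 bytes) without touching bytes past it
static void copy_tail2_f32(const float *src, float *dst)
{
    npyv_f32 lo = npyv_loadl_f32(src);   // low 64 bits hold src[0], src[1]
    npyv_storel_f32(dst, lo);            // writes back exactly 8 bytes
}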
