8000 ENH, SIMD: Add CPU feature detection and simd functions for AArch64 SVE by kawakami-k · Pull Request #22265 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH, SIMD: Add CPU feature detection and simd functions for AArch64 SVE #22265

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ENH, SIMD: Add SVE simd functions
  • Loading branch information
kawakami-k committed Oct 3, 2022
commit b910144fe323f5a2bec246d1a09ac5694850e1d3
23 changes: 15 additions & 8 deletions numpy/core/src/common/simd/intdiv.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
divisor.val[0] = npyv_setall_u16(m);
divisor.val[1] = npyv_set_u8(sh1);
divisor.val[2] = npyv_set_u8(sh2);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[0] = npyv_setall_u8(m);
divisor.val[1] = npyv_setall_u8(sh1);
divisor.val[2] = npyv_setall_u8(sh2);
Expand Down Expand Up @@ -249,7 +249,7 @@ NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
npyv_s8x3 divisor;
divisor.val[0] = npyv_setall_s8(m);
divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0);
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s8(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s8(-sh);
Expand Down Expand Up @@ -285,7 +285,7 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_u16(sh1);
divisor.val[2] = npyv_set_u16(sh2);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_u16(sh1);
divisor.val[2] = npyv_setall_u16(sh2);
#elif defined(NPY_HAVE_NEON)
Expand Down Expand Up @@ -317,7 +317,7 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_s16(sh);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s16(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s16(-sh);
Expand Down Expand Up @@ -352,7 +352,7 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_u32(sh1);
divisor.val[2] = npyv_set_u32(sh2);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_u32(sh1);
divisor.val[2] = npyv_setall_u32(sh2);
#elif defined(NPY_HAVE_NEON)
Expand Down Expand Up @@ -389,7 +389,7 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
divisor.val[2] = npyv_setall_s32(d < 0 ? -1 : 0); // sign of divisor
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_s32(sh);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s32(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s32(-sh);
Expand All @@ -402,7 +402,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
{
npyv_u64x3 divisor;
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON)
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) \
|| (defined(NPY_HAVE_NEON) && !defined(NPY_HAVE_SVE))
divisor.val[0] = npyv_setall_u64(d);
#else
npy_uint64 l, l2, sh1, sh2, m;
Expand All @@ -427,6 +428,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_u64(sh1);
divisor.val[2] = npyv_set_u64(sh2);
#elif defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_u64(sh1);
divisor.val[2] = npyv_setall_u64(sh2);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand All @@ -437,7 +441,8 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
{
npyv_s64x3 divisor;
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON)
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) \
|| (defined(NPY_HAVE_NEON) && !defined(NPY_HAVE_SVE))
divisor.val[0] = npyv_setall_s64(d);
divisor.val[1] = npyv_cvt_s64_b64(
npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0])
Expand Down Expand Up @@ -465,6 +470,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0); // sign of divisor
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_s64(sh);
#elif defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s64(sh);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down
81 changes: 79 additions & 2 deletions numpy/core/src/multiarray/item_selection.c
Original file line number Diff line number Diff line change
Expand Up @@ -2154,7 +2154,7 @@ count_nonzero_bytes_384(const npy_uint64 * w)
return r;
}

#if NPY_SIMD
#if NPY_SIMD && !defined(NPY_HAVE_SVE)
/* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
NPY_FINLINE NPY_GCC_OPT_3 npyv_u8
count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
Expand Down Expand Up @@ -2190,7 +2190,7 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
}
return vsum16;
}
#endif // NPY_SIMD
#endif // NPY_SIMD && !defined(NPY_HAVE_SVE)
/*
* Counts the number of non-zero values in a raw array.
* The one loop process is shown below(take SSE2 with 128bits vector for example):
Expand All @@ -2215,6 +2215,23 @@ count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
if (bstride == 1) {
#if NPY_SIMD
#ifdef NPY_SIMD_POPCNT
npyv_u8 zero = npyv_zero_u8();

for (; len >= npyv_nlanes_u8; len -= npyv_nlanes_u8, data += npyv_nlanes_u8) {
npyv_u8 d = npyv_load_u8(data);
npyv_b8 b = npyv_cmpneq_u8(d, zero);
count += npyv_popcnt_b8(b);
}
if (len) {
npyv_u8 d = npyv_load_tillz_u8(data, len);
npyv_b8 b = npyv_cmpneq_u8(d, zero);

count += npyv_popcnt_b8(b);
len = 0;
}
return count;
#else
npy_uintp len_m = len & -npyv_nlanes_u8;
npy_uintp zcount = 0;
for (const char *end = data + len_m; data < end;) {
Expand All @@ -2228,6 +2245,7 @@ count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
}
len -= len_m;
count = len_m - zcount;
#endif
#else
if (!NPY_ALIGNMENT_REQUIRED || npy_is_aligned(data, sizeof(npy_uint64))) {
int step = 6 * sizeof(npy_uint64);
Expand All @@ -2251,6 +2269,24 @@ count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
#if NPY_SIMD
if (bstride == sizeof(npy_uint16)) {
#ifdef NPY_SIMD_POPCNT
npy_uint16 *data_u16 = (npy_uint16 *) data;
npyv_u16 zero = npyv_zero_u16();

for (; len >= npyv_nlanes_u16; len -= npyv_nlanes_u16, data_u16 += npyv_nlanes_u16) {
npyv_u16 d = npyv_load_u16(data_u16);
npyv_b16 b = npyv_cmpneq_u16(d, zero);
count += npyv_popcnt_b16(b);
}
if (len) {
npyv_u16 d = npyv_load_tillz_u16(data_u16, len);
npyv_b16 b = npyv_cmpneq_u16(d, zero);

count += npyv_popcnt_b16(b);
len = 0;
}
return count;
#else
npy_uintp zcount = 0, len_m = len & -npyv_nlanes_u16;
const npyv_u16 vone = npyv_setall_u16(1);
const npyv_u16 vzero = npyv_zero_u16();
Expand All @@ -2269,6 +2305,7 @@ count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
}
len -= len_m;
count = len_m - zcount;
#endif
}
#endif
for (; len > 0; --len, data += bstride) {
Expand All @@ -2283,6 +2320,25 @@ count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
#if NPY_SIMD
if (bstride == sizeof(npy_uint32)) {
#ifdef NPY_SIMD_POPCNT
npy_uint32 *data_u32 = (npy_uint32 *) data;
npyv_u32 zero = npyv_zero_u32();

for (; len >= npyv_nlanes_u32; len -= npyv_nlanes_u32, data_u32 += npyv_nlanes_u32) {
npyv_u32 d = npyv_load_u32(data_u32);
npyv_b32 b = npyv_cmpneq_u32(d, zero);

count += npyv_popcnt_b32(b);
}
if (len) {
npyv_u32 d = npyv_load_tillz_u32(data_u32, len);
npyv_b32 b = npyv_cmpneq_u32(d, zero);

count += npyv_popcnt_b32(b);
len = 0;
}
return count;
#else
const npy_uintp max_iter = NPY_MAX_UINT32*npyv_nlanes_u32;
const npy_uintp len_m = (len > max_iter ? max_iter : len) & -npyv_nlanes_u32;
const npyv_u32 vone = npyv_setall_u32(1);
Expand All @@ -2299,6 +2355,7 @@ count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn));
count = len_m - npyv_sum_u64(npyv_add_u64(odd, even));
len -= len_m;
#endif
}
#endif
for (; len > 0; --len, data += bstride) {
Expand All @@ -2313,6 +2370,25 @@ count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
#if NPY_SIMD
if (bstride == sizeof(npy_uint64)) {
#ifdef NPY_SIMD_POPCNT
npy_uint64 *data_u64 = (npy_uint64 *) data;
npyv_u64 zero = npyv_zero_u64();

for (; len >= npyv_nlanes_u64; len -= npyv_nlanes_u64, data_u64 += npyv_nlanes_u64) {
npyv_u64 d = npyv_load_u64(data_u64);
npyv_b64 b = npyv_cmpneq_u64(d, zero);

count += npyv_popcnt_b64(b);
}
if (len) {
npyv_u64 d = npyv_load_tillz_u64(data_u64, len);
npyv_b64 b = npyv_cmpneq_u64(d, zero);

count += npyv_popcnt_b64(b);
len = 0;
}
return count;
#else
const npy_uintp len_m = len & -npyv_nlanes_u64;
const npyv_u64 vone = npyv_setall_u64(1);
const npyv_u64 vzero = npyv_zero_u64();
Expand All @@ -2325,6 +2401,7 @@ count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
}
len -= len_m;
count = len_m - npyv_sum_u64(vsum64);
#endif
}
#endif
for (; len > 0; --len, data += bstride) {
Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/umath/_umath_tests.dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* @targets $werror baseline
* SSE2 SSE41 AVX2
* VSX VSX2 VSX3
* NEON ASIMD ASIMDHP
* NEON ASIMD ASIMDHP SVE
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
Expand Down
Loading
0