8000 ENH, SIMD: Add CPU feature detection and simd functions for AArch64 SVE by kawakami-k · Pull Request #22265 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH, SIMD: Add CPU feature detection and simd functions for AArch64 SVE #22265

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ENH, SIMD: Add SVE simd functions
  • Loading branch information
kawakami-k committed Oct 3, 2022
commit b910144fe323f5a2bec246d1a09ac5694850e1d3
23 changes: 15 additions & 8 deletions numpy/core/src/common/simd/intdiv.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
divisor.val[0] = npyv_setall_u16(m);
divisor.val[1] = npyv_set_u8(sh1);
divisor.val[2] = npyv_set_u8(sh2);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[0] = npyv_setall_u8(m);
divisor.val[1] = npyv_setall_u8(sh1);
divisor.val[2] = npyv_setall_u8(sh2);
Expand Down Expand Up @@ -249,7 +249,7 @@ NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
npyv_s8x3 divisor;
divisor.val[0] = npyv_setall_s8(m);
divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0);
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s8(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s8(-sh);
Expand Down Expand Up @@ -285,7 +285,7 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_u16(sh1);
divisor.val[2] = npyv_set_u16(sh2);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_u16(sh1);
divisor.val[2] = npyv_setall_u16(sh2);
#elif defined(NPY_HAVE_NEON)
Expand Down Expand Up @@ -317,7 +317,7 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_s16(sh);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s16(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s16(-sh);
Expand Down Expand Up @@ -352,7 +352,7 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_u32(sh1);
divisor.val[2] = npyv_set_u32(sh2);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_u32(sh1);
divisor.val[2] = npyv_setall_u32(sh2);
#elif defined(NPY_HAVE_NEON)
Expand Down Expand Up @@ -389,7 +389,7 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
divisor.val[2] = npyv_setall_s32(d < 0 ? -1 : 0); // sign of divisor
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_s32(sh);
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s32(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s32(-sh);
Expand All @@ -402,7 +402,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
{
npyv_u64x3 divisor;
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON)
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) \
|| (defined(NPY_HAVE_NEON) && !defined(NPY_HAVE_SVE))
divisor.val[0] = npyv_setall_u64(d);
#else
npy_uint64 l, l2, sh1, sh2, m;
Expand All @@ -427,6 +428,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_u64(sh1);
divisor.val[2] = npyv_set_u64(sh2);
#elif defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_u64(sh1);
divisor.val[2] = npyv_setall_u64(sh2);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand All @@ -437,7 +441,8 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
{
npyv_s64x3 divisor;
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON)
#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) \
|| (defined(NPY_HAVE_NEON) && !defined(NPY_HAVE_SVE))
divisor.val[0] = npyv_setall_s64(d);
divisor.val[1] = npyv_cvt_s64_b64(
npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0])
Expand Down Expand Up @@ -465,6 +470,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0); // sign of divisor
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_s64(sh);
#elif defined(NPY_HAVE_SVE)
divisor.val[1] = npyv_setall_s64(sh);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down
81 changes: 79 additions & 2 deletions numpy/core/src/multiarray/item_selection.c
Original file line number Diff line number Diff line change
Expand Up @@ -2154,7 +2154,7 @@ count_nonzero_bytes_384(const npy_uint64 * w)
return r;
}

#if NPY_SIMD
#if NPY_SIMD && !defined(NPY_HAVE_SVE)
/* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
NPY_FINLINE NPY_GCC_OPT_3 npyv_u8
count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
Expand Down Expand Up @@ -2190,7 +2190,7 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
}
return vsum16;
}
#endif // NPY_SIMD
#endif // NPY_SIMD && !defined(NPY_HAVE_SVE)
/*
* Counts the number of non-zero values in a raw array.
* The one loop process is shown below(take SSE2 with 128bits vector for example):
Expand All @@ -2215,6 +2215,23 @@ count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
if (bstride == 1) {
#if NPY_SIMD
#ifdef NPY_SIMD_POPCNT
npyv_u8 zero = npyv_zero_u8();

for (; len >= npyv_nlanes_u8; len -= npyv_nlanes_u8, data += npyv_nlanes_u8) {
npyv_u8 d = npyv_load_u8(data);
npyv_b8 b = npyv_cmpneq_u8(d, zero);
count += npyv_popcnt_b8(b);
}
if (len) {
npyv_u8 d = npyv_load_tillz_u8(data, len);
npyv_b8 b = npyv_cmpneq_u8(d, zero);

count += npyv_popcnt_b8(b);
len = 0;
}
return count;
#else
npy_uintp len_m = len & -npyv_nlanes_u8;
npy_uintp zcount = 0;
for (const char *end = data + len_m; data < end;) {
Expand All @@ -2228,6 +2245,7 @@ count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
}
len -= len_m;
count = len_m - zcount;
#endif
#else
if (!NPY_ALIGNMENT_REQUIRED || npy_is_aligned(data, sizeof(npy_uint64))) {
int step = 6 * sizeof(npy_uint64);
Expand All @@ -2251,6 +2269,24 @@ count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
#if NPY_SIMD
if (bstride == sizeof(npy_uint16)) {
#ifdef NPY_SIMD_POPCNT
npy_uint16 *data_u16 = (npy_uint16 *) data;
npyv_u16 zero = npyv_zero_u16();

for (; len >= npyv_nlanes_u16; len -= npyv_nlanes_u16, data_u16 += npyv_nlanes_u16) {
npyv_u16 d = npyv_load_u16(data_u16);
npyv_b16 b = npyv_cmpneq_u16(d, zero);
count += npyv_popcnt_b16(b);
}
if (len) {
npyv_u16 d = npyv_load_tillz_u16(data_u16, len);
npyv_b16 b = npyv_cmpneq_u16(d, zero);

count += npyv_popcnt_b16(b);
len = 0;
}
return count;
#else
npy_uintp zcount = 0, len_m = len & -npyv_nlanes_u16;
const npyv_u16 vone = npyv_setall_u16(1);
const npyv_u16 vzero = npyv_zero_u16();
Expand All @@ -2269,6 +2305,7 @@ count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
}
len -= len_m;
count = len_m - zcount;
#endif
}
#endif
for (; len > 0; --len, data += bstride) {
Expand All @@ -2283,6 +2320,25 @@ count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
#if NPY_SIMD
if (bstride == sizeof(npy_uint32)) {
#ifdef NPY_SIMD_POPCNT
npy_uint32 *data_u32 = (npy_uint32 *) data;
npyv_u32 zero = npyv_zero_u32();

for (; len >= npyv_nlanes_u32; len -= npyv_nlanes_u32, data_u32 += npyv_nlanes_u32) {
npyv_u32 d = npyv_load_u32(data_u32);
npyv_b32 b = npyv_cmpneq_u32(d, zero);

count += npyv_popcnt_b32(b);
}
if (len) {
npyv_u32 d = npyv_load_tillz_u32(data_u32, len);
npyv_b32 b = npyv_cmpneq_u32(d, zero);

count += npyv_popcnt_b32(b);
len = 0;
}
return count;
#else
const npy_uintp max_iter = NPY_MAX_UINT32*npyv_nlanes_u32;
const npy_uintp len_m = (len > max_iter ? max_iter : len) & -npyv_nlanes_u32;
const npyv_u32 vone = npyv_setall_u32(1);
Expand All @@ -2299,6 +2355,7 @@ count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn));
count = len_m - npyv_sum_u64(npyv_add_u64(odd, even));
len -= len_m;
#endif
}
#endif
for (; len > 0; --len, data += bstride) {
Expand All @@ -2313,6 +2370,25 @@ count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
npy_intp count = 0;
#if NPY_SIMD
if (bstride == sizeof(npy_uint64)) {
#ifdef NPY_SIMD_POPCNT
npy_uint64 *data_u64 = (npy_uint64 *) data;
npyv_u64 zero = npyv_zero_u64();

for (; len >= npyv_nlanes_u64; len -= npyv_nlanes_u64, data_u64 += npyv_nlanes_u64) {
npyv_u64 d = npyv_load_u64(data_u64);
npyv_b64 b = npyv_cmpneq_u64(d, zero);

count += npyv_popcnt_b64(b);
}
if (len) {
npyv_u64 d = npyv_load_tillz_u64(data_u64, len);
npyv_b64 b = npyv_cmpneq_u64(d, zero);

count += npyv_popcnt_b64(b);
len = 0;
}
return count;
#else
const npy_uintp len_m = len & -npyv_nlanes_u64;
const npyv_u64 vone = npyv_setall_u64(1);
const npyv_u64 vzero = npyv_zero_u64();
Expand All @@ -2325,6 +2401,7 @@ count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
}
len -= len_m;
count = len_m - npyv_sum_u64(vsum64);
#endif
}
#endif
for (; len > 0; --len, data += bstride) {
Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/umath/_umath_tests.dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* @targets $werror baseline
* SSE2 SSE41 AVX2
* VSX VSX2 VSX3
* NEON ASIMD ASIMDHP
* NEON ASIMD ASIMDHP SVE
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
Expand Down
Loading
0