numpy · seiko2plus · Nov 24, 2023 · Jun 20, 2023 · Jun 22, 2023 · Jun 26, 2023
diff --git a/.gitmodules b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "vendored-meson/meson"]
 	path = vendored-meson/meson
 	url = https://github.com/numpy/meson.git
+[submodule "numpy/_core/src/highway"]
+	path = numpy/_core/src/highway
+	url = https://github.com/google/highway.git
diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build
@@ -94,6 +94,9 @@ if use_svml
     error('Missing the `SVML` git submodule! Run `git submodule update --init` to fix this.')
   endif
 endif
+if not fs.exists('src/highway/README.md')
+  error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.')
+endif
 if not fs.exists('src/npysort/x86-simd-sort/README.md')
   error('Missing the `x86-simd-sort` git submodule! Run `git submodule update --init` to fix this.')
 endif
@@ -761,13 +764,18 @@ foreach gen_mtargets : [
   [
     'simd_qsort.dispatch.h',
     'src/npysort/simd_qsort.dispatch.cpp',
-    [AVX512_SKX]
+    [AVX512_SKX, ASIMD]
-    [AVX512_SKX, ASIMD]
+    [AVX512_SKX]
-    [AVX512_SKX, ASIMD]
+    [AVX512_SKX]
   ],
   [
     'simd_qsort_16bit.dispatch.h',
     'src/npysort/simd_qsort_16bit.dispatch.cpp',
     [AVX512_SPR, AVX512_ICL]
   ],
+  [
+    'simd_argsort.dispatch.h',
+    'src/npysort/simd_argsort.dispatch.cpp',
+    [AVX512_SKX]
+  ],
-  [
-    'simd_argsort.dispatch.h',
-    'src/npysort/simd_argsort.dispatch.cpp',
-    [AVX512_SKX]
-  ],
+  [
+    'highway_qsort.dispatch.h',
+    'src/npysort/highway_qsort.dispatch.cpp',
+    [[AVX2, FMA3], SSE42, ASIMD, VSX2]
+  ],
-  [
-    'simd_argsort.dispatch.h',
-    'src/npysort/simd_argsort.dispatch.cpp',
-    [AVX512_SKX]
-  ],
+  [
+    'highway_qsort.dispatch.h',
+    'src/npysort/highway_qsort.dispatch.cpp',
+    [[AVX2, FMA3], SSE42, ASIMD, VSX2]
+  ],
 ]
   mtargets = mod_features.multi_targets(
     gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
@@ -782,7 +790,8 @@ foreach gen_mtargets : [
       'src/common',
       'src/multiarray',
       'src/npymath',
-      'src/umath'
+      'src/umath',
+      'src/highway',
     ]
   )
   if not is_variable('multiarray_umath_mtargets')

diff --git a/numpy/_core/src/highway b/numpy/_core/src/highway
diff --git a/numpy/_core/src/npysort/quicksort.cpp b/numpy/_core/src/npysort/quicksort.cpp
@@ -60,6 +60,8 @@
 #include <utility>
 
 #define NOT_USED NPY_UNUSED(unused)
+#define DISABLE_HIGHWAY_OPTIMIZATION defined(__arm__)
+
 /*
  * pushing largest partition has upper bound of log2(n) space
  * we store two pointers each time
@@ -83,12 +85,14 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
         #endif
         NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
     }
+    #if !DISABLE_HIGHWAY_OPTIMIZATION
     else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
         #ifndef NPY_DISABLE_OPTIMIZATION
             #include "simd_qsort.dispatch.h"
         #endif
         NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
     }
+    #endif
-    #if !DISABLE_HIGHWAY_OPTIMIZATION
-    else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
-        #ifndef NPY_DISABLE_OPTIMIZATION
-            #include "simd_qsort.dispatch.h"
-        #endif
-        NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
-    }
-    #endif
+    else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
+        #ifndef NPY_DISABLE_OPTIMIZATION
+            #include "simd_qsort.dispatch.h"
+        #endif
+        NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+        if (dispfunc == nullptr) {
+            // Priority is given to the Intel-sort library due to its efficient support for AVX512.
+            // For other CPU targets, we fall back to Google's highway sort.
+            #ifndef NPY_DISABLE_OPTIMIZATION
+                #include "highway_qsort.dispatch.h"
+            #endif
+            NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::highway_qsort::template QSort, <TF>);
+        }
+    }
+    #endif
-    #if !DISABLE_HIGHWAY_OPTIMIZATION
-    else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
-        #ifndef NPY_DISABLE_OPTIMIZATION
-            #include "simd_qsort.dispatch.h"
-        #endif
-        NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
-    }
-    #endif
+    else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
+        #ifndef NPY_DISABLE_OPTIMIZATION
+            #include "simd_qsort.dispatch.h"
+        #endif
+        NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+        if (dispfunc == nullptr) {
+            // Priority is given to the Intel-sort library due to its efficient support for AVX512.
+            // For other CPU targets, we fall back to Google's highway sort.
+            #ifndef NPY_DISABLE_OPTIMIZATION
+                #include "highway_qsort.dispatch.h"
+            #endif
+            NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::highway_qsort::template QSort, <TF>);
+        }
+    }
+    #endif
     if (dispfunc) {
         (*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num));
         return true;
@@ -105,7 +109,7 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num)
     using TF = typename np::meta::FixedWidth<T>::Type;
     void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr;
     #ifndef NPY_DISABLE_OPTIMIZATION
-        #include "simd_qsort.dispatch.h"
+        #include "simd_argsort.dispatch.h"
     #endif
     /* x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes
      * in 32-bit*/

diff --git a/numpy/_core/src/npysort/selection.cpp b/numpy/_core/src/npysort/selection.cpp
@@ -28,6 +28,7 @@
 #include "simd_qsort.hpp"
 
 #define NOT_USED NPY_UNUSED(unused)
+#define DISABLE_HIGHWAY_OPTIMIZATION (defined(__arm__) || defined(__aarch64__))
-#define DISABLE_HIGHWAY_OPTIMIZATION (defined(__arm__) || defined(__aarch64__))
-#define DISABLE_HIGHWAY_OPTIMIZATION (defined(__arm__) || defined(__aarch64__))
 
 template<typename T>
 inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth)
@@ -55,12 +56,14 @@ inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth)
             #endif
             NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
         }
+        #if !DISABLE_HIGHWAY_OPTIMIZATION
         else if constexpr (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
             #ifndef NPY_DISABLE_OPTIMIZATION
                 #include "simd_qsort.dispatch.h"
             #endif
             NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
         }
+        #endif
         if (dispfunc) {
             (*dispfunc)(reinterpret_cast<TF*>(v), num, kth);
             return true;
@@ -85,7 +88,7 @@ inline bool argquickselect_dispatch(T* v, npy_intp* arg, npy_intp num, npy_intp
         sizeof(npy_intp) == sizeof(int64_t)) {
         using TF = typename np::meta::FixedWidth<T>::Type;
         #ifndef NPY_DISABLE_OPTIMIZATION
-            #include "simd_qsort.dispatch.h"
+            #include "simd_argsort.dispatch.h"
         #endif
         void (*dispfunc)(TF*, npy_intp*, npy_intp, npy_intp) = nullptr;
         NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSelect, <TF>);

diff --git a/numpy/_core/src/npysort/simd_argsort.dispatch.cpp b/numpy/_core/src/npysort/simd_argsort.dispatch.cpp
@@ -0,0 +1,71 @@
+/*@targets
+ * $maxopt $keep_baseline
+ * avx512_skx
+ */
+// policy $keep_baseline is used to avoid skipping the build of avx512_skx
+// when it's part of the baseline features (--cpu-baseline), since the
+// 'baseline' option isn't specified within targets.
+
+#include "simd_qsort.hpp"
+#ifndef __CYGWIN__
+
+#if defined(NPY_HAVE_AVX512_SKX)
+    #include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {

+#if defined(NPY_HAVE_AVX512_SKX)
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
+{
+    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
+{
+    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
+{
+    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
+{
+    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth)
+{
+    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth)
+{
+    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
+{
+    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
+{
+    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
+{
+    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
+{
+    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
+{
+    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
+{
+    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+}
+#endif
+
+}} // namespace np::qsort_simd
+
+#endif // __CYGWIN__
diff --git a/numpy/_core/src/npysort/simd_qsort.dispatch.cpp b/numpy/_core/src/npysort/simd_qsort.dispatch.cpp
@@ -1,5 +1,7 @@
 /*@targets
- * $maxopt $keep_baseline avx512_skx
+ * $maxopt $keep_baseline
+ * avx512_skx
+ * asimd
  */
 // policy $keep_baseline is used to avoid skip building avx512_skx
 // when its part of baseline features (--cpu-baseline), since
@@ -8,39 +10,19 @@
 #include "simd_qsort.hpp"
 #ifndef __CYGWIN__
 
+#define USE_HIGHWAY defined(__aarch64__)
+
 #if defined(NPY_HAVE_AVX512_SKX)
     #include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
     #include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
-    #include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
+#elif USE_HIGHWAY
+    #define VQSORT_ONLY_STATIC 1
+    #include "hwy/contrib/sort/vqsort-inl.h"
 #endif
 
 namespace np { namespace qsort_simd {
 
 #if defined(NPY_HAVE_AVX512_SKX)
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
-{
-    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
-}
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
-{
-    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
-}
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
-{
-    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
-}
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
-{
-    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
-}
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth)
-{
-    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
-}
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth)
-{
-    avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
-}
 template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int32_t *arr, npy_intp num, npy_intp kth)
 {
     avx512_qselect(arr, kth, num, true);
@@ -89,31 +71,32 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
 {
     avx512_qsort(arr, size);
 }
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
+#elif USE_HIGHWAY
-#elif USE_HIGHWAY
-#elif USE_HIGHWAY
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
 {
-    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+    hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
 }
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
 {
-    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+    hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
 }
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
 {
-    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+    hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
 }
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
 {
-    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+    hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
 }
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
 {
-    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+    hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
 }
-template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
 {
-    avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
+    hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
 }
-#endif  // NPY_HAVE_AVX512_SKX
+#endif
 
 }} // namespace np::simd
 

diff --git a/numpy/_core/src/npysort/simd_qsort.hpp b/numpy/_core/src/npysort/simd_qsort.hpp
@@ -3,13 +3,21 @@
 
 #include "common.hpp"
 
+#define DISABLE_HIGHWAY_OPTIMIZATION defined(__arm__)
+
 namespace np { namespace qsort_simd {
 
+#if !DISABLE_HIGHWAY_OPTIMIZATION
 #ifndef NPY_DISABLE_OPTIMIZATION
     #include "simd_qsort.dispatch.h"
 #endif
 NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
 NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))
+#endif
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "simd_argsort.dispatch.h"
+#endif
 NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSort, (T *arr, npy_intp* arg, npy_intp size))
 NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSelect, (T *arr, npy_intp* arg, npy_intp kth, npy_intp size))
 
@@ -20,4 +28,7 @@ NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t siz
 NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))
 
 } } // np::qsort_simd
+
+#undef DISABLE_HIGHWAY_OPTIMIZATION
+
 #endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP