diff --git a/.gitmodules b/.gitmodules index 9c92105bad8e..9847e2842fbc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "vendored-meson/meson"] path = vendored-meson/meson url = https://github.com/numpy/meson.git +[submodule "numpy/_core/src/highway"] + path = numpy/_core/src/highway + url = https://github.com/google/highway.git diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 8ed696531b4c..7f61aee6dd52 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -94,6 +94,9 @@ if use_svml error('Missing the `SVML` git submodule! Run `git submodule update --init` to fix this.') endif endif +if not fs.exists('src/highway/README.md') + error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.') +endif if not fs.exists('src/npysort/x86-simd-sort/README.md') error('Missing the `x86-simd-sort` git submodule! Run `git submodule update --init` to fix this.') endif @@ -761,13 +764,18 @@ foreach gen_mtargets : [ [ 'simd_qsort.dispatch.h', 'src/npysort/simd_qsort.dispatch.cpp', - [AVX512_SKX] + [AVX512_SKX, ASIMD] ], [ 'simd_qsort_16bit.dispatch.h', 'src/npysort/simd_qsort_16bit.dispatch.cpp', [AVX512_SPR, AVX512_ICL] ], + [ + 'simd_argsort.dispatch.h', + 'src/npysort/simd_argsort.dispatch.cpp', + [AVX512_SKX] + ], ] mtargets = mod_features.multi_targets( gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1], @@ -782,7 +790,8 @@ foreach gen_mtargets : [ 'src/common', 'src/multiarray', 'src/npymath', - 'src/umath' + 'src/umath', + 'src/highway', ] ) if not is_variable('multiarray_umath_mtargets') diff --git a/numpy/_core/src/highway b/numpy/_core/src/highway new file mode 160000 index 000000000000..65d30ea17f3f --- /dev/null +++ b/numpy/_core/src/highway @@ -0,0 +1 @@ +Subproject commit 65d30ea17f3fde10dfe3805b2dc5c22ad59d9363 diff --git a/numpy/_core/src/npysort/quicksort.cpp b/numpy/_core/src/npysort/quicksort.cpp index 4fb7aee17da3..4ffbf5a4ab8a 100644 --- 
a/numpy/_core/src/npysort/quicksort.cpp +++ b/numpy/_core/src/npysort/quicksort.cpp @@ -60,6 +60,8 @@ #include #define NOT_USED NPY_UNUSED(unused) +/* The 32/64-bit simd_qsort dispatch below is compiled out when __arm__ is defined. */ + /* * pushing largest partition has upper bound of log2(n) space * we store two pointers each time @@ -83,12 +85,14 @@ inline bool quicksort_dispatch(T *start, npy_intp num) #endif NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, ); } + #if !defined(__arm__) else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { #ifndef NPY_DISABLE_OPTIMIZATION #include "simd_qsort.dispatch.h" #endif NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, ); } + #endif if (dispfunc) { (*dispfunc)(reinterpret_cast(start), static_cast(num)); return true; @@ -105,7 +109,7 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num) using TF = typename np::meta::FixedWidth::Type; void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr; #ifndef NPY_DISABLE_OPTIMIZATION - #include "simd_qsort.dispatch.h" + #include "simd_argsort.dispatch.h" #endif /* x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes * in 32-bit*/ diff --git a/numpy/_core/src/npysort/selection.cpp b/numpy/_core/src/npysort/selection.cpp index c8bd10d708c2..c09ad4c60a6d 100644 --- a/numpy/_core/src/npysort/selection.cpp +++ b/numpy/_core/src/npysort/selection.cpp @@ -28,6 +28,7 @@ #include "simd_qsort.hpp" #define NOT_USED NPY_UNUSED(unused) +/* The 32/64-bit simd_qsort dispatch below is compiled out when __arm__ or __aarch64__ is defined. */ template inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth) @@ -55,12 +56,14 @@ inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth) #endif NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, ); } + #if !(defined(__arm__) || defined(__aarch64__)) else if constexpr (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { #ifndef NPY_DISABLE_OPTIMIZATION #include
"simd_qsort.dispatch.h" #endif NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, ); } + #endif if (dispfunc) { (*dispfunc)(reinterpret_cast(v), num, kth); return true; @@ -85,7 +88,7 @@ inline bool argquickselect_dispatch(T* v, npy_intp* arg, npy_intp num, npy_intp sizeof(npy_intp) == sizeof(int64_t)) { using TF = typename np::meta::FixedWidth::Type; #ifndef NPY_DISABLE_OPTIMIZATION - #include "simd_qsort.dispatch.h" + #include "simd_argsort.dispatch.h" #endif void (*dispfunc)(TF*, npy_intp*, npy_intp, npy_intp) = nullptr; NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSelect, ); diff --git a/numpy/_core/src/npysort/simd_argsort.dispatch.cpp b/numpy/_core/src/npysort/simd_argsort.dispatch.cpp new file mode 100644 index 000000000000..05ecc946ee45 --- /dev/null +++ b/numpy/_core/src/npysort/simd_argsort.dispatch.cpp @@ -0,0 +1,71 @@ +/*@targets + * $maxopt $keep_baseline + * avx512_skx + */ +// policy $keep_baseline is used to avoid skip building avx512_skx +// when its part of baseline features (--cpu-baseline), since +// 'baseline' option isn't specified within targets. 
+ +#include "simd_qsort.hpp" +#ifndef __CYGWIN__ + +#if defined(NPY_HAVE_AVX512_SKX) + #include "x86-simd-sort/src/avx512-64bit-argsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#if defined(NPY_HAVE_AVX512_SKX) +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) +{ + avx512_argselect(arr, reinterpret_cast(arg), kth, num); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) +{ + avx512_argselect(arr, reinterpret_cast(arg), kth, num); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) +{ + avx512_argselect(arr, reinterpret_cast(arg), kth, num); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) +{ + avx512_argselect(arr, reinterpret_cast(arg), kth, num); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth) +{ + avx512_argselect(arr, reinterpret_cast(arg), kth, num); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth) +{ + avx512_argselect(arr, reinterpret_cast(arg), kth, num); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size) +{ + avx512_argsort(arr, reinterpret_cast(arg), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size) +{ + avx512_argsort(arr, reinterpret_cast(arg), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size) +{ + avx512_argsort(arr, reinterpret_cast(arg), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size) +{ + avx512_argsort(arr, reinterpret_cast(arg), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size) +{ + avx512_argsort(arr, 
reinterpret_cast(arg), size); +} +template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size) +{ + avx512_argsort(arr, reinterpret_cast(arg), size); +} +#endif + +}} // namespace np::qsort_simd + +#endif // __CYGWIN__ diff --git a/numpy/_core/src/npysort/simd_qsort.dispatch.cpp b/numpy/_core/src/npysort/simd_qsort.dispatch.cpp index f492da041bc8..5ad51bbc759a 100644 --- a/numpy/_core/src/npysort/simd_qsort.dispatch.cpp +++ b/numpy/_core/src/npysort/simd_qsort.dispatch.cpp @@ -1,5 +1,7 @@ /*@targets - * $maxopt $keep_baseline avx512_skx + * $maxopt $keep_baseline + * avx512_skx + * asimd */ // policy $keep_baseline is used to avoid skip building avx512_skx // when its part of baseline features (--cpu-baseline), since @@ -8,39 +10,19 @@ #include "simd_qsort.hpp" #ifndef __CYGWIN__ +/* Highway's vqsort is selected below only when __aarch64__ is defined. */ + #if defined(NPY_HAVE_AVX512_SKX) #include "x86-simd-sort/src/avx512-32bit-qsort.hpp" #include "x86-simd-sort/src/avx512-64bit-qsort.hpp" - #include "x86-simd-sort/src/avx512-64bit-argsort.hpp" +#elif defined(__aarch64__) + #define VQSORT_ONLY_STATIC 1 + #include "hwy/contrib/sort/vqsort-inl.h" #endif namespace np { namespace qsort_simd { #if defined(NPY_HAVE_AVX512_SKX) -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) -{ - avx512_argselect(arr, reinterpret_cast(arg), kth, num); -} -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) -{ - avx512_argselect(arr, reinterpret_cast(arg), kth, num); -} -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) -{ - avx512_argselect(arr, reinterpret_cast(arg), kth, num); -} -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) -{ - avx512_argselect(arr, reinterpret_cast(arg), kth, num); -} -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg,
npy_intp num, npy_intp kth) -{ - avx512_argselect(arr, reinterpret_cast(arg), kth, num); -} -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth) -{ - avx512_argselect(arr, reinterpret_cast(arg), kth, num); -} template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int32_t *arr, npy_intp num, npy_intp kth) { avx512_qselect(arr, kth, num, true); @@ -89,31 +71,32 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size) { avx512_qsort(arr, size); } -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size) +#elif defined(__aarch64__) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size) { - avx512_argsort(arr, reinterpret_cast(arg), size); + hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); } -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size) { - avx512_argsort(arr, reinterpret_cast(arg), size); + hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); } -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size) { - avx512_argsort(arr, reinterpret_cast(arg), size); + hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); } -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size) { - avx512_argsort(arr, reinterpret_cast(arg), size); + hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); } -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size) { - avx512_argsort(arr, reinterpret_cast(arg), size); + hwy::HWY_NAMESPACE::VQSortStatic(arr, size,
hwy::SortAscending()); } -template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size) { - avx512_argsort(arr, reinterpret_cast(arg), size); + hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); } -#endif // NPY_HAVE_AVX512_SKX +#endif }} // namespace np::simd diff --git a/numpy/_core/src/npysort/simd_qsort.hpp b/numpy/_core/src/npysort/simd_qsort.hpp index c45e5bf331f2..5e9030adcb4a 100644 --- a/numpy/_core/src/npysort/simd_qsort.hpp +++ b/numpy/_core/src/npysort/simd_qsort.hpp @@ -3,13 +3,26 @@ #include "common.hpp" +/* Expand defined() here: a `defined` token produced by macro expansion inside #if is undefined behaviour. */ +#if defined(__arm__) +#define DISABLE_HIGHWAY_OPTIMIZATION 1 +#else +#define DISABLE_HIGHWAY_OPTIMIZATION 0 +#endif + namespace np { namespace qsort_simd { +#if !DISABLE_HIGHWAY_OPTIMIZATION #ifndef NPY_DISABLE_OPTIMIZATION #include "simd_qsort.dispatch.h" #endif NPY_CPU_DISPATCH_DECLARE(template void QSort, (T *arr, intptr_t size)) NPY_CPU_DISPATCH_DECLARE(template void QSelect, (T* arr, npy_intp num, npy_intp kth)) +#endif + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_argsort.dispatch.h" +#endif NPY_CPU_DISPATCH_DECLARE(template void ArgQSort, (T *arr, npy_intp* arg, npy_intp size)) NPY_CPU_DISPATCH_DECLARE(template void ArgQSelect, (T *arr, npy_intp* arg, npy_intp kth, npy_intp size)) @@ -20,4 +33,7 @@ NPY_CPU_DISPATCH_DECLARE(template void QSort, (T *arr, intptr_t siz NPY_CPU_DISPATCH_DECLARE(template void QSelect, (T* arr, npy_intp num, npy_intp kth)) } } // np::qsort_simd + +#undef DISABLE_HIGHWAY_OPTIMIZATION + #endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP