-
-
Notifications
You must be signed in to change notification settings - Fork 10.8k
ENH: Use Highway's VQSort on AArch64 #24018
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a8082fe
edac5aa
11ab024
003c3a4
87f6d19
4d139b3
59443e8
278e1de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -94,6 +94,9 @@ if use_svml | |||||||||||||||||||||
error('Missing the `SVML` git submodule! Run `git submodule update --init` to fix this.') | ||||||||||||||||||||||
endif | ||||||||||||||||||||||
endif | ||||||||||||||||||||||
if not fs.exists('src/highway/README.md') | ||||||||||||||||||||||
error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.') | ||||||||||||||||||||||
endif | ||||||||||||||||||||||
if not fs.exists('src/npysort/x86-simd-sort/README.md') | ||||||||||||||||||||||
error('Missing the `x86-simd-sort` git submodule! Run `git submodule update --init` to fix this.') | ||||||||||||||||||||||
endif | ||||||||||||||||||||||
|
@@ -761,13 +764,18 @@ foreach gen_mtargets : [ | |||||||||||||||||||||
[ | ||||||||||||||||||||||
'simd_qsort.dispatch.h', | ||||||||||||||||||||||
'src/npysort/simd_qsort.dispatch.cpp', | ||||||||||||||||||||||
[AVX512_SKX] | ||||||||||||||||||||||
[AVX512_SKX, ASIMD] | ||||||||||||||||||||||
], | ||||||||||||||||||||||
[ | ||||||||||||||||||||||
'simd_qsort_16bit.dispatch.h', | ||||||||||||||||||||||
'src/npysort/simd_qsort_16bit.dispatch.cpp', | ||||||||||||||||||||||
[AVX512_SPR, AVX512_ICL] | ||||||||||||||||||||||
], | ||||||||||||||||||||||
[ | ||||||||||||||||||||||
'simd_argsort.dispatch.h', | ||||||||||||||||||||||
'src/npysort/simd_argsort.dispatch.cpp', | ||||||||||||||||||||||
[AVX512_SKX] | ||||||||||||||||||||||
], | ||||||||||||||||||||||
Comment on lines
+774
to
+778
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Create an independent source and also a separate configuration header for the CPU targets. Please note that this will require creating a separate header, similar to simd_qsort.hpp, containing the forward declarations of the defined Highway sort functions, while considering the use of a different namespace. Another note here: are there any reasons to limit the use to the ARM architecture? |
||||||||||||||||||||||
] | ||||||||||||||||||||||
mtargets = mod_features.multi_targets( | ||||||||||||||||||||||
gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1], | ||||||||||||||||||||||
|
@@ -782,7 +790,8 @@ foreach gen_mtargets : [ | |||||||||||||||||||||
'src/common', | ||||||||||||||||||||||
'src/multiarray', | ||||||||||||||||||||||
'src/npymath', | ||||||||||||||||||||||
'src/umath' | ||||||||||||||||||||||
'src/umath', | ||||||||||||||||||||||
'src/highway', | ||||||||||||||||||||||
] | ||||||||||||||||||||||
) | ||||||||||||||||||||||
if not is_variable('multiarray_umath_mtargets') | ||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -60,6 +60,8 @@ | |||||||||||||||||||||||||||||||||||||||||||||||
#include <utility> | ||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||
#define NOT_USED NPY_UNUSED(unused) | ||||||||||||||||||||||||||||||||||||||||||||||||
// Evaluates to 1 when the Highway-backed dispatch branch must be compiled out
// (32-bit ARM), 0 otherwise. The condition is evaluated here, at definition
// time, because a macro that expands to `defined(...)` inside an #if
// expression has undefined behaviour.
#if defined(__arm__)
#define DISABLE_HIGHWAY_OPTIMIZATION 1
#else
#define DISABLE_HIGHWAY_OPTIMIZATION 0
#endif
|
||||||||||||||||||||||||||||||||||||||||||||||||
/* | ||||||||||||||||||||||||||||||||||||||||||||||||
* pushing largest partition has upper bound of log2(n) space | ||||||||||||||||||||||||||||||||||||||||||||||||
* we store two pointers each time | ||||||||||||||||||||||||||||||||||||||||||||||||
|
@@ -83,12 +85,14 @@ inline bool quicksort_dispatch(T *start, npy_intp num) | |||||||||||||||||||||||||||||||||||||||||||||||
#endif | ||||||||||||||||||||||||||||||||||||||||||||||||
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>); | ||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||
#if !DISABLE_HIGHWAY_OPTIMIZATION | ||||||||||||||||||||||||||||||||||||||||||||||||
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { | ||||||||||||||||||||||||||||||||||||||||||||||||
#ifndef NPY_DISABLE_OPTIMIZATION | ||||||||||||||||||||||||||||||||||||||||||||||||
#include "simd_qsort.dispatch.h" | ||||||||||||||||||||||||||||||||||||||||||||||||
#endif | ||||||||||||||||||||||||||||||||||||||||||||||||
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>); | ||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||
#endif | ||||||||||||||||||||||||||||||||||||||||||||||||
Comment on lines
+88
to
+95
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Use Google Highway only after confirming that the Intel library is unavailable, either because of architectural differences or because the AVX512 features are not available at runtime. |
||||||||||||||||||||||||||||||||||||||||||||||||
if (dispfunc) { | ||||||||||||||||||||||||||||||||||||||||||||||||
(*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num)); | ||||||||||||||||||||||||||||||||||||||||||||||||
return true; | ||||||||||||||||||||||||||||||||||||||||||||||||
|
@@ -105,7 +109,7 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num) | |||||||||||||||||||||||||||||||||||||||||||||||
using TF = typename np::meta::FixedWidth<T>::Type; | ||||||||||||||||||||||||||||||||||||||||||||||||
void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr; | ||||||||||||||||||||||||||||||||||||||||||||||||
#ifndef NPY_DISABLE_OPTIMIZATION | ||||||||||||||||||||||||||||||||||||||||||||||||
#include "simd_qsort.dispatch.h" | ||||||||||||||||||||||||||||||||||||||||||||||||
#include "simd_argsort.dispatch.h" | ||||||||||||||||||||||||||||||||||||||||||||||||
#endif | ||||||||||||||||||||||||||||||||||||||||||||||||
/* x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes | ||||||||||||||||||||||||||||||||||||||||||||||||
* in 32-bit*/ | ||||||||||||||||||||||||||||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -28,6 +28,7 @@ | |||
#include "simd_qsort.hpp" | ||||
|
||||
#define NOT_USED NPY_UNUSED(unused) | ||||
// Evaluates to 1 when the Highway-backed dispatch branch must be compiled out
// (32-bit ARM or AArch64), 0 otherwise. The condition is evaluated here, at
// definition time, because a macro that expands to `defined(...)` inside an
// #if expression has undefined behaviour.
#if defined(__arm__) || defined(__aarch64__)
#define DISABLE_HIGHWAY_OPTIMIZATION 1
#else
#define DISABLE_HIGHWAY_OPTIMIZATION 0
#endif
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Based on the presented suggestion, there would be no purpose in keeping this `#define`. |
||||
|
||||
template<typename T> | ||||
inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth) | ||||
|
@@ -55,12 +56,14 @@ inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth) | |||
#endif | ||||
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>); | ||||
} | ||||
#if !DISABLE_HIGHWAY_OPTIMIZATION | ||||
else if constexpr (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { | ||||
#ifndef NPY_DISABLE_OPTIMIZATION | ||||
#include "simd_qsort.dispatch.h" | ||||
#endif | ||||
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>); | ||||
} | ||||
#endif | ||||
if (dispfunc) { | ||||
(*dispfunc)(reinterpret_cast<TF*>(v), num, kth); | ||||
return true; | ||||
|
@@ -85,7 +88,7 @@ inline bool argquickselect_dispatch(T* v, npy_intp* arg, npy_intp num, npy_intp | |||
sizeof(npy_intp) == sizeof(int64_t)) { | ||||
using TF = typename np::meta::FixedWidth<T>::Type; | ||||
#ifndef NPY_DISABLE_OPTIMIZATION | ||||
#include "simd_qsort.dispatch.h" | ||||
#include "simd_argsort.dispatch.h" | ||||
#endif | ||||
void (*dispfunc)(TF*, npy_intp*, npy_intp, npy_intp) = nullptr; | ||||
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSelect, <TF>); | ||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/*@targets | ||
* $maxopt $keep_baseline | ||
* avx512_skx | ||
*/ | ||
// The $keep_baseline policy prevents the avx512_skx build from being skipped | ||
// when it is part of the baseline features (--cpu-baseline), since the | ||
// 'baseline' option isn't specified within targets. | ||
|
||
#include "simd_qsort.hpp" | ||
#ifndef __CYGWIN__ | ||
|
||
#if defined(NPY_HAVE_AVX512_SKX) | ||
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp" | ||
#endif | ||
|
||
namespace np { namespace qsort_simd { | ||
|
||
// Explicit ArgQSelect / ArgQSort specializations, compiled only when
// NPY_HAVE_AVX512_SKX is defined. Each one is a thin forwarder to
// avx512_argselect / avx512_argsort (presumably provided by the
// x86-simd-sort argsort header included earlier in this file -- confirm).
//
// The `arg` index buffer is reinterpreted as int64_t*, which assumes that
// npy_intp is 8 bytes wide. NOTE(review): confirm every caller guards this.
#if defined(NPY_HAVE_AVX512_SKX) | ||
// ArgQSelect wrappers: avx512_argselect is called with (kth, num), i.e. the
// last two arguments are passed in the opposite order from the wrapper's
// own (num, kth) parameters.
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||
{ | ||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||
{ | ||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||
{ | ||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||
{ | ||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||
{ | ||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||
{ | ||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||
} | ||
// ArgQSort wrappers: forward (arr, arg, size) to avx512_argsort, changing
// only the type of the index pointer.
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size) | ||
{ | ||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size) | ||
{ | ||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size) | ||
{ | ||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size) | ||
{ | ||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size) | ||
{ | ||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||
} | ||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size) | ||
{ | ||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||
} | ||
#endif | ||
|
||
}} // namespace np::qsort_simd | ||
|
||
#endif // __CYGWIN__ |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -1,5 +1,7 @@ | ||||
/*@targets | ||||
* $maxopt $keep_baseline avx512_skx | ||||
* $maxopt $keep_baseline | ||||
* avx512_skx | ||||
* asimd | ||||
*/ | ||||
// The $keep_baseline policy prevents the avx512_skx build from being skipped | ||||
// when it is part of the baseline features (--cpu-baseline), since | ||||
|
@@ -8,39 +10,19 @@ | |||
#include "simd_qsort.hpp" | ||||
#ifndef __CYGWIN__ | ||||
|
||||
// Evaluates to 1 when building for AArch64, where the Highway VQSort
// implementation is selected, and 0 otherwise. The condition is evaluated
// here, at definition time, because a macro that expands to `defined(...)`
// inside an #if / #elif expression has undefined behaviour.
#if defined(__aarch64__)
#define USE_HIGHWAY 1
#else
#define USE_HIGHWAY 0
#endif
|
||||
#if defined(NPY_HAVE_AVX512_SKX) | ||||
#include "x86-simd-sort/src/avx512-32bit-qsort.hpp" | ||||
#include "x86-simd-sort/src/avx512-64bit-qsort.hpp" | ||||
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp" | ||||
#elif USE_HIGHWAY | ||||
#define VQSORT_ONLY_STATIC 1 | ||||
#include "hwy/contrib/sort/vqsort-inl.h" | ||||
#endif | ||||
|
||||
namespace np { namespace qsort_simd { | ||||
|
||||
#if defined(NPY_HAVE_AVX512_SKX) | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||||
{ | ||||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||||
{ | ||||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||||
{ | ||||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||||
{ | ||||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||||
{ | ||||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth) | ||||
{ | ||||
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int32_t *arr, npy_intp num, npy_intp kth) | ||||
{ | ||||
avx512_qselect(arr, kth, num, true); | ||||
|
@@ -89,31 +71,32 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size) | |||
{ | ||||
avx512_qsort(arr, size); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size) | ||||
#elif USE_HIGHWAY | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Based on the proposed suggestion, the following functions in this modification should be defined within a separate new source file. |
||||
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size) | ||||
{ | ||||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||||
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size) | ||||
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size) | ||||
{ | ||||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||||
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size) | ||||
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size) | ||||
{ | ||||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||||
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size) | ||||
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size) | ||||
{ | ||||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||||
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size) | ||||
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size) | ||||
{ | ||||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||||
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); | ||||
} | ||||
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size) | ||||
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size) | ||||
{ | ||||
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size); | ||||
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); | ||||
} | ||||
#endif // NPY_HAVE_AVX512_SKX | ||||
#endif | ||||
|
||||
}} // namespace np::qsort_simd | ||||
|
||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Based on the presented suggestion, there would be no purpose in adding this target.