ENH: Use Highway's VQSort on AArch64 by Mousius · Pull Request #24018 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Use Highway's VQSort on AArch64 #24018

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@
[submodule "vendored-meson/meson"]
path = vendored-meson/meson
url = https://github.com/numpy/meson.git
[submodule "numpy/_core/src/highway"]
path = numpy/_core/src/highway
url = https://github.com/google/highway.git
13 changes: 11 additions & 2 deletions numpy/_core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ if use_svml
error('Missing the `SVML` git submodule! Run `git submodule update --init` to fix this.')
endif
endif
if not fs.exists('src/highway/README.md')
error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.')
endif
if not fs.exists('src/npysort/x86-simd-sort/README.md')
error('Missing the `x86-simd-sort` git submodule! Run `git submodule update --init` to fix this.')
endif
Expand Down Expand Up @@ -761,13 +764,18 @@ foreach gen_mtargets : [
[
'simd_qsort.dispatch.h',
'src/npysort/simd_qsort.dispatch.cpp',
[AVX512_SKX]
[AVX512_SKX, ASIMD]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
[AVX512_SKX, ASIMD]
[AVX512_SKX]

Based on the presented suggestion, there would be no purpose in adding this target.

],
[
'simd_qsort_16bit.dispatch.h',
'src/npysort/simd_qsort_16bit.dispatch.cpp',
[AVX512_SPR, AVX512_ICL]
],
[
'simd_argsort.dispatch.h',
'src/npysort/simd_argsort.dispatch.cpp',
[AVX512_SKX]
],
Comment on lines +774 to +778
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
[
'simd_argsort.dispatch.h',
'src/npysort/simd_argsort.dispatch.cpp',
[AVX512_SKX]
],
[
'highway_qsort.dispatch.h',
'src/npysort/highway_qsort.dispatch.cpp',
[[AVX2, FMA3], SSE42, ASIMD, VSX2]
],

Create an independent source and also a separate configuration header for the CPU targets. Please note that this will require creating a separate header, similar to simd_qsort.hpp, containing the forward declarations of the defined highway sort functions while considering the use of a different namespace, for example, highway_qsort.

Another note here: Are there any reasons to limit the use to the ARM architecture?

]
mtargets = mod_features.multi_targets(
gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
Expand All @@ -782,7 +790,8 @@ foreach gen_mtargets : [
'src/common',
'src/multiarray',
'src/npymath',
'src/umath'
'src/umath',
'src/highway',
]
)
if not is_variable('multiarray_umath_mtargets')
Expand Down
1 change: 1 addition & 0 deletions numpy/_core/src/highway
Submodule highway added at 65d30e
6 changes: 5 additions & 1 deletion numpy/_core/src/npysort/quicksort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
#include <utility>

#define NOT_USED NPY_UNUSED(unused)
// 1 when building for 32-bit ARM (__arm__ defined), 0 otherwise.
// The value is fixed here with a real #if instead of letting the macro expand
// to `defined(...)`: a `defined` token produced by macro expansion inside an
// #if condition is undefined behaviour and is handled inconsistently by
// different preprocessors.
#if defined(__arm__)
#define DISABLE_HIGHWAY_OPTIMIZATION 1
#else
#define DISABLE_HIGHWAY_OPTIMIZATION 0
#endif

/*
* pushing largest partition has upper bound of log2(n) space
* we store two pointers each time
Expand All @@ -83,12 +85,14 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
#if !DISABLE_HIGHWAY_OPTIMIZATION
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
#endif
Comment on lines +88 to +95
Copy link
Member
@seiko2plus seiko2plus Nov 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#if !DISABLE_HIGHWAY_OPTIMIZATION
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
#endif
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
if (dispfunc == nullptr) {
// Priority is given to the Intel sort library due to its efficient support for AVX512.
// For other CPU targets, we fall back to Google's Highway sort.
#ifndef NPY_DISABLE_OPTIMIZATION
#include "highway_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::highway_qsort::template QSort, <TF>);
}
}
#endif

Use Google Highway only after confirming that the Intel library is unavailable, either because of architectural differences or because the AVX512 features are not available at runtime.
Also, note that there is no need for the DISABLE_HIGHWAY_OPTIMIZATION guard, as the targeted features are controlled through Meson.

if (dispfunc) {
(*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num));
return true;
Expand All @@ -105,7 +109,7 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num)
using TF = typename np::meta::FixedWidth<T>::Type;
void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr;
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#include "simd_argsort.dispatch.h"
#endif
/* x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes
* in 32-bit*/
Expand Down
5 changes: 4 additions & 1 deletion numpy/_core/src/npysort/selection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "simd_qsort.hpp"

#define NOT_USED NPY_UNUSED(unused)
// 1 when building for 32-bit ARM or AArch64 (__arm__ or __aarch64__ defined),
// 0 otherwise.
// The value is fixed here with a real #if instead of letting the macro expand
// to `defined(...)`: a `defined` token produced by macro expansion inside an
// #if condition is undefined behaviour and is handled inconsistently by
// different preprocessors.
#if defined(__arm__) || defined(__aarch64__)
#define DISABLE_HIGHWAY_OPTIMIZATION 1
#else
#define DISABLE_HIGHWAY_OPTIMIZATION 0
#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#define DISABLE_HIGHWAY_OPTIMIZATION (defined(__arm__) || defined(__aarch64__))

Based on the presented suggestion, there would be no purpose in keeping this #define.


template<typename T>
inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth)
Expand Down Expand Up @@ -55,12 +56,14 @@ inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth)
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
}
#if !DISABLE_HIGHWAY_OPTIMIZATION
else if constexpr (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
}
#endif
if (dispfunc) {
(*dispfunc)(reinterpret_cast<TF*>(v), num, kth);
return true;
Expand All @@ -85,7 +88,7 @@ inline bool argquickselect_dispatch(T* v, npy_intp* arg, npy_intp num, npy_intp
sizeof(npy_intp) == sizeof(int64_t)) {
using TF = typename np::meta::FixedWidth<T>::Type;
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#include "simd_argsort.dispatch.h"
#endif
void (*dispfunc)(TF*, npy_intp*, npy_intp, npy_intp) = nullptr;
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSelect, <TF>);
Expand Down
71 changes: 71 additions & 0 deletions numpy/_core/src/npysort/simd_argsort.dispatch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*@targets
* $maxopt $keep_baseline
* avx512_skx
*/
// The $keep_baseline policy is used to avoid skipping the avx512_skx build
// when it's part of the baseline features (--cpu-baseline), since the
// 'baseline' option isn't specified within targets.

#include "simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SKX)
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
#endif

namespace np { namespace qsort_simd {

// AVX512_SKX specializations of the index-based ("arg") select and sort
// dispatch targets. Every specialization is a thin forwarder: the npy_intp
// index buffer is reinterpreted as int64_t* and handed to the corresponding
// avx512_* routine together with the data pointer.
#ifdef NPY_HAVE_AVX512_SKX

// ---- ArgQSelect -----------------------------------------------------------
// Note the argument order: the wrappers receive (num, kth) but forward them
// to avx512_argselect as (kth, num).
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argselect(arr, indices, kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argselect(arr, indices, kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argselect(arr, indices, kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argselect(arr, indices, kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argselect(arr, indices, kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argselect(arr, indices, kth, num);
}

// ---- ArgQSort -------------------------------------------------------------
// Forwards (arr, indices, size) unchanged to avx512_argsort.
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argsort(arr, indices, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argsort(arr, indices, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argsort(arr, indices, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argsort(arr, indices, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argsort(arr, indices, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
{
    int64_t *const indices = reinterpret_cast<int64_t*>(arg);
    avx512_argsort(arr, indices, size);
}

#endif // NPY_HAVE_AVX512_SKX

}} // namespace np::qsort_simd

#endif // __CYGWIN__
61 changes: 22 additions & 39 deletions numpy/_core/src/npysort/simd_qsort.dispatch.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
/*@targets
* $maxopt $keep_baseline avx512_skx
* $maxopt $keep_baseline
* avx512_skx
* asimd
*/
// The $keep_baseline policy is used to avoid skipping the avx512_skx build
// when it's part of the baseline features (--cpu-baseline), since
Expand All @@ -8,39 +10,19 @@
#include "simd_qsort.hpp"
#ifndef __CYGWIN__

// 1 when building for AArch64 (__aarch64__ defined), 0 otherwise.
// The value is fixed here with a real #if instead of letting the macro expand
// to `defined(...)`: a `defined` token produced by macro expansion inside an
// #if/#elif condition is undefined behaviour and is handled inconsistently by
// different preprocessors.
#if defined(__aarch64__)
#define USE_HIGHWAY 1
#else
#define USE_HIGHWAY 0
#endif

#if defined(NPY_HAVE_AVX512_SKX)
#include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
#elif USE_HIGHWAY
#define VQSORT_ONLY_STATIC 1
#include "hwy/contrib/sort/vqsort-inl.h"
#endif

namespace np { namespace qsort_simd {

#if defined(NPY_HAVE_AVX512_SKX)
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int32_t *arr, npy_intp num, npy_intp kth)
{
avx512_qselect(arr, kth, num, true);
Expand Down Expand Up @@ -89,31 +71,32 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
{
avx512_qsort(arr, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
#elif USE_HIGHWAY
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#elif USE_HIGHWAY

Based on the proposed suggestion, the following functions in this modification should be defined within a separate new source highway_qsort.dispatch.cpp.

template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
#endif // NPY_HAVE_AVX512_SKX
#endif

}} // namespace np::simd

Expand Down
11 changes: 11 additions & 0 deletions numpy/_core/src/npysort/simd_qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,21 @@

#include "common.hpp"

// 1 when building for 32-bit ARM (__arm__ defined), 0 otherwise.
// The value is fixed here with a real #if instead of letting the macro expand
// to `defined(...)`: a `defined` token produced by macro expansion inside an
// #if condition is undefined behaviour and is handled inconsistently by
// different preprocessors.
#if defined(__arm__)
#define DISABLE_HIGHWAY_OPTIMIZATION 1
#else
#define DISABLE_HIGHWAY_OPTIMIZATION 0
#endif

namespace np { namespace qsort_simd {

#if !DISABLE_HIGHWAY_OPTIMIZATION
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))
#endif

#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_argsort.dispatch.h"
#endif
// Dispatch declarations for the index-based sort and select kernels.
// ArgQSort:   (data, index buffer, element count).
// ArgQSelect: (data, index buffer, element count, kth). The parameter names
// follow the order used by the ArgQSelect specializations and by the QSelect
// declaration, i.e. `num` comes before `kth`.
NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSort, (T *arr, npy_intp* arg, npy_intp size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSelect, (T *arr, npy_intp* arg, npy_intp num, npy_intp kth))

Expand All @@ -20,4 +28,7 @@ NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t siz
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))

} } // np::qsort_simd

#undef DISABLE_HIGHWAY_OPTIMIZATION

#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
0