8000 ENH: Use Highway's VQSort on AArch64 · numpy/numpy@e05a00c · GitHub
[go: up one dir, main page]

Skip to content

Commit e05a00c

Browse files
committed
ENH: Use Highway's VQSort on AArch64
Introduces Highway, a SIMD library from Google. Highway has it's own dispatch mechanism and SIMD flavour (which adds padding) which may be a good replacement for NumPy intrinsics. Highway also has its own set of vectorised Math routines we could bolt on. For this patch, I've integrated the VQSort algorithm only on AArch64 so as not to impact other architectures. The VQSort algorithms performance is comparable to the AVX512 algorithms, according to this report: https://github.com/Voultapher/sort-research-rs/blob/main/writeup/intel_avx512/text.md Performance improvements in the NumPy benchmark suite are as follows: ``` before after ratio [f4b52d53] [f08058d4] <main-setuppy> <highway-sort> + 53.0±0.08μs 221±0.4μs 4.18 bench_function_base.Sort.time_sort('quick', 'int64', ('ordered',)) + 84.8±0.3μs 222±0.3μs 2.62 bench_function_base.Sort.time_sort('quick', 'int64', ('reversed',)) + 89.0±0.2μs 197±0.2μs 2.21 bench_function_base.Sort.time_sort('quick', 'float64', ('ordered',)) + 51.5±0.01μs 80.0±0.1μs 1.55 bench_function_base.Sort.time_sort('quick', 'uint32', ('ordered',)) + 52.0±0.07μs 80.3±0.2μs 1.55 bench_function_base.Sort.time_sort('quick', 'int32', ('ordered',)) + 148±0.3μs 197±0.2μs 1.33 bench_function_base.Sort.time_sort('quick', 'float64', ('reversed',)) + 471±0.2μs 598±0.5μs 1.27 bench_function_base.Sort.time_argsort('heap', 'float64', ('ordered',)) + 536±0.7μs 644±0.8μs 1.20 bench_function_base.Sort.time_argsort('heap', 'float64', ('reversed',)) + 39.8±0.02μs 47.4±0.05μs 1.19 bench_function_base.Sort.time_argsort('merge', 'int32', ('sorted_block', 1000)) + 39.8±0.01μs 47.4±0.02μs 1.19 bench_function_base.Sort.time_argsort('merge', 'uint32', ('sorted_block', 1000)) + 40.1±0.01μs 47.6±0.04μs 1.19 bench_function_base.Sort.time_argsort('merge', 'int64', ('sorted_block', 1000)) + 27.9±0.03μs 32.6±0.03μs 1.17 bench_function_base.Sort.time_argsort('heap', 'uint32', ('uniform',)) + 685±0.5μs 796±0.8μs 1.16 bench_function_base.Sort.time_argsort('heap', 'float64', ('sorted_block', 10)) + 68.7±0.05μs 78.4±0.04μs 1.14 bench_function_base.Sort.time_argsort('merge', 'uint32', ('sorted_block', 100)) + 68.7±0.05μs 78.3±0.1μs 1.14 bench_function_base.Sort.time_argsort('merge', 'int32', ('sorted_block', 100)) + 69.4±0.03μs 78.8±0.05μs 1.14 bench_function_base.Sort.time_argsort('merge', 'int64', ('sorted_block', 100)) + 22.0±0.01μs 24.7±0.01μs 1.12 bench_function_base.Sort.time_sort('heap', 'uint32', ('uniform',)) + 674±0.7μs 743±0.4μs 1.10 bench_function_base.Sort.time_argsort('heap', 'float64', ('sorted_block', 1000)) + 730±0.6μs 794±0.8μs 1.09 bench_function_base.Sort.time_argsort('heap', 'float64', ('sorted_block', 100)) + 464±2μs 492±10μs 1.06 bench_function_base.Sort.time_argsort('heap', 'int32', ('reversed',)) + 95.5±0.1μs 101±0.1μs 1.06 bench_function_base.Sort.time_argsort('quick', 'float64', ('ordered',)) + 839±1μs 886±2μs 1.06 bench_function_base.Sort.time_argsort('heap', 'float64', ('random',)) + 95.7±0.1μs 101±0.05μs 1.06 bench_function_base.Sort.time_argsort('quick', 'float32', ('ordered',)) - 516±0.6μs 488±0.9μs 0.95 bench_function_base.Sort.time_sort('heap', 'float32', ('ordered',)) - 532±1μs 499±5μs 0.94 bench_function_base.Sort.time_argsort('heap', 'int16', ('reversed',)) - 34.5±0.05μs 32.2±0.03μs 0.93 bench_function_base.Sort.time_sort('merge', 'int32', ('sorted_block', 1000)) - 715±1μs 666±0.4μs 0.93 bench_function_base.Sort.time_sort('heap', 'float64', ('random',)) - 254±2μs 231±0.2μs 0.91 bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 1000)) - 655±0.5μs 592±0.3μs 0.90 bench_function_base.Sort.time_sort('heap', 'float64', ('sorted_block', 100)) - 24.7±0.02μs 22.0±0.07μs 0.89 bench_function_base.Sort.time_sort('heap', 'int32', ('uniform',)) - 621±0.7μs 549±0.3μs 0.88 bench_function_base.Sort.time_sort('heap', 'float64', ('sorted_block', 1000)) - 659±0.9μs 555±0.5μs 0.84 bench_function_base.Sort.time_sort('heap', 'float64', ('sorted_block', 10)) - 545±0.4μs 444±0.5μs 0.81 bench_function_base.Sort.time_sort('heap', 'float64', ('reversed',)) - 511±0.5μs 396±0.4μs 0.78 bench_function_base.Sort.time_sort('heap', 'float64', ('ordered',)) - 316±0.6μs 225±0.2μs 0.71 bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 10)) - 333±0.4μs 233±0.3μs 0.70 bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 100)) - 323±0.2μs 206±0.2μs 0.64 bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 1000)) - 134±0.9μs 85.0±0.07μs 0.64 bench_function_base.Sort.time_sort('quick', 'float32', ('reversed',)) - 376±0.9μs 228±0.3μs 0.61 bench_function_base.Sort.time_sort('quick', 'int64', ('random',)) - 418±0.4μs 207±0.4μs 0.50 bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 100)) - 413±2μs 199±0.2μs 0.48 bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 10)) - 64.4±0.3ms 28.6±0.3ms 0.44 bench_function_base.Sort.time_sort_worst - 501±2μs 204±0.3μs 0.41 bench_function_base.Sort.time_sort('quick', 'float64', ('random',)) - 248±1μs 86.9±0.2μs 0.35 bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 1000)) - 251±0.7μs 87.0±0.2μs 0.35 bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 1000)) - 329±0.9μs 91.4±0.1μs 0.28 bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 1000)) - 329±1μs 85.2±0.06μs 0.26 bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 100)) - 329±0.7μs 85.0±0.06μs 0.26 bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 100)) - 315±0.5μs 80.4±0.05μs 0.26 bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 10)) - 320±1μs 80.3±0.09μs 0.25 bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 10)) - 372±0.4μs 82.1±0.06μs 0.22 bench_function_base.Sort.time_sort('quick', 'int32', ('random',)) - 414±0.4μs 89.6±0.1μs 0.22 bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 100)) - 384±1μs 81.8±0.02μs 0.21 bench_function_base.Sort.time_sort('quick', 'uint32', ('random',)) - 415±0.3μs 84.8±0.2μs 0.20 bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 10)) - 491±2μs 86.3±0.1μs 0.18 bench_function_base.Sort.time_sort('quick', 'float32', ('random',)) - 111±0.4μs 13.0±0.01μs 0.12 bench_function_base.Sort.time_sort('quick', 'float64', ('uniform',)) - 50.9±0.03μs 4.93±0.01μs 0.10 bench_function_base.Sort.time_sort('quick', 'int64', ('uniform',)) - 108±0.3μs 7.25±0.02μs 0.07 bench_function_base.Sort.time_sort('quick', 'float32', ('uniform',)) - 51.7±0.05μs 3.43±0.01μs 0.07 bench_function_base.Sort.time_sort('quick', 'int32', ('uniform',)) - 51.3±0.07μs 3.35±0.06μs 0.07 bench_function_base.Sort.time_sort('quick', 'uint32', ('uniform',)) ```
1 parent 4af0bab commit e05a00c

8 files changed

+134
-19
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@
77
[submodule "numpy/core/src/npysort/x86-simd-sort"]
88
path = numpy/core/src/npysort/x86-simd-sort
99
url = https://github.com/intel/x86-simd-sort
10+
[submodule "numpy/core/src/highway"]
11+
path = numpy/core/src/highway
12+
url = https://github.com/google/highway.git

numpy/core/setup.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@
3434
# useful to avoid improperly requiring SVML when cross compiling.
3535
NPY_DISABLE_SVML = (os.environ.get('NPY_DISABLE_SVML', "0") == "1")
3636

37+
38+
# Set NPY_DISABLE_HIGHWAY=1 in the environment to disable the Highway
39+
# library. This option only has significance on a AArch64 and is most
40+
# useful to avoid improperly requiring Highway when cross compiling.
41+
NPY_DISABLE_HIGHWAY = (os.environ.get('NPY_DISABLE_HIGHWAY', "0") == "1")
42+
3743
# XXX: ugly, we use a class to avoid calling twice some expensive functions in
3844
# config.h/numpyconfig.h. I don't see a better way because distutils force
3945
# config.h generation inside an Extension class, and as such sharing
@@ -79,6 +85,14 @@ def can_link_svml():
7985
and "linux" in platform
8086
and sys.maxsize > 2**31)
8187

88+
def can_link_highway():
89+
"""Highway library is used only on AArch64 architecture
90+
"""
91+
if NPY_DISABLE_HIGHWAY:
92+
return False
93+
platform = sysconfig.get_platform()
94+
return ("arm64" in platform or "aarch64" in platform)
95+
8296
def check_git_submodules():
8397
out = os.popen("git submodule status")
8498
modules = out.readlines()
@@ -462,6 +476,9 @@ def generate_config_h(ext, build_dir):
462476

463477
if can_link_svml():
464478
moredefs.append(('NPY_CAN_LINK_SVML', 1))
479+
480+
if can_link_highway():
481+
moredefs.append(('NPY_CAN_LINK_HIGHWAY', 1))
465482

466483
# Use bogus stride debug aid to flush out bugs where users use
467484
# strides of dimensions with length 1 to index a full contiguous
@@ -907,6 +924,7 @@ def get_mathlib_info(*args):
907924
# links to the arm64 npymath library,
908925
# see gh-22673
909926
join('src', 'npymath', 'arm64_exports.c'),
927+
join('src', 'npysort', 'simd_argsort.dispatch.cpp'),
910928
join('src', 'npysort', 'simd_qsort.dispatch.cpp'),
911929
join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'),
912930
]
@@ -1010,6 +1028,45 @@ def generate_umath_doc_header(ext, build_dir):
10101028
# to make builds reproducible.
10111029
svml_objs.sort()
10121030

1031+
highway_root = join('numpy', 'core', 'src', 'highway')
1032+
highway_sort = join(highway_root, 'hwy', 'contrib', 'sort')
1033+
highway_sort_objs = []
1034+
if can_link_highway():
1035+
highway_sort_objs = [
1036+
join(highway_root, 'hwy', 'aligned_allocator.cc'),
1037+
join(highway_root, 'hwy', 'per_target.cc'),
1038+
join(highway_root, 'hwy', 'print.cc'),
1039+
join(highway_root, 'hwy', 'targets.cc'),
1040+
*[
1041+
join(highway_sort, sort_file)
1042+
for sort_file in [
1043+
"vqsort.cc",
1044+
"vqsort_128a.cc",
1045+
"vqsort_128d.cc",
1046+
"vqsort_f32a.cc",
1047+
"vqsort_f32d.cc",
1048+
"vqsort_f64a.cc",
1049+
"vqsort_f64d.cc",
1050+
"vqsort_i16a.cc",
1051+
"vqsort_i16d.cc",
1052+
"vqsort_i32a.cc",
1053+
"vqsort_i32d.cc",
1054+
"vqsort_i64a.cc",
1055+
"vqsort_i64d.cc",
1056+
"vqsort_kv64a.cc",
1057+
"vqsort_kv64d.cc",
1058+
"vqsort_kv128a.cc",
1059+
"vqsort_kv128d.cc",
1060+
"vqsort_u16a.cc",
1061+
"vqsort_u16d.cc",
1062+
"vqsort_u32a.cc",
1063+
"vqsort_u32d.cc",
1064+
"vqsort_u64a.cc",
1065+
"vqsort_u64d.cc",
1066+
]
1067+
]
1068+
]
1069+
10131070
config.add_extension('_multiarray_umath',
10141071
sources=multiarray_src + umath_src +
10151072
common_src +
@@ -1021,9 +1078,10 @@ def generate_umath_doc_header(ext, build_dir):
10211078
generate_umath_c,
10221079
generate_umath_doc_header,
10231080
generate_ufunc_api,
1024-
],
1081+
] + highway_sort_objs,
10251082
depends=deps + multiarray_deps + umath_deps +
10261083
common_deps,
1084+
include_dirs=[highway_root],
10271085
libraries=['npymath'],
10281086
extra_objects=svml_objs,
10291087
extra_info=extra_info)

numpy/core/src/highway

Submodule highway added at 7e95b0b

numpy/core/src/npysort/quicksort.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
9595
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
9696
}
9797
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
98-
#ifndef NPY_DISABLE_OPTIMIZATION
98+
#if !defined(NPY_DISABLE_OPTIMIZATION) && !(defined(NPY_HAVE_ASIMD) && !defined(NPY_CAN_LINK_HIGHWAY))
9999
#include "simd_qsort.dispatch.h"
100100
#endif
101101
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
@@ -113,7 +113,7 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num)
113113
using TF = typename np::meta::FixedWidth<T>::Type;
114114
void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr;
115115
#ifndef NPY_DISABLE_OPTIMIZATION
116-
#include "simd_qsort.dispatch.h"
116+
#include "simd_argsort.dispatch.h"
117117
#endif
118118
/* x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes
119119
* in 32-bit*/
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*@targets
2+
* $maxopt $keep_baseline
3+
* avx512_skx
4+
*/
5+
// policy $keep_baseline is used to avoid skip building avx512_skx
6+
// when its part of baseline features (--cpu-baseline), since
7+
// 'baseline' option isn't specified within targets.
8+
9+
#include "simd_qsort.hpp"
10+
11+
#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
12+
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
13+
#endif
14+
15+
namespace np { namespace qsort_simd {
16+
17+
#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
18+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
19+
{
20+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
21+
}
22+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
23+
{
24+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
25+
}
26+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
27+
{
28+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
29+
}
30+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
31+
{
32+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
33+
}
34+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
35+
{
36+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
37+
}
38+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
39+
{
40+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
41+
}
42+
#endif
43+
44+
}} // namespace np::simd

numpy/core/src/npysort/simd_qsort.dispatch.cpp

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
/*@targets
2-
* $maxopt $keep_baseline avx512_skx
2+
* $maxopt $keep_baseline
3+
* avx512_skx
4+
* asimd
35
*/
46
// policy $keep_baseline is used to avoid skip building avx512_skx
57
// when its part of baseline features (--cpu-baseline), since
@@ -10,7 +12,8 @@
1012
#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
1113
#include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
1214
#include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
13-
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
15+
#elif defined(NPY_HAVE_ASIMD)
16+
#include "hwy/contrib/sort/vqsort.h"
1417
#endif
1518

1619
namespace np { namespace qsort_simd {
@@ -40,30 +43,31 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
4043
{
4144
avx512_qsort(arr, size);
4245
}
43-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
46+
#elif defined(NPY_HAVE_ASIMD) && defined(NPY_CAN_LINK_HIGHWAY)
47+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
4448
{
45-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
49+
hwy::VQSort(arr, size, hwy::SortAscending());
4650
}
47-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
51+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
4852
{
49-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
53+
hwy::VQSort(arr, size, hwy::SortAscending());
5054
}
51-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
55+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
5256
{
53-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
57+
hwy::VQSort(arr, size, hwy::SortAscending());
5458
}
55-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
59+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
5660
{
57-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
61+
hwy::VQSort(arr, size, hwy::SortAscending());
5862
}
59-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
63+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
6064
{
61-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
65+
hwy::VQSort(arr, size, hwy::SortAscending());
6266
}
63-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
67+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
6468
{
65-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
69+
hwy::VQSort(arr, size, hwy::SortAscending());
6670
}
67-
#endif // NPY_HAVE_AVX512_SKX
71+
#endif
6872

6973
}} // namespace np::simd

numpy/core/src/npysort/simd_qsort.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ namespace np { namespace qsort_simd {
99
#include "simd_qsort.dispatch.h"
1010
#endif
1111
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
12+
13+
#ifndef NPY_DISABLE_OPTIMIZATION
14+
#include "simd_argsort.dispatch.h"
15+
#endif
1216
NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSort, (T *arr, npy_intp* arg, npy_intp size))
1317

1418
#ifndef NPY_DISABLE_OPTIMIZATION

numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*@targets
2-
* $maxopt $keep_baseline avx512_icl avx512_spr
2+
* $maxopt $keep_baseline
3+
* avx512_icl avx512_spr
34
*/
45
// policy $keep_baseline is used to avoid skip building avx512_skx
56
// when its part of baseline features (--cpu-baseline), since

0 commit comments

Comments
 (0)
0