ENH: Enable SVE detection for Highway VQSort · numpy/numpy@0d56dbb · GitHub
[go: up one dir, main page]

Skip to content

Commit 0d56dbb

Browse files
committed
ENH: Enable SVE detection for Highway VQSort
Leveraging the meson infrastructure to selectively enable SVE specifically for Highway, which already supports SVE.

```
| Change | Before [94bc564] <main> | After [1ffedb85] <sve-sort> | Ratio | Benchmark (Parameter) |
|----------|----------------------------|-------------------------------|---------|----------------------------------------------------------------------------------|
| + | 551±0.8μs | 654±5μs | 1.19 | bench_function_base.Sort.time_sort('merge', 'float32', ('random',)) |
| + | 72.9±0.1μs | 80.5±0.08μs | 1.11 | bench_function_base.Sort.time_argsort('quick', 'float32', ('ordered',)) |
| + | 553±1μs | 606±2μs | 1.1 | bench_function_base.Sort.time_sort('merge', 'float64', ('random',)) |
| + | 41.9±0.3μs | 45.6±0.04μs | 1.09 | bench_function_base.Sort.time_argsort('merge', 'int32', ('sorted_block', 1000)) |
| + | 493±1μs | 532±2μs | 1.08 | bench_function_base.Sort.time_sort('merge', 'float16', ('random',)) |
| + | 73.7±0.2μs | 78.8±0.04μs | 1.07 | bench_function_base.Sort.time_argsort('merge', 'int32', ('sorted_block', 100)) |
| + | 565±1μs | 600±1μs | 1.06 | bench_function_base.Sort.time_argsort('heap', 'float64', ('ordered',)) |
| + | 465±1μs | 491±1μs | 1.06 | bench_function_base.Sort.time_argsort('heap', 'int32', ('reversed',)) |
| + | 696±3μs | 739±2μs | 1.06 | bench_function_base.Sort.time_sort('heap', 'float16', ('random',)) |
| + | 645±3μs | 684±3μs | 1.06 | bench_function_base.Sort.time_sort('heap', 'float16', ('sorted_block', 10)) |
| + | 651±1μs | 691±1μs | 1.06 | bench_function_base.Sort.time_sort('heap', 'float16', ('sorted_block', 100)) |
| + | 627±3μs | 665±1μs | 1.06 | bench_function_base.Sort.time_sort('heap', 'float16', ('sorted_block', 1000)) |
| + | 467±1μs | 494±0.7μs | 1.06 | bench_function_base.Sort.time_sort('heap', 'float32', ('ordered',)) |
| + | 166±0.1μs | 174±1μs | 1.05 | bench_function_base.Sort.time_sort('merge', 'float32', ('sorted_block', 10)) |
| - | 77.4±0.08μs | 73.2±0.2μs | 0.95 | bench_function_base.Sort.time_argsort('merge', 'uint32', ('sorted_block', 100)) |
| - | 379±1μs | 359±0.1μs | 0.95 | bench_function_base.Sort.time_sort('heap', 'int32', ('ordered',)) |
| - | 341±0.5μs | 324±0.5μs | 0.95 | bench_function_base.Sort.time_sort('quick', 'float16', ('sorted_block', 1000)) |
| - | 590±0.5μs | 554±1μs | 0.94 | bench_function_base.Sort.time_argsort('heap', 'float32', ('ordered',)) |
| - | 239±5μs | 226±0.6μs | 0.94 | bench_function_base.Sort.time_argsort('merge', 'float16', ('sorted_block', 10)) |
| - | 195±2μs | 184±1μs | 0.94 | bench_function_base.Sort.time_argsort('merge', 'float32', ('sorted_block', 10)) |
| - | 692±2μs | 637±2μs | 0.92 | bench_function_base.Sort.time_argsort('merge', 'float32', ('random',)) |
| - | 45.5±0.03μs | 42.0±0.2μs | 0.92 | bench_function_base.Sort.time_argsort('merge', 'uint32', ('sorted_block', 1000)) |
| - | 80.5±0.07μs | 73.0±0.1μs | 0.91 | bench_function_base.Sort.time_argsort('quick', 'float64', ('ordered',)) |
| - | 78.9±0.2μs | 71.7±0.2μs | 0.91 | bench_function_base.Sort.time_sort('quick', 'uint32', ('ordered',)) |
| - | 79.2±0.1μs | 72.1±0.2μs | 0.91 | bench_function_base.Sort.time_sort('quick', 'uint32', ('reversed',)) |
| - | 131±2μs | 118±0.8μs | 0.9 | bench_function_base.Sort.time_sort('merge', 'float16', ('sorted_block', 100)) |
| - | 82.8±0.2μs | 73.7±0.3μs | 0.89 | bench_function_base.Sort.time_sort('quick', 'float32', ('ordered',)) |
| - | 83.4±0.07μs | 74.1±0.2μs | 0.89 | bench_function_base.Sort.time_sort('quick', 'float32', ('reversed',)) |
| - | 78.6±0.2μs | 70.3±0.2μs | 0.89 | bench_function_base.Sort.time_sort('quick', 'int32', ('ordered',)) |
| - | 79.2±0.09μs | 70.8±0.08μs | 0.89 | bench_function_base.Sort.time_sort('quick', 'int32', ('reversed',)) |
| - | 3.22±0.02μs | 2.86±0μs | 0.89 | bench_function_base.Sort.time_sort('quick', 'uint32', ('uniform',)) |
| - | 3.26±0.04μs | 2.84±0μs | 0.87 | bench_function_base.Sort.time_sort('quick', 'int32', ('uniform',)) |
| - | 82.6±0.06μs | 71.1±0.08μs | 0.86 | bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 10)) |
| - | 4.91±0.01μs | 4.22±0μs | 0.86 | bench_function_base.Sort.time_sort('quick', 'int64', ('uniform',)) |
| - | 79.0±0.2μs | 66.8±0.05μs | 0.85 | bench_function_base.Sort.time_sort('merge', 'float16', ('sorted_block', 1000)) |
| - | 78.8±0.05μs | 67.0±0.2μs | 0.85 | bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 10)) |
| - | 84.2±0.07μs | 70.8±0.1μs | 0.84 | bench_function_base.Sort.time_sort('quick', 'float32', ('random',)) |
| - | 89.4±0.1μs | 75.5±0.05μs | 0.84 | bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 1000)) |
| - | 78.9±0.04μs | 65.9±0.1μs | 0.84 | bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 10)) |
| - | 85.4±0.06μs | 71.9±0.05μs | 0.84 | bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 1000)) |
| - | 85.3±0.08μs | 70.5±0.1μs | 0.83 | bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 1000)) |
| - | 80.5±0.03μs | 66.4±0.1μs | 0.83 | bench_function_base.Sort.time_sort('quick', 'uint32', ('random',)) |
| - | 87.5±0.05μs | 71.6±0.1μs | 0.82 | bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 100)) |
| - | 80.4±0.05μs | 65.4±0.07μs | 0.81 | bench_function_base.Sort.time_sort('quick', 'int32', ('random',)) |
| - | 83.6±0.05μs | 66.9±0.1μs | 0.8 | bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 100)) |
| - | 83.5±0.05μs | 65.8±0.08μs | 0.79 | bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 100)) |
| - | 6.87±0.01μs | 5.13±0.08μs | 0.75 | bench_function_base.Sort.time_sort('quick', 'float32', ('uniform',)) |
| - | 12.2±0.02μs | 8.79±0.1μs | 0.72 | bench_function_base.Sort.time_sort('quick', 'float64', ('uniform',)) |
| - | 193±0.5μs | 124±0.5μs | 0.65 | bench_function_base.Sort.time_sort('quick', 'float64', ('reversed',)) |
| - | 27.7±0.2ms | 18.0±0.2ms | 0.65 | bench_function_base.Sort.time_sort_worst |
| - | 192±0.4μs | 123±0.2μs | 0.64 | bench_function_base.Sort.time_sort('quick', 'float64', ('ordered',)) |
| - | 202±0.2μs | 128±0.04μs | 0.63 | bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 1000)) |
| - | 203±0.5μs | 125±0.09μs | 0.62 | bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 100)) |
| - | 199±0.4μs | 122±0.07μs | 0.61 | bench_function_base.Sort.time_sort('quick', 'float64', ('random',)) |
| - | 195±0.4μs | 120±0.09μs | 0.61 | bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 10)) |
| - | 215±0.3μs | 121±0.3μs | 0.56 | bench_function_base.Sort.time_sort('quick', 'int64', ('ordered',)) |
| - | 216±0.3μs | 121±0.7μs | 0.56 | bench_function_base.Sort.time_sort('quick', 'int64', ('reversed',)) |
| - | 225±0.3μs | 126±0.3μs | 0.56 | bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 1000)) |
| - | 223±0.2μs | 119±0.09μs | 0.54 | bench_function_base.Sort.time_sort('quick', 'int64', ('random',)) |
| - | 219±0.06μs | 118±0.08μs | 0.54 | bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 10)) |
| - | 227±0.3μs | 123±0.2μs | 0.54 | bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 100)) |
```
1 parent 94bc564 commit 0d56dbb

File tree

6 files changed

+27
-3
lines changed

6 files changed

+27
-3
lines changed

meson_cpu/arm/meson.build

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,15 @@ ASIMDFHM = mod_features.new(
5151
args: {'val': '-march=armv8.2-a+fp16fml', 'match': '-march=.*', 'mfilter': '\+.*'},
5252
test_code: files(source_root + '/numpy/distutils/checks/cpu_asimdfhm.c')[0]
5353
)
54+
## Scalable Vector Extensions (SVE)
55+
SVE = mod_features.new(
56+
'SVE', 8,
57+
args: {'val': '-march=armv8.2-a+sve', 'match': '-march=.*', 'mfilter': '\+.*'},
58+
test_code: files(source_root + '/numpy/distutils/checks/cpu_sve.c')[0]
59+
)
5460
# TODO: Add support for MSVC
5561
ARM_FEATURES = {
5662
'NEON': NEON, 'NEON_FP16': NEON_FP16, 'NEON_VFPV4': NEON_VFPV4,
57-
'ASIMD': ASIMD, 'ASIMDHP': ASIMDHP, 'ASIMDFHM': ASIMDFHM
63+
'ASIMD': ASIMD, 'ASIMDHP': ASIMDHP, 'ASIMDFHM': ASIMDFHM,
64+
'SVE': SVE
5865
}

numpy/_core/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,7 @@ foreach gen_mtargets : [
766766
[
767767
'simd_qsort.dispatch.h',
768768
'src/npysort/simd_qsort.dispatch.cpp',
769-
[AVX512_SKX, ASIMD]
769+
[AVX512_SKX, SVE, ASIMD]
770770
],
771771
[
772772
'simd_qsort_16bit.dispatch.h',

numpy/_core/src/common/npy_cpu_features.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,8 @@ static struct {
118118
{NPY_CPU_FEATURE_FPHP, "FPHP"},
119119
{NPY_CPU_FEATURE_ASIMDHP, "ASIMDHP"},
120120
{NPY_CPU_FEATURE_ASIMDDP, "ASIMDDP"},
121-
{NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"}};
121+
{NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"},
122+
{NPY_CPU_FEATURE_SVE, "SVE"}};
122123

123124

124125
NPY_VISIBILITY_HIDDEN PyObject *
@@ -760,6 +761,7 @@ npy__cpu_init_features_linux(void)
760761
npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = (hwcap & NPY__HWCAP_ASIMDHP) != 0;
761762
npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = (hwcap & NPY__HWCAP_ASIMDDP) != 0;
762763
npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = (hwcap & NPY__HWCAP_ASIMDFHM) != 0;
764+
npy__cpu_have[NPY_CPU_FEATURE_SVE] = (hwcap & NPY__HWCAP_SVE) != 0;
763765
npy__cpu_init_features_arm8();
764766
} else {
765767
npy__cpu_have[NPY_CPU_FEATURE_NEON] = (hwcap & NPY__HWCAP_NEON) != 0;
@@ -794,6 +796,9 @@ npy__cpu_init_features(void)
794796
#if defined(NPY_HAVE_ASIMDFHM) || defined(__ARM_FEATURE_FP16FML)
795797
npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = 1;
796798
#endif
799+
#if defined(NPY_HAVE_SVE) || defined(__ARM_FEATURE_SVE)
800+
npy__cpu_have[NPY_CPU_FEATURE_SVE] = 1;
801+
#endif
797802
npy__cpu_init_features_arm8();
798803
#else
799804
#if defined(NPY_HAVE_NEON) || defined(__ARM_NEON__)

numpy/_core/src/common/npy_cpu_features.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ enum npy_cpu_features
8686
NPY_CPU_FEATURE_ASIMDDP = 306,
8787
// ARMv8.2 single&half-precision multiply
8888
NPY_CPU_FEATURE_ASIMDFHM = 307,
89+
// Scalable Vector Extensions (SVE)
90+
NPY_CPU_FEATURE_SVE = 308,
8991

9092
// IBM/ZARCH
9193
NPY_CPU_FEATURE_VX = 350,

numpy/_core/src/common/npy_cpuinfo_parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#define NPY__HWCAP_FPHP (1 << 9)
5353
#define NPY__HWCAP_ASIMDHP (1 << 10)
5454
#define NPY__HWCAP_ASIMDDP (1 << 20)
55+
#define NPY__HWCAP_SVE (1 << 22)
5556
#define NPY__HWCAP_ASIMDFHM (1 << 23)
5657
/*
5758
* Get the size of a file by reading it until the end. This is needed

numpy/distutils/checks/cpu_sve.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#include <arm_sve.h>
2+
3+
int main(void)
4+
{
5+
svbool_t p = svptrue_b64();
6+
svint64_t a = svdup_s64(1);
7+
svint64_t b = svdup_s64(2);
8+
return svaddv(p, svmla_z(p, a, a, b));
9+
}

0 commit comments

Comments
 (0)
0