8000 ENH: Vectorize np.sort and np.partition with AVX2 by r-devulap · Pull Request #25045 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Vectorize np.sort and np.partition with AVX2 #25045

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions numpy/_core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -764,19 +764,28 @@ endforeach
# -----------------------------------
foreach gen_mtargets : [
[
'simd_qsort.dispatch.h',
'src/npysort/simd_qsort.dispatch.cpp',
[AVX512_SKX, ASIMD]
'x86_simd_argsort.dispatch.h',
'src/npysort/x86_simd_argsort.dispatch.cpp',
[AVX512_SKX]
],
[
'x86_simd_qsort.dispatch.h',
'src/npysort/x86_simd_qsort.dispatch.cpp',
[
AVX512_SKX, AVX2,
]
],
[
'simd_qsort_16bit.dispatch.h',
'src/npysort/simd_qsort_16bit.dispatch.cpp',
'x86_simd_qsort_16bit.dispatch.h',
'src/npysort/x86_simd_qsort_16bit.dispatch.cpp',
[AVX512_SPR, AVX512_ICL]
],
[
'simd_argsort.dispatch.h',
'src/npysort/simd_argsort.dispatch.cpp',
[AVX512_SKX]
'highway_qsort.dispatch.h',
'src/npysort/highway_qsort.dispatch.cpp',
[
ASIMD,
]
],
]
mtargets = mod_features.multi_targets(
Expand Down
32 changes: 32 additions & 0 deletions numpy/_core/src/npysort/highway_qsort.dispatch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "highway_qsort.hpp"
#define VQSORT_ONLY_STATIC 1
#include "hwy/contrib/sort/vqsort-inl.h"

namespace np { namespace highway { namespace qsort_simd {

template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}

} } } // np::highway::qsort_simd
16 changes: 16 additions & 0 deletions numpy/_core/src/npysort/highway_qsort.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef NUMPY_SRC_COMMON_NPYSORT_HWY_SIMD_QSORT_HPP
#define NUMPY_SRC_COMMON_NPYSORT_HWY_SIMD_QSORT_HPP

#include "common.hpp"

namespace np { namespace highway { namespace qsort_simd {

#ifndef NPY_DISABLE_OPTIMIZATION
#include "highway_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, npy_intp size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))

} } } // np::highway::qsort_simd

#endif // NUMPY_SRC_COMMON_NPYSORT_HWY_SIMD_QSORT_HPP
25 changes: 12 additions & 13 deletions numpy/_core/src/npysort/quicksort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@
#include "npysort_common.h"
#include "npysort_heapsort.h"
#include "numpy_tag.h"
#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"
#include "highway_qsort.hpp"

#include <cstdlib>
#include <utility>

#define NOT_USED NPY_UNUSED(unused)
#define DISABLE_HIGHWAY_OPTIMIZATION defined(__arm__)

/*
* pushing largest partition has upper bound of log2(n) space
Expand All @@ -81,18 +81,21 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
void (*dispfunc)(TF*, intptr_t) = nullptr;
if (sizeof(T) == sizeof(uint16_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort_16bit.dispatch.h"
#include "x86_simd_qsort_16bit.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
#if !DISABLE_HIGHWAY_OPTIMIZATION
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#if defined(NPY_CPU_AMD64) || defined(NPY_CPU_X86) // x86 32-bit and 64-bit
#include "x86_simd_qsort.dispatch.h"
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
#else
#include "highway_qsort.dispatch.h"
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::highway::qsort_simd::template QSort, <TF>);
#endif
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
#endif
if (dispfunc) {
(*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num));
return true;
Expand All @@ -109,13 +112,9 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num)
using TF = typename np::meta::FixedWidth<T>::Type;
void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr;
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_argsort.dispatch.h"
#include "x86_simd_argsort.dispatch.h"
#endif
6D47 /* x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes
* in 32-bit*/
if (sizeof(npy_intp) == sizeof(int64_t)) {
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSort, <TF>);
}
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSort, <TF>);
if (dispfunc) {
(*dispfunc)(reinterpret_cast<TF*>(start), arg, num);
return true;
Expand Down
23 changes: 6 additions & 17 deletions numpy/_core/src/npysort/selection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include <array>
#include <cstdlib>
#include <utility>
#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"

#define NOT_USED NPY_UNUSED(unused)
#define DISABLE_HIGHWAY_OPTIMIZATION (defined(__arm__) || defined(__aarch64__))
Expand All @@ -37,33 +37,24 @@ inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth)
/*
* Only defined for int16_t, uint16_t, float16, int32_t, uint32_t, float32,
* int64_t, uint64_t, double
* TODO: Enable quickselect for 32-bit. np.argpartition is only vectorized
* for 64-bit when npy_intp is 8 bytes, which means np.partition and
* np.argpartition will produces different results on 32-bit systems.
* Several tests in test_multiarray.py rely on these results being
* identical. We should get rid of this constraint and re-write
* these tests once we enable this for 32-bit.
*/
if constexpr (
(std::is_integral_v<T> || std::is_floating_point_v<T> || std::is_same_v<T, np::Half>) &&
(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) &&
sizeof(npy_intp) == sizeof(int64_t)) {
(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t))) {
using TF = typename np::meta::FixedWidth<T>::Type;
void (*dispfunc)(TF*, npy_intp, npy_intp) = nullptr;
if constexpr (sizeof(T) == sizeof(uint16_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort_16bit.dispatch.h"
#include "x86_simd_qsort_16bit.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
}
#if !DISABLE_HIGHWAY_OPTIMIZATION
else if constexpr (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#include "x86_simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
}
#endif
if (dispfunc) {
(*dispfunc)(reinterpret_cast<TF*>(v), num, kth);
return true;
Expand All @@ -83,12 +74,10 @@ inline bool argquickselect_dispatch(T* v, npy_intp* arg, npy_intp num, npy_intp
*/
if constexpr (
(std::is_integral_v<T> || std::is_floating_point_v<T>) &&
(sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) &&
// x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes in 32-bit
sizeof(npy_intp) == sizeof(int64_t)) {
(sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t))) {
using TF = typename np::meta::FixedWidth<T>::Type;
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_argsort.dispatch.h"
#include "x86_simd_argsort.dispatch.h"
#endif
void (*dispfunc)(TF*, npy_intp*, npy_intp, npy_intp) = nullptr;
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSelect, <TF>);
Expand Down
103 changes: 0 additions & 103 deletions numpy/_core/src/npysort/simd_qsort.dispatch.cpp

This file was deleted.

2 changes: 1 addition & 1 deletion numpy/_core/src/npysort/x86-simd-sort
Submodule x86-simd-sort updated 77 files
+10 −5 .github/workflows/build-numpy.yml
+99 −54 .github/workflows/c-cpp.yml
+6 −79 Makefile
+63 −147 README.md
+2 −2 _clang-format
+3 −1 benchmarks/bench-all.cpp
+20 −59 benchmarks/bench-argsort.hpp
+48 −0 benchmarks/bench-keyvalue.hpp
+28 −67 benchmarks/bench-partial-qsort.hpp
+27 −38 benchmarks/bench-qselect.hpp
+0 −40 benchmarks/bench-qsort-common.h
+20 −61 benchmarks/bench-qsort.hpp
+0 −201 benchmarks/bench-qsortfp16.cpp
+0 −28 benchmarks/bench-tgl.out
+29 −0 benchmarks/bench-vqsort.cpp
+56 −0 benchmarks/bench.h
+16 −13 benchmarks/meson.build
+30 −0 examples/Makefile
+10 −0 examples/avx2-32bit-qsort.cpp
+10 −0 examples/avx512-16bit-qsort.cpp
+10 −0 examples/avx512-32bit-qsort.cpp
+10 −0 examples/avx512-64bit-qsort.cpp
+9 −0 examples/avx512-argsort.cpp
+18 −0 examples/avx512-kv.cpp
+10 −0 examples/avx512fp-16bit-qsort.cpp
+45 −0 lib/list-of-exported-symbols.txt
+47 −0 lib/meson.build
+32 −0 lib/x86simdsort-avx2.cpp
+38 −0 lib/x86simdsort-icl.cpp
+81 −0 lib/x86simdsort-internal.h
+90 −0 lib/x86simdsort-scalar.h
+62 −0 lib/x86simdsort-skx.cpp
+23 −0 lib/x86simdsort-spr.cpp
+209 −0 lib/x86simdsort.cpp
+43 −0 lib/x86simdsort.h
+39 −10 meson.build
+4 −0 meson_options.txt
+20 −12 run-bench.py
+1 −1 scripts/bench-compare.sh
+8 −7 scripts/branch-compare.sh
+167 −0 src/README.md
+597 −0 src/avx2-32bit-qsort.hpp
+587 −0 src/avx2-64bit-qsort.hpp
+289 −0 src/avx2-emu-funcs.hpp
+106 −227 src/avx512-16bit-common.h
+215 −116 src/avx512-16bit-qsort.hpp
+266 −428 src/avx512-32bit-qsort.hpp
+398 −85 src/avx512-64bit-argsort.hpp
+493 −184 src/avx512-64bit-common.h
+0 −457 src/avx512-64bit-keyvalue-networks.hpp
+219 −35 src/avx512-64bit-keyvaluesort.hpp
+1 −823 src/avx512-64bit-qsort.hpp
+0 −294 src/avx512-common-argsort.h
+0 −753 src/avx512-common-qsort.h
+89 −46 src/avx512fp16-16bit-qsort.hpp
+81 −0 src/xss-common-includes.h
+622 −0 src/xss-common-qsort.h
+497 −0 src/xss-network-keyvaluesort.hpp
+209 −0 src/xss-network-qsort.hpp
+320 −0 src/xss-optimal-networks.hpp
+63 −0 src/xss-pivot-selection.hpp
+18 −27 tests/meson.build
+0 −49 tests/test-argselect.hpp
+0 −81 tests/test-argsort-common.h
+0 −9 tests/test-argsort.cpp
+0 −272 tests/test-argsort.hpp
+49 −69 tests/test-keyvalue.cpp
+0 −51 tests/test-partial-qsort.hpp
+0 −98 tests/test-qselect.hpp
+95 −3 tests/test-qsort-common.h
+0 −49 tests/test-qsort-fp.hpp
+168 −10 tests/test-qsort.cpp
+0 −172 tests/test-qsort.hpp
+0 −161 tests/test-qsortfp16.cpp
+44 −0 utils/custom-compare.h
+91 −0 utils/custom-float.h
+124 −28 utils/rand_array.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,4 @@
/*@targets
* $maxopt $keep_baseline
* avx512_skx
*/
// policy $keep_baseline is used to avoid skip building avx512_skx
// when its part of baseline features (--cpu-baseline), since
// 'baseline' option isn't specified within targets.

#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SKX)
Expand All @@ -15,56 +7,57 @@

namespace np { namespace qsort_simd {

/* arg methods currently only have AVX-512 versions */
#if defined(NPY_HAVE_AVX512_SKX)
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
avx512_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
avx512_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
avx512_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
avx512_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
avx512_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num, true);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
avx512_argselect(arr, reinterpret_cast<int64_t*>(arg), kth, num);
avx512_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num, true);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
avx512_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
avx512_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
avx512_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
avx512_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
avx512_argsort(arr, reinterpret_cast<size_t*>(arg), size, true);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
{
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
avx512_argsort(arr, reinterpret_cast<size_t*>(arg), size, true);
}
#endif
#endif // NPY_HAVE_AVX512_SKX

}} // namespace np::simd

Expand Down
Loading
0