8000 ENH: Vectorize np.sort and np.partition with AVX2 by r-devulap · Pull Request #25045 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Vectorize np.sort and np.partition with AVX2 #25045

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
a303128
Update x86-simd-sort to latest
Oct 26, 2023
9a7e109
avx512 qsort for fp16, fp32 and fp64 require an explicit flag to sort…
Oct 26, 2023
d4169da
Dispatch AVX2 qsort and partition
Oct 26, 2023
9f1faa1
Separate testing np.partition and np.argpartition
Nov 2, 2023
7c39472
update x86-simd-sort to latest
Nov 2, 2023
3b6643d
Disable test_shape_base.py::TestTakeAlongAxis::test_argequivalent for…
Nov 2, 2023
fb69b5b
Linter fixes
Nov 2, 2023
190e80e
Fix rebase bug in meson.build
Nov 27, 2023
c9588ca
Split highway and x86-simd-sort dispatch to separate files
Nov 27, 2023
0cb29a2
Update x86-simd-sort to latest
Nov 27, 2023
a3ca84b
Include highway/x86-simd-sort at compile time
Nov 30, 2023
9bde4ae
Remove distutils related dispatch code and create new namespace for h…
Dec 1, 2023
b28ed78
Add optional hasnan arguement to avx sorting
Dec 1, 2023
675cb07
Add x86_qsort/qselect functions inside anonymous namespace
Dec 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Split highway and x86-simd-sort dispatch to separate files
  • Loading branch information
Raghuveer Devulapalli committed Nov 30, 2023
commit c9588ca08f00d2085fc63b98683257b5b864c4d0
20 changes: 13 additions & 7 deletions numpy/_core/meson.build
8000
Original file line number Diff line number Diff line change
Expand Up @@ -764,23 +764,29 @@ endforeach
# -----------------------------------
foreach gen_mtargets : [
[
'simd_argsort.dispatch.h',
'src/npysort/simd_argsort.dispatch.cpp',
'x86_simd_argsort.dispatch.h',
'src/npysort/x86_simd_argsort.dispatch.cpp',
[AVX512_SKX]
],
[
'simd_qsort.dispatch.h',
'src/npysort/simd_qsort.dispatch.cpp',
'x86_simd_qsort.dispatch.h',
'src/npysort/x86_simd_qsort.dispatch.cpp',
[
AVX512_SKX, AVX2,
ASIMD,
]
],
[
'simd_qsort_16bit.dispatch.h',
'src/npysort/simd_qsort_16bit.dispatch.cpp',
'x86_simd_qsort_16bit.dispatch.h',
'src/npysort/x86_simd_qsort_16bit.dispatch.cpp',
[AVX512_SPR, AVX512_ICL]
],
[
'highway_qsort.dispatch.h',
'src/npysort/highway_qsort.dispatch.cpp',
[
ASIMD,
]
],
]
mtargets = mod_features.multi_targets(
gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
Expand Down
48 changes: 48 additions & 0 deletions numpy/_core/src/npysort/highway_qsort.dispatch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*@targets
* $maxopt $keep_baseline
* asimd
*/
// policy $keep_baseline is used to avoid skip building avx512_skx
// when its part of baseline features (--cpu-baseline), since
// 'baseline' option isn't specified within targets.

#include "highway_qsort.hpp"
#ifndef __CYGWIN__

#if NPY_HAVE_ASIMD
#define VQSORT_ONLY_STATIC 1
#include "hwy/contrib/sort/vqsort-inl.h"
#endif

namespace np { namespace qsort_simd {

#if NPY_HAVE_ASIMD
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
#endif // NPY_HAVE_ASIMD

}} // namespace np::simd

#endif // __CYGWIN__
16 changes: 16 additions & 0 deletions numpy/_core/src/npysort/highway_qsort.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef NUMPY_SRC_COMMON_NPYSORT_HWY_SIMD_QSORT_HPP
#define NUMPY_SRC_COMMON_NPYSORT_HWY_SIMD_QSORT_HPP

#include "common.hpp"

namespace np { namespace qsort_simd {

#ifndef NPY_DISABLE_OPTIMIZATION
#include "highway_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, npy_intp size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))

} } // np::qsort_simd

#endif // NUMPY_SRC_COMMON_NPYSORT_HWY_SIMD_QSORT_HPP
19 changes: 12 additions & 7 deletions numpy/_core/src/npysort/quicksort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@
#include "npysort_common.h"
#include "npysort_heapsort.h"
#include "numpy_tag.h"
#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"
#include "highway_qsort.hpp"

#include <cstdlib>
#include <utility>

#define NOT_USED NPY_UNUSED(unused)
#define DISABLE_HIGHWAY_OPTIMIZATION defined(__arm__)

/*
* pushing largest partition has upper bound of log2(n) space
Expand All @@ -81,18 +81,23 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
void (*dispfunc)(TF*, intptr_t) = nullptr;
if (sizeof(T) == sizeof(uint16_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort_16bit.dispatch.h"
#include "x86_simd_qsort_16bit.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
#if !DISABLE_HIGHWAY_OPTIMIZATION
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#include "x86_simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
// If not dispatched for avx-512 or avx2:
if (dispfunc == nullptr) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "highway_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
}
#endif
if (dispfunc) {
(*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num));
return true;
Expand All @@ -109,7 +114,7 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num)
using TF = typename np::meta::FixedWidth<T>::Type;
void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr;
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_argsort.dispatch.h"
#include "x86_simd_argsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSort, <TF>);
if (dispfunc) {
Expand Down
8 changes: 4 additions & 4 deletions numpy/_core/src/npysort/selection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include <array>
#include <cstdlib>
#include <utility>
#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"

#define NOT_USED NPY_UNUSED(unused)
#define DISABLE_HIGHWAY_OPTIMIZATION (defined(__arm__) || defined(__aarch64__))
Expand All @@ -45,14 +45,14 @@ inline bool quickselect_dispatch(T* v, npy_intp num, npy_intp kth)
void (*dispfunc)(TF*, npy_intp, npy_intp) = nullptr;
if constexpr (sizeof(T) == sizeof(uint16_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort_16bit.dispatch.h"
#include "x86_simd_qsort_16bit.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
}
#if !DISABLE_HIGHWAY_OPTIMIZATION
else if constexpr (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#include "x86_simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSelect, <TF>);
}
Expand All @@ -79,7 +79,7 @@ inline bool argquickselect_dispatch(T* v, npy_intp* arg, npy_intp num, npy_intp
(sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t))) {
using TF = typename np::meta::FixedWidth<T>::Type;
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_argsort.dispatch.h"
#include "x86_simd_argsort.dispatch.h"
#endif
void (*dispfunc)(TF*, npy_intp*, npy_intp, npy_intp) = nullptr;
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template ArgQSelect, <TF>);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
// when its part of baseline features (--cpu-baseline), since
// 'baseline' option isn't specified within targets.

#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SKX)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,21 @@
/*@targets
* $maxopt $keep_baseline
* avx512_skx avx2
* asimd
*/
// policy $keep_baseline is used to avoid skip building avx512_skx
// when its part of baseline features (--cpu-baseline), since
// 'baseline' option isn't specified within targets.

#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#define USE_HIGHWAY defined(__aarch64__)

#if defined(NPY_HAVE_AVX512_SKX)
#include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
#elif defined(NPY_HAVE_AVX2)
#include "x86-simd-sort/src/avx2-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx2-64bit-qsort.hpp"
#elif USE_HIGHWAY
#define VQSORT_ONLY_STATIC 1
#include "hwy/contrib/sort/vqsort-inl.h"
#endif

namespace np { namespace qsort_simd {
Expand Down Expand Up @@ -123,31 +117,6 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, npy_intp num)
avx2_qsort(arr, num, true);
#endif
}
#elif USE_HIGHWAY
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
#endif // NPY_HAVE_AVX512_SKX || NPY_HAVE_AVX2

}} // namespace np::simd
Expand Down
Original file line number Diff line number Diff line change
@@ -1,34 +1,28 @@
#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
#ifndef NUMPY_SRC_COMMON_NPYSORT_X86_SIMD_QSORT_HPP
#define NUMPY_SRC_COMMON_NPYSORT_X86_SIMD_QSORT_HPP

#include "common.hpp"

#define DISABLE_HIGHWAY_OPTIMIZATION defined(__arm__)

namespace np { namespace qsort_simd {

#if !DISABLE_HIGHWAY_OPTIMIZATION
#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort.dispatch.h"
#include "x86_simd_qsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, npy_intp size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))
#endif // DISABLE_HIGHWAY_OPTIMIZATION

#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_argsort.dispatch.h"
#include "x86_simd_argsort.dispatch.h"
#endif
NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSort, (T *arr, npy_intp* arg, npy_intp size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSelect, (T *arr, npy_intp* arg, npy_intp kth, npy_intp size))

#ifndef NPY_DISABLE_OPTIMIZATION
#include "simd_qsort_16bit.dispatch.h"
#include "x86_simd_qsort_16bit.dispatch.h"
#endif
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, npy_intp size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))

} } // np::qsort_simd

#undef DISABLE_HIGHWAY_OPTIMIZATION

#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
#endif // NUMPY_SRC_COMMON_NPYSORT_X86_SIMD_QSORT_HPP
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// when its part of baseline features (--cpu-baseline), since
// 'baseline' option isn't specified within targets.

#include "simd_qsort.hpp"
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SPR)
Expand Down
0