From 130cac57681a5a2f125392c726c19b8bb068b37d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 31 Mar 2025 09:21:57 -0700 Subject: [PATCH 01/10] Update x86-simd-sort module to latest Pulls in 2 major changes: (1) Fixes a performance regression on 16-bit dtype sorting (see https://github.com/intel/x86-simd-sort/pull/190) (2) Adds openmp support for quicksort which speeds up sorting arrays > 100,000 by up to 3x. See: https://github.com/intel/x86-simd-sort/pull/179 --- numpy/_core/src/npysort/x86-simd-sort | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/npysort/x86-simd-sort b/numpy/_core/src/npysort/x86-simd-sort index 9a1b616d5cd4..b12fe4d04475 160000 --- a/numpy/_core/src/npysort/x86-simd-sort +++ b/numpy/_core/src/npysort/x86-simd-sort @@ -1 +1 @@ -Subproject commit 9a1b616d5cd4eaf49f7664fb86ccc1d18bad2b8d +Subproject commit b12fe4d0447597405379e040144b8bb6747f6683 From 7fd938b91d56465a77394ba0a318b675f42c471b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 31 Mar 2025 13:13:55 -0700 Subject: [PATCH 02/10] BLD: Add openmp flags to build x86-simd-sort Also adds a simple unit test to stress the openmp code paths --- numpy/_core/meson.build | 17 +++++++++++++++-- numpy/_core/tests/test_multiarray.py | 6 ++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 3bffed752474..e9093dcc3bb2 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -128,6 +128,16 @@ if use_intel_sort and not fs.exists('src/npysort/x86-simd-sort/README.md') error('Missing the `x86-simd-sort` git submodule! 
Run `git submodule update --init` to fix this.') endif +# Setup openmp flags for x86-simd-sort: +omp_cflags = [] +omp = [] +if use_intel_sort and cpp.has_argument('-fopenmp') + omp = dependency('openmp', required : false) + if omp.found() + omp_cflags = ['-fopenmp', '-DXSS_USE_OPENMP'] + endif +endif + if not fs.exists('src/common/pythoncapi-compat') error('Missing the `pythoncapi-compat` git submodule! ' + 'Run `git submodule update --init` to fix this.') @@ -867,6 +877,9 @@ foreach gen_mtargets : [ ] : [] ], ] + + + mtargets = mod_features.multi_targets( gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1], dispatch: gen_mtargets[2], @@ -874,7 +887,7 @@ foreach gen_mtargets : [ prefix: 'NPY_', dependencies: [py_dep, np_core_dep], c_args: c_args_common + max_opt, - cpp_args: cpp_args_common + max_opt, + cpp_args: cpp_args_common + max_opt + omp_cflags, include_directories: [ 'include', 'src/common', @@ -1286,7 +1299,7 @@ py.extension_module('_multiarray_umath', 'src/umath', 'src/highway' ], - dependencies: [blas_dep], + dependencies: [blas_dep, omp], link_with: [ npymath_lib, unique_hash_so, diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py index 0a62cb6945f0..089946ccffb1 100644 --- a/numpy/_core/tests/test_multiarray.py +++ b/numpy/_core/tests/test_multiarray.py @@ -10292,6 +10292,12 @@ def test_argsort_int(N, dtype): arr[N - 1] = maxv assert_arg_sorted(arr, np.argsort(arr, kind='quick')) +# Test large arrays that leverage openMP implementations from x86-simd-sort: +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +def test_sort_largearrays(dtype): + N = 1000000 + arr = np.random.rand(N) + assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") def test_gh_22683(): From f8cfa4ea676357a3ac7d29db6aa3b0636199721d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 10 Apr 2025 21:09:50 -0700 Subject: [PATCH 
03/10] ENH: Update x86-simd-sort to port openmp support for argsort --- numpy/_core/src/npysort/x86-simd-sort | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/npysort/x86-simd-sort b/numpy/_core/src/npysort/x86-simd-sort index b12fe4d04475..14f504c35217 160000 --- a/numpy/_core/src/npysort/x86-simd-sort +++ b/numpy/_core/src/npysort/x86-simd-sort @@ -1 +1 @@ -Subproject commit b12fe4d0447597405379e040144b8bb6747f6683 +Subproject commit 14f504c35217f0b77e239c2a0c431a55d7f31337 From 02c47281386f7e54554ce5d3854070b2c70ad21b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 14 Apr 2025 14:43:38 -0700 Subject: [PATCH 04/10] Add meson option to toggle building with openMP --- meson.options | 2 ++ numpy/_core/meson.build | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/meson.options b/meson.options index 1be05d324756..b09992fe9b91 100644 --- a/meson.options +++ b/meson.options @@ -22,6 +22,8 @@ option('disable-intel-sort', type: 'boolean', value: false, description: 'Disables SIMD-optimized operations related to Intel x86-simd-sort') option('disable-threading', type: 'boolean', value: false, description: 'Disable threading support (see `NPY_ALLOW_THREADS` docs)') +option('enable-openmp', type: 'boolean', value: false, + description: 'Enable building NumPy with openmp support') option('disable-optimization', type: 'boolean', value: false, description: 'Disable CPU optimized code (dispatch,simd,unroll...)') option('cpu-baseline', type: 'string', value: 'min', diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index e9093dcc3bb2..0ea85912658d 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -128,10 +128,15 @@ if use_intel_sort and not fs.exists('src/npysort/x86-simd-sort/README.md') error('Missing the `x86-simd-sort` git submodule! 
Run `git submodule update --init` to fix this.') endif +use_openmp = not get_option('disable-threading') and get_option('enable-openmp') and cpp.has_argument('-fopenmp') +summary({ + 'Build with openMP? ' : use_openmp, +}) + # Setup openmp flags for x86-simd-sort: omp_cflags = [] omp = [] -if use_intel_sort and cpp.has_argument('-fopenmp') +if use_intel_sort and use_openmp omp = dependency('openmp', required : false) if omp.found() omp_cflags = ['-fopenmp', '-DXSS_USE_OPENMP'] From 21bc19f1349f0bff978cac6dde80ea4db6374f83 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 14 Apr 2025 15:25:06 -0700 Subject: [PATCH 05/10] TST: Add np.argsort test for openmp paths --- numpy/_core/tests/test_multiarray.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py index 089946ccffb1..d58471247682 100644 --- a/numpy/_core/tests/test_multiarray.py +++ b/numpy/_core/tests/test_multiarray.py @@ -10296,9 +10296,18 @@ def test_argsort_int(N, dtype): @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) def test_sort_largearrays(dtype): N = 1000000 - arr = np.random.rand(N) + rnd = np.random.RandomState(1100710816) + arr = -0.5 + rnd.random(N).astype(dtype) assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap')) +# Test large arrays that leverage openMP implementations from x86-simd-sort: +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_argsort_largearrays(dtype): + N = 1000000 + rnd = np.random.RandomState(1100710816) + arr = -0.5 + rnd.random(N).astype(dtype) + assert_arg_sorted(arr, np.argsort(arr, kind='quick')) + @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") def test_gh_22683(): b = 777.68760986 From e425de88aa7080ca39b9441184645de97ccd23b8 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 14 Apr 2025 15:27:51 -0700 Subject: [PATCH 06/10] CI: Add openmp flags to test openMP 
code paths --- .github/workflows/linux_simd.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux_simd.yml b/.github/workflows/linux_simd.yml index 4cd87ab37dd4..a9f065e25cc0 100644 --- a/.github/workflows/linux_simd.yml +++ b/.github/workflows/linux_simd.yml @@ -212,7 +212,7 @@ jobs: python -m pip install pytest pytest-xdist hypothesis typing_extensions - name: Build - run: CC=gcc-13 CXX=g++-13 spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_skx -Dtest-simd='BASELINE,AVX512_KNL,AVX512_KNM,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL,AVX512_SPR' + run: CC=gcc-13 CXX=g++-13 spin build -- -Denable-openmp=true -Dallow-noblas=true -Dcpu-baseline=avx512_skx -Dtest-simd='BASELINE,AVX512_KNL,AVX512_KNM,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL,AVX512_SPR' - name: Meson Log if: always() @@ -263,7 +263,7 @@ jobs: python -m pip install pytest pytest-xdist hypothesis typing_extensions - name: Build - run: CC=gcc-13 CXX=g++-13 spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr + run: CC=gcc-13 CXX=g++-13 spin build -- -Denable-openmp=true -Dallow-noblas=true -Dcpu-baseline=avx512_spr - name: Meson Log if: always() From ac59ea981df4a89479ef18c392cd96e9d90eb920 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 15 Apr 2025 14:34:57 -0700 Subject: [PATCH 07/10] Update x86-simd-sort: detect already sorted arrays for np.argsort --- numpy/_core/src/npysort/x86-simd-sort | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/npysort/x86-simd-sort b/numpy/_core/src/npysort/x86-simd-sort index 14f504c35217..c306ac581a59 160000 --- a/numpy/_core/src/npysort/x86-simd-sort +++ b/numpy/_core/src/npysort/x86-simd-sort @@ -1 +1 @@ -Subproject commit 14f504c35217f0b77e239c2a0c431a55d7f31337 +Subproject commit c306ac581a59f89585d778254c4ed7197e64ba2d From 8ba425e7eca84b0c26fafd360057549c2569c56b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 16 Apr 2025 12:06:17 -0700 Subject: 
[PATCH 08/10] DOCS: add release notes --- doc/release/upcoming_changes/28619.highlight.rst | 6 ++++++ doc/release/upcoming_changes/28619.performance.rst | 7 +++++++ 2 files changed, 13 insertions(+) create mode 100644 doc/release/upcoming_changes/28619.highlight.rst create mode 100644 doc/release/upcoming_changes/28619.performance.rst diff --git a/doc/release/upcoming_changes/28619.highlight.rst b/doc/release/upcoming_changes/28619.highlight.rst new file mode 100644 index 000000000000..6c296b92899e --- /dev/null +++ b/doc/release/upcoming_changes/28619.highlight.rst @@ -0,0 +1,6 @@ +Building NumPy with OpenMP Parallelization +------------------------------------------- +NumPy now supports OpenMP parallel processing capabilities when built with the +``-Denable-openmp=true`` Meson build flag. This feature is disabled by default. +When enabled, ``np.sort`` and ``np.argsort`` functions can utilize OpenMP for +parallel thread execution, improving performance for these operations. diff --git a/doc/release/upcoming_changes/28619.performance.rst b/doc/release/upcoming_changes/28619.performance.rst new file mode 100644 index 000000000000..904decbe0ba6 --- /dev/null +++ b/doc/release/upcoming_changes/28619.performance.rst @@ -0,0 +1,7 @@ +Performance improvements to ``np.sort`` and ``np.argsort`` +---------------------------------------------------------- +``np.sort`` and ``np.argsort`` functions now can leverage OpenMP for parallel +thread execution, resulting in up to 3.5x speedups on x86 architectures with +AVX2 or AVX-512 instructions. This opt-in feature requires NumPy to be built +with the ``-Denable-openmp=true`` Meson flag. Users can control the number of threads +used by setting the OMP_NUM_THREADS environment variable. 
From e0f024739d62a4e832e9ff569ee2c3fd980cc866 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 16 Apr 2025 12:06:53 -0700 Subject: [PATCH 09/10] Minor changes to meson.build --- numpy/_core/meson.build | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 0ea85912658d..62f62a14018d 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -128,19 +128,18 @@ if use_intel_sort and not fs.exists('src/npysort/x86-simd-sort/README.md') error('Missing the `x86-simd-sort` git submodule! Run `git submodule update --init` to fix this.') endif -use_openmp = not get_option('disable-threading') and get_option('enable-openmp') and cpp.has_argument('-fopenmp') -summary({ - 'Build with openMP? ' : use_openmp, -}) +# openMP related settings: +if get_option('disable-threading') and get_option('enable-openmp') + error('Build options `disable-threading` and `enable-openmp` are conflicting. 
Please set at most one to true.') +endif + +use_openmp = get_option('enable-openmp') and not get_option('disable-threading') # Setup openmp flags for x86-simd-sort: -omp_cflags = [] -omp = [] +omp_dep = [] if use_intel_sort and use_openmp - omp = dependency('openmp', required : false) - if omp.found() - omp_cflags = ['-fopenmp', '-DXSS_USE_OPENMP'] - endif + omp = dependency('openmp', required : true) + omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP']) endif if not fs.exists('src/common/pythoncapi-compat') @@ -890,9 +889,9 @@ foreach gen_mtargets : [ dispatch: gen_mtargets[2], # baseline: CPU_BASELINE, it doesn't provide baseline fallback prefix: 'NPY_', - dependencies: [py_dep, np_core_dep], + dependencies: [py_dep, np_core_dep, omp_dep], c_args: c_args_common + max_opt, - cpp_args: cpp_args_common + max_opt + omp_cflags, + cpp_args: cpp_args_common + max_opt, include_directories: [ 'include', 'src/common', From 6eff29e03c75d3fc31a9325780acd901c8b3651d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 16 Apr 2025 12:13:09 -0700 Subject: [PATCH 10/10] Initialize omp to empty variable --- numpy/_core/meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 62f62a14018d..a4d2050122c6 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -136,6 +136,7 @@ endif use_openmp = get_option('enable-openmp') and not get_option('disable-threading') # Setup openmp flags for x86-simd-sort: +omp = [] omp_dep = [] if use_intel_sort and use_openmp omp = dependency('openmp', required : true)