From cf4911312110d4d26abea7e8258c383298debcd3 Mon Sep 17 00:00:00 2001 From: Lu Yahan Date: Wed, 14 Aug 2024 14:46:27 +0800 Subject: [PATCH] Convert loop unary fp into highway --- meson_cpu/x86/meson.build | 6 +- numpy/_core/meson.build | 11 +- .../src/umath/loops_unary_fp.dispatch.c.src | 12 +- .../src/umath/loops_unary_fp.dispatch.cpp | 343 ++++++++++++++++++ 4 files changed, 360 insertions(+), 12 deletions(-) create mode 100644 numpy/_core/src/umath/loops_unary_fp.dispatch.cpp diff --git a/meson_cpu/x86/meson.build b/meson_cpu/x86/meson.build index 8c7a0fb59a57..9be3afba3ef6 100644 --- a/meson_cpu/x86/meson.build +++ b/meson_cpu/x86/meson.build @@ -19,7 +19,7 @@ SSE3 = mod_features.new( ) SSSE3 = mod_features.new( 'SSSE3', 4, implies: SSE3, - args: '-mssse3', + args: ['-mssse3','-DHWY_WANT_SSSE3=1'], test_code: files(source_root + '/numpy/distutils/checks/cpu_ssse3.c')[0] ) SSE41 = mod_features.new( @@ -33,7 +33,7 @@ POPCNT = mod_features.new( test_code: files(source_root + '/numpy/distutils/checks/cpu_popcnt.c')[0] ) SSE42 = mod_features.new( - 'SSE42', 7, implies: POPCNT, args: '-msse4.2', + 'SSE42', 7, implies: POPCNT, args: ['-msse4.2', '-DHWY_WANT_SSE4=1'], test_code: files(source_root + '/numpy/distutils/checks/cpu_sse42.c')[0] ) # 7-20 left as margin for any extra features @@ -196,6 +196,8 @@ if compiler_id == 'msvc' AVX512_ICL ] fet.update(args: '') + SSE42.update(args: '-DHWY_WANT_SSE4=1') + SSSE3.update(args: '-DHWY_WANT_SSSE3=1') endforeach # MSVC compiler doesn't support the following features independently FMA3.update(implies: [F16C, AVX2]) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index dbf1a144ed93..b2ef96df8207 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -106,7 +106,10 @@ if use_highway highway_lib = static_library('highway', [ # required for hwy::Abort symbol - 'src/highway/hwy/abort.cc' + 'src/highway/hwy/abort.cc', + 'src/highway/hwy/per_target.cc', + 'src/highway/hwy/targets.cc', + 'src/highway/hwy/print.cc' ], cpp_args: '-DTOOLCHAIN_MISS_ASM_HWCAP_H', include_directories: ['src/highway'], @@ -972,12 +975,12 @@ foreach gen_mtargets : [ ], [ 'loops_unary_fp.dispatch.h', - src_file.process('src/umath/loops_unary_fp.dispatch.c.src'), + 'src/umath/loops_unary_fp.dispatch.cpp', [ - SSE41, SSE2, + SSE42, SSE2, VSX2, ASIMD, NEON, - VXE, VX + VXE, VX, RVV ] ], [ diff --git a/numpy/_core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/_core/src/umath/loops_unary_fp.dispatch.c.src index 6cce02cd37bc..eb13364f88a7 100644 --- a/numpy/_core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/_core/src/umath/loops_unary_fp.dispatch.c.src @@ -100,9 +100,9 @@ NPY_FINLINE double c_square_f64(double a) */ #if @VCHK@ /**begin repeat1 - * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# - * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip# - * #repl_0w1 = 0*7, 1# + * #kind = reciprocal# + * #intr = recip# + * #repl_0w1 = 1# */ /**begin repeat2 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# @@ -199,9 +199,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# - * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip# - * #clear = 0, 0, 0, 0, 0, 1, 0, 0# + * #kind = reciprocal# + * #intr = recip# + * #clear = 0# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/_core/src/umath/loops_unary_fp.dispatch.cpp b/numpy/_core/src/umath/loops_unary_fp.dispatch.cpp new file mode 100644 index 000000000000..ac45b6df6d0c --- /dev/null +++ b/numpy/_core/src/umath/loops_unary_fp.dispatch.cpp @@ -0,0 +1,343 @@ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#define PY_SSIZE_T_CLEAN +#include +#include "numpy/ndarraytypes.h" +#include "numpy/npy_common.h" +#include "numpy/npy_math.h" +#include "numpy/utils.h" +#include "fast_loop_macros.h" +#include "loops_utils.h" +#include +#include "hwy/print-inl.h" +#include + +namespace hn = hwy::HWY_NAMESPACE; +HWY_BEFORE_NAMESPACE(); +template +struct OpRound { + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { +#if HWY_ARCH_X86_64 && HWY_TARGET >= HWY_SSSE3 + hn::ScalableTag d; + auto infmask = hn::IsInf(a); + auto nanmask = hn::IsNaN(a); + auto mask = hn::Or(infmask, nanmask); + auto b = hn::IfThenElse(mask, hn::Set(d, 0), a); + b = hn::Round(b); + return hn::IfThenElse(mask, a, b); +#else + return hn::Round(a); +#endif + } +}; + +template +struct OpFloor { + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { +#if HWY_ARCH_X86_64 && HWY_TARGET >= HWY_SSSE3 + hn::ScalableTag d; + auto infmask = hn::IsInf(a); + auto nanmask = hn::IsNaN(a); + auto mask = hn::Or(infmask, nanmask); + auto b = hn::IfThenElse(mask, hn::Set(d, 0), a); + b = hn::Floor(b); + return hn::IfThenElse(mask, a, b); +#else + return hn::Floor(a); +#endif + } +}; + +template +struct OpCeil { + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { +#if HWY_ARCH_X86_64 && HWY_TARGET >= HWY_SSSE3 + hn::ScalableTag d; + auto infmask = hn::IsInf(a); + auto nanmask = hn::IsNaN(a); + auto mask = hn::Or(infmask, nanmask); + auto b = hn::IfThenElse(mask, hn::Set(d, 0), a); + b = hn::Ceil(b); + return hn::IfThenElse(mask, a, b); +#else + return hn::Ceil(a); +#endif + } +}; + +template +struct OpTrunc { + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { +#if HWY_ARCH_X86_64 && HWY_TARGET >= HWY_SSSE3 + hn::ScalableTag d; + auto infmask = hn::IsInf(a); + auto nanmask = hn::IsNaN(a); + auto mask = hn::Or(infmask, nanmask); + auto b = hn::IfThenElse(mask, hn::Set(d, 0), a); + b = hn::Trunc(b); + return hn::IfThenElse(mask, a, b); +#else + return hn::Trunc(a); +#endif + } +}; + +template +struct OpSqrt { + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { + return hn::Sqrt(a); + } +}; + +template +struct OpSquare { + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { + return hn::Mul(a, a); + } +}; + +template +struct OpAbs { + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { + return hn::Abs(a); + } +}; + +template +struct OpReciprocal { + hn::ScalableTag d; + HWY_INLINE hn::VFromD> operator()( + hn::VFromD> a) { + return hn::Div(hn::Set(d, T(1.0)), a); + } +}; + +template +HWY_INLINE void Super(char** args, + npy_intp const* dimensions, + npy_intp const* steps, + bool IS_RECIP) { + const T* HWY_RESTRICT input_array = (const T*)args[0]; + T* HWY_RESTRICT output_array = (T*)args[1]; + const size_t size = dimensions[0]; + const hn::ScalableTag d; + auto one = hn::Set(d, 1); + OP op; + if (is_mem_overlap(input_array, steps[0], output_array, steps[1], size)) { + const int lsize = sizeof(input_array[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + for (size_t len = size; 0 < len; + len--, input_array += ssrc, output_array += sdst) { + hn::Vec> in; + if (IS_RECIP) { + in = hn::LoadNOr(one, d, input_array, 1); + } else { + in = hn::LoadN(d, input_array, 1); + } + auto x = op(in); + hn::StoreN(x, d, output_array, 1); + } + } else if (IS_UNARY_CONT(input_array[0], output_array[0])) { + const int vstep = hn::Lanes(d); + const int wstep = vstep * 4; + size_t len = size; + for (; len >= wstep; + len -= wstep, input_array += wstep, output_array += wstep) { + const auto in0 = hn::LoadU(d, input_array + vstep * 0); + auto x0 = op(in0); + const auto in1 = hn::LoadU(d, input_array + vstep * 1); + auto x1 = op(in1); + const auto in2 = hn::LoadU(d, input_array + vstep * 2); + auto x2 = op(in2); + const auto in3 = hn::LoadU(d, input_array + vstep * 3); + auto x3 = op(in3); + hn::StoreU(x0, d, output_array + vstep * 0); + hn::StoreU(x1, d, output_array + vstep * 1); + hn::StoreU(x2, d, output_array + vstep * 2); + hn::StoreU(x3, d, output_array + vstep * 3); + } + for (; len >= vstep; + len -= vstep, input_array += vstep, output_array += vstep) { + const auto in = hn::LoadU(d, input_array); + auto x = op(in); + hn::StoreU(x, d, output_array); + } + if (len) { + hn::Vec> in; + if (IS_RECIP) { + in = hn::LoadNOr(one, d, input_array, len); + } else { + in = hn::LoadN(d, input_array, len); + } + auto x = op(in); + hn::StoreN(x, d, output_array, len); + } + } else { + using TI = hwy::MakeSigned; + const hn::Rebind> di; + + const int lsize = sizeof(input_array[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + auto load_index = hn::Mul(hn::Iota(di, 0), hn::Set(di, ssrc)); + auto store_index = hn::Mul(hn::Iota(di, 0), hn::Set(di, sdst)); + size_t full = size & -hn::Lanes(d); + size_t remainder = size - full; + if (sdst == 1 && ssrc != 1) { + for (size_t i = 0; i < full; i += hn::Lanes(d)) { + const auto in = hn::GatherIndex(d, input_array + i * ssrc, load_index); + auto x = op(in); + hn::StoreU(x, d, output_array + i); + } + } else if (sdst != 1 && ssrc == 1) { + for (size_t i = 0; i < full; i += hn::Lanes(d)) { + const auto in = hn::LoadU(d, input_array + i); + auto x = op(in); + hn::ScatterIndex(x, d, output_array + i * sdst, store_index); + } + } else { + for (size_t i = 0; i < full; i += hn::Lanes(d)) { + const auto in = hn::GatherIndex(d, input_array + i * ssrc, load_index); + auto x = op(in); + hn::ScatterIndex(x, d, output_array + i * sdst, store_index); + } + } + if (remainder) { + hn::Vec> in; + if (IS_RECIP) { + if (ssrc == 1) { + in = hn::LoadNOr(one, d, input_array + full, remainder); + } else { + in = hn::GatherIndexNOr(one, d, input_array + full * ssrc, load_index, + remainder); + } + } else { + if (ssrc == 1) { + in = hn::LoadN(d, input_array + full, remainder); + } else { + in = hn::GatherIndexN(d, input_array + full * ssrc, load_index, + remainder); + } + } + auto x = op(in); + if (sdst == 1) { + hn::StoreN(x, d, output_array + full, remainder); + } else { + hn::ScatterIndexN(x, d, output_array + full * sdst, store_index, + remainder); + } + } + } +} + + +extern "C" { +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_rint) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_rint) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_floor) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_floor) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_ceil) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ceil) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_trunc) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_trunc) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sqrt) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sqrt) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_square) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_square) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return Super>(args, dimensions, steps, false); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + Super>(args, dimensions, steps, false); + npy_clear_floatstatus_barrier((char*)dimensions); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + Super>(args, dimensions, steps, false); + npy_clear_floatstatus_barrier((char*)dimensions); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_reciprocal) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + Super>(args, dimensions, steps, true); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_reciprocal) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + Super>(args, dimensions, steps, true); +} +} +HWY_AFTER_NAMESPACE(); \ No newline at end of file