diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index a4d2050122c6..e4ee9523c61b 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -942,13 +942,14 @@ foreach gen_mtargets : [ ], [ 'loops_comparison.dispatch.h', - src_file.process('src/umath/loops_comparison.dispatch.c.src'), + 'src/umath/loops_comparison.dispatch.cpp', [ - AVX512_SKX, AVX512F, AVX2, SSE42, SSE2, + AVX512_SKX, AVX2, SSE42, SSE2, VSX3, VSX2, NEON, - VXE, VX, + VXE, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/umath/loops_comparison.dispatch.c.src b/numpy/_core/src/umath/loops_comparison.dispatch.c.src deleted file mode 100644 index 6450bed962b1..000000000000 --- a/numpy/_core/src/umath/loops_comparison.dispatch.c.src +++ /dev/null @@ -1,449 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -/******************************************************************************** - ** Defining the SIMD kernels - ********************************************************************************/ -/**begin repeat - * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# - * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# - * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# - * #VECTOR = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64# - */ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal# - * #eq = 1, 0, 0, 0# - * #neq = 0, 1, 0, 0# - * #OP = ==, !=, <, <=# - * #VOP = cmpeq, cmpneq, cmplt, cmple# - */ -#if @VECTOR@ && !((@eq@ || @neq@) && @signed@) -static void simd_binary_@kind@_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - // Unroll the loop to get a resultant vector with 'vsteps' elements. 
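/*
 * Standalone sketch of the unroll arithmetic behind the comment above
 * (plain C++, hypothetical 128-bit register width -- not the npyv API):
 * one u8 result vector covers width/8 booleans, while a vector of the
 * compared type T only covers width/(8*sizeof(T)) lanes, so each loop
 * iteration has to consume sizeof(T) input vectors (1, 2, 4 or 8) to
 * fill exactly one output store.
 */
#include <cassert>
#include <cstdio>

int main() {
    const int width_bits = 128;             // assumed register width
    const int u8_lanes   = width_bits / 8;  // booleans written per store
    for (int sizeof_T : {1, 2, 4, 8}) {     // u8/s8 ... u64/s64/f64
        int t_lanes = width_bits / (8 * sizeof_T); // lanes per T vector
        int unroll  = u8_lanes / t_lanes;          // T vectors per iteration
        assert(unroll == sizeof_T);
        std::printf("sizeof(T)=%d -> unroll x%d\n", sizeof_T, unroll);
    }
    return 0;
}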
- for (; len >= vstep; - len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { -#if @len@ >= 8 - npyv_@sfx@ a1 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 0); - npyv_@sfx@ b1 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 0); - npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b1); -#if @len@ >= 16 - npyv_@sfx@ a2 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 1); - npyv_@sfx@ b2 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 1); - npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b2); -#if @len@ >= 32 - npyv_@sfx@ a3 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ b3 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ a4 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 3); - npyv_@sfx@ b4 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 3); - npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b3); - npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b4); -#if @len@ == 64 - npyv_@sfx@ a5 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ b5 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ a6 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ b6 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ a7 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ b7 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ a8 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 7); - npyv_@sfx@ b8 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 7); - npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b5); - npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b6); - npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b7); - npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b8); -#endif // @len@ >= 64 -#endif // @len@ >= 32 -#endif // @len@ >= 16 -#endif // @len@ >= 8 - - // Pack the 'c' vectors into a single vector 'r' -#if @len@ == 8 - npyv_u8 r = npyv_cvt_u8_b8(c1); -#elif @len@ == 16 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); -#elif @len@ == 32 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); -#elif @len@ == 64 - npyv_u8 r = - npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); -#endif - npyv_store_u8(dst, npyv_and_u8(r, truemask)); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - *dst = a @OP@ b; - } -} - -static void simd_binary_scalar1_@kind@_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_@sfx@ a = npyv_setall_@sfx@(scalar); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { -#if @len@ >= 8 - npyv_@sfx@ b1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); - npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a, b1); -#if @len@ >= 16 - npyv_@sfx@ b2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); - npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a, b2); -#if @len@ >= 32 - npyv_@sfx@ b3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ b4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); - npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a, b3); - npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a, b4); -#if @len@ == 64 - npyv_@sfx@ b5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ b6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ b7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ b8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); - npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a, b5); - npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a, b6); - npyv_b@len@ c7 = 
npyv_@VOP@_@sfx@(a, b7); - npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a, b8); -#endif // @len@ >= 64 -#endif // @len@ >= 32 -#endif // @len@ >= 16 -#endif // @len@ >= 8 - -#if @len@ == 8 - npyv_u8 r = npyv_cvt_u8_b8(c1); -#elif @len@ == 16 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); -#elif @len@ == 32 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); -#elif @len@ == 64 - npyv_u8 r = - npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); -#endif - npyv_store_u8(dst, npyv_and_u8(r, truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ b = *src; - *dst = scalar @OP@ b; - } -} - -static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_@sfx@ b = npyv_setall_@sfx@(scalar); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { -#if @len@ >= 8 - npyv_@sfx@ a1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); - npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b); -#if @len@ >= 16 - npyv_@sfx@ a2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); - npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b); -#if @len@ >= 32 - npyv_@sfx@ a3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ a4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); - npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b); - npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b); -#if @len@ == 64 - npyv_@sfx@ a5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ a6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ a7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ a8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); - npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b); - npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b); - npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b); - npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b); -#endif // @len@ >= 64 -#endif // @len@ >= 32 -#endif // @len@ >= 16 -#endif // @len@ >= 8 - -#if @len@ == 8 - npyv_u8 r = npyv_cvt_u8_b8(c1); -#elif @len@ == 16 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); -#elif @len@ == 32 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); -#elif @len@ == 64 - npyv_u8 r = - npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); -#endif - npyv_store_u8(dst, npyv_and_u8(r, truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - *dst = a @OP@ scalar; - } -} -#endif - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #kind = equal, not_equal, less, less_equal# - * #eq = 1, 0, 0, 0# - * #neq = 0, 1, 0, 0# - * #OP = ==, !=, <, <=# - * #VOP = xnor, xor, andc, orc# - */ - -#if NPY_SIMD -static void simd_binary_@kind@_b8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 truemask = npyv_setall_u8(0x1); - const npyv_u8 vzero = npyv_setall_u8(0x0); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; - len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { - // Whatever element in src != 0x0 is converted to 0xFF - npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero); - npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero); - npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, 
npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst) { - const npyv_lanetype_u8 a = *src1 != 0; - const npyv_lanetype_u8 b = *src2 != 0; - *dst = a @OP@ b; - } -} - -static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) -{ - npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 vzero = npyv_setall_u8(0x0); - const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_u8 b = *src != 0; - *dst = scalar @OP@ b; - } -} - -static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 vzero = npyv_setall_u8(0x0); - const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_u8 a = *src != 0; - *dst = a @OP@ scalar; - } -} -#endif -/**end repeat**/ - -/**begin repeat - * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, - npy_ulonglong, npy_longlong, npy_float, npy_double# - * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# - * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# - * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# - * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# - * #VECTOR = NPY_SIMD*9, NPY_SIMD_F32, NPY_SIMD_F64# - */ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal# - * #eq = 1, 0, 0, 0# - * #neq = 0, 1, 0, 0# - * #OP = ==, !=, <, <=# - */ -#if !((@eq@ || @neq@) && @signed@) -static inline void -run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @VECTOR@ - if (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) && - !is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]) - ) { - /* argument one scalar */ - if (IS_BINARY_CONT_S1(@type@, npy_bool)) { - simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); - return; - } - /* argument two scalar */ - else if (IS_BINARY_CONT_S2(@type@, npy_bool)) { - simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); - return; - } - else if (IS_BINARY_CONT(@type@, npy_bool)) { - simd_binary_@kind@_@sfx@(args, dimensions[0]); - return; - } - } -#endif - - BINARY_LOOP { -#if @bool@ - npy_bool in1 = *((npy_bool *)ip1) != 0; - npy_bool in2 = *((npy_bool *)ip2) != 0; -#else - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; -#endif - *((npy_bool *)op1) = in1 @OP@ in2; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/******************************************************************************** - ** Defining ufunc inner 
functions - ********************************************************************************/ - -/* - * In order to reduce the size of the binary generated from this source, the - * following rules are applied: 1) each data type implements its function - * 'greater' as a call to the function 'less' but with the arguments swapped, - * the same applies to the function 'greater_equal', which is implemented - * with a call to the function 'less_equal', and 2) for the integer datatypes - * of the same size (eg 8-bit), a single kernel of the functions 'equal' and - * 'not_equal' is used to implement both signed and unsigned types. - */ - -/**begin repeat - * Signed and Unsigned types - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG# - * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG# - * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1# - */ -#undef TO_SIMD_SFX -#undef TO_SIMD_UTYPE -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@STYPE@ == @len@ - #define TO_SIMD_UTYPE(X) X##_u@len@ - #if @signed@ - #define TO_SIMD_SFX(X) X##_s@len@ - #else - #define TO_SIMD_SFX(X) X##_u@len@ - #endif -/**end repeat1**/ -#endif - -/**begin repeat1 - * #kind = greater, greater_equal# - * #kind_to = less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - char *nargs[3] = {args[1], args[0], args[2]}; - npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; - TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps); -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps); -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = equal, not_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps); -} -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * Boolean & Float types - * #TYPE = BOOL, FLOAT, DOUBLE# - * #sfx = b8, f32, f64# - * #fp = 0, 1, 1# - */ -/**begin repeat1 - * #kind = greater, greater_equal# - * #kind_to = less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - char *nargs[3] = {args[1], args[0], args[2]}; - npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; - run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps); -#if @fp@ - npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - run_binary_simd_@kind@_@sfx@(args, dimensions, steps); -#if @fp@ - npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -/**end repeat1**/ -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_comparison.dispatch.cpp b/numpy/_core/src/umath/loops_comparison.dispatch.cpp new file mode 100644 index 000000000000..d37b24a366d9 --- /dev/null +++ b/numpy/_core/src/umath/loops_comparison.dispatch.cpp @@ -0,0 +1,578 @@ +#include 
"numpy/npy_common.h" +#include "common.hpp" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "simd/simd.hpp" +#include + +namespace { + +using namespace np::simd; + +template +struct OpEq { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Eq(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a == b; } +}; + +template +struct OpNe { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Ne(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a != b; } +}; + +template +struct OpLt { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Lt(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a < b; } +}; + +template +struct OpLe { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Le(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a <= b; } +}; + +// as tags only +template +struct OpGt {}; +template +struct OpGe {}; + +template +struct OpEqBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::Not(hn::Xor(a, b)); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { return a == b; } +}; + +template +struct OpNeBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::Xor(a, b); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { + return a != b; + } +}; + +template +struct OpLtBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::AndNot(b, a); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { return a < b; } +}; + +template +struct OpLeBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::Or(a, hn::Not(b)); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { return a <= b; } +}; + +// as tags only +template +struct OpGtBool {}; +template +struct OpGeBool {}; + +#if NPY_HWY +HWY_INLINE Vec simd_pack_b8_b16(Vec a, Vec b) { + return hn::OrderedDemote2To(_Tag(), a, b); +} + +HWY_INLINE Vec simd_pack_b8_b32(Vec a, Vec b, Vec c, Vec d) { + auto ab = hn::OrderedDemote2To(_Tag(), a, b); + auto cd = hn::OrderedDemote2To(_Tag(), c, 
d); + return simd_pack_b8_b16(ab, cd); +} + +HWY_INLINE Vec simd_pack_b8_b64(Vec a, Vec b, Vec c, Vec d, + Vec e, Vec f, Vec g, Vec h) { + auto ab = hn::OrderedDemote2To(_Tag(), a, b); + auto cd = hn::OrderedDemote2To(_Tag(), c, d); + auto ef = hn::OrderedDemote2To(_Tag(), e, f); + auto gh = hn::OrderedDemote2To(_Tag(), g, h); + return simd_pack_b8_b32(ab, cd, ef, gh); +} +#endif + +template +inline void binary(char **args, size_t len) +{ + OP op; + const T *src1 = reinterpret_cast(args[0]); + const T *src2 = reinterpret_cast(args[1]); + npy_bool *dst = reinterpret_cast(args[2]); +#if NPY_HWY + if constexpr (kSupportLane && sizeof(npy_bool) == sizeof(uint8_t)) { + const int vstep = Lanes(); + const size_t nlanes = Lanes(); + const Vec truemask = Set(uint8_t(0x1)); + Vec ret = Undefined(); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { + auto a1 = op(LoadU(src1 + nlanes * 0)); + auto b1 = op(LoadU(src2 + nlanes * 0)); + auto m1 = op(a1, b1); + auto m1_vec = VecFromMask(m1); + if constexpr (sizeof(T) >= 2) { + auto a2 = op(LoadU(src1 + nlanes * 1)); + auto b2 = op(LoadU(src2 + nlanes * 1)); + auto m2 = op(a2, b2); + auto m2_vec = VecFromMask(m2); + if constexpr (sizeof(T) >= 4) { + auto a3 = op(LoadU(src1 + nlanes * 2)); + auto b3 = op(LoadU(src2 + nlanes * 2)); + auto a4 = op(LoadU(src1 + nlanes * 3)); + auto b4 = op(LoadU(src2 + nlanes * 3)); + auto m3 = op(a3, b3); + auto m4 = op(a4, b4); + auto m3_vec = VecFromMask(m3); + auto m4_vec = VecFromMask(m4); + if constexpr (sizeof(T) == 8) { + auto a5 = op(LoadU(src1 + nlanes * 4)); + auto b5 = op(LoadU(src2 + nlanes * 4)); + auto a6 = op(LoadU(src1 + nlanes * 5)); + auto b6 = op(LoadU(src2 + nlanes * 5)); + auto a7 = op(LoadU(src1 + nlanes * 6)); + auto b7 = op(LoadU(src2 + nlanes * 6)); + auto a8 = op(LoadU(src1 + nlanes * 7)); + auto b8 = op(LoadU(src2 + nlanes * 7)); + auto m5 = op(a5, b5); + auto m6 = op(a6, b6); + auto m7 = op(a7, b7); + auto m8 = op(a8, b8); + auto m5_vec = VecFromMask(m5); + auto m6_vec = VecFromMask(m6); + auto m7_vec = VecFromMask(m7); + auto m8_vec = VecFromMask(m8); + ret = simd_pack_b8_b64( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec), + BitCast(m5_vec), + BitCast(m6_vec), + BitCast(m7_vec), + BitCast(m8_vec) + ); + } + else { + ret = simd_pack_b8_b32( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec) + ); + } + } + else { + ret = simd_pack_b8_b16(BitCast(m1_vec), BitCast(m2_vec)); + } + } + else { + ret = BitCast(m1_vec); + } + StoreU(And(ret, truemask), dst); + } + } +#endif + for (; len > 0; --len, ++src1, ++src2, ++dst) { + const auto a = op(*src1); + const auto b = op(*src2); + *dst = op(a, b); + } +} + +template +inline void binary_scalar1(char **args, size_t len) +{ + OP op; + const T *src1 = reinterpret_cast(args[0]); + const T *src2 = reinterpret_cast(args[1]); + npy_bool *dst = reinterpret_cast(args[2]); +#if NPY_HWY + if constexpr (kSupportLane && sizeof(npy_bool) == sizeof(uint8_t)) { + const int vstep = Lanes(); + const size_t nlanes = Lanes(); + const Vec truemask = Set(uint8_t(0x1)); + const auto a1 = op(Set(T(*src1) )); + Vec ret = Undefined(); + + for (; len >= vstep; len -= vstep, src2 += vstep, dst += vstep) { + auto b1 = op(LoadU(src2 + nlanes * 0)); + auto m1 = op(a1, b1); + auto m1_vec = VecFromMask(m1); + if constexpr (sizeof(T) >= 2) { + auto b2 = op(LoadU(src2 + nlanes * 1)); + auto m2 = op(a1, b2); + auto m2_vec = VecFromMask(m2); + if constexpr (sizeof(T) >= 4) { + auto b3 = op(LoadU(src2 + 
nlanes * 2)); + auto b4 = op(LoadU(src2 + nlanes * 3)); + auto m3 = op(a1, b3); + auto m4 = op(a1, b4); + auto m3_vec = VecFromMask(m3); + auto m4_vec = VecFromMask(m4); + if constexpr (sizeof(T) == 8) { + auto b5 = op(LoadU(src2 + nlanes * 4)); + auto b6 = op(LoadU(src2 + nlanes * 5)); + auto b7 = op(LoadU(src2 + nlanes * 6)); + auto b8 = op(LoadU(src2 + nlanes * 7)); + auto m5 = op(a1, b5); + auto m6 = op(a1, b6); + auto m7 = op(a1, b7); + auto m8 = op(a1, b8); + auto m5_vec = VecFromMask(m5); + auto m6_vec = VecFromMask(m6); + auto m7_vec = VecFromMask(m7); + auto m8_vec = VecFromMask(m8); + ret = simd_pack_b8_b64( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec), + BitCast(m5_vec), + BitCast(m6_vec), + BitCast(m7_vec), + BitCast(m8_vec) + ); + } + else { + ret = simd_pack_b8_b32( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec) + ); + } + } + else { + ret = simd_pack_b8_b16(BitCast(m1_vec), BitCast(m2_vec)); + } + } + else { + ret = BitCast(m1_vec); + } + StoreU(And(ret, truemask), dst); + } + } +#endif + const auto a = op(*src1); + for (; len > 0; --len, ++src2, ++dst) { + const auto b = op(*src2); + *dst = op(a, b); + } +} + +template +inline void binary_scalar2(char **args, size_t len) +{ + OP op; + const T *src1 = reinterpret_cast(args[0]); + const T *src2 = reinterpret_cast(args[1]); + npy_bool *dst = reinterpret_cast(args[2]); +#if NPY_HWY + if constexpr (kSupportLane && sizeof(npy_bool) == sizeof(uint8_t)) { + const int vstep = Lanes(); + const size_t nlanes = Lanes(); + const Vec truemask = Set(uint8_t(0x1)); + const auto b1 = op(Set(T(*src2) )); + Vec ret = Undefined(); + + for (; len >= vstep; len -= vstep, src1 += vstep, dst += vstep) { + auto a1 = op(LoadU(src1 + nlanes * 0)); + auto m1 = op(a1, b1); + auto m1_vec = VecFromMask(m1); + if constexpr (sizeof(T) >= 2) { + auto a2 = op(LoadU(src1 + nlanes * 1)); + auto m2 = op(a2, b1); + auto m2_vec = VecFromMask(m2); + if constexpr (sizeof(T) >= 4) { + auto a3 = op(LoadU(src1 + nlanes * 2)); + auto a4 = op(LoadU(src1 + nlanes * 3)); + auto m3 = op(a3, b1); + auto m4 = op(a4, b1); + auto m3_vec = VecFromMask(m3); + auto m4_vec = VecFromMask(m4); + if constexpr (sizeof(T) == 8) { + auto a5 = op(LoadU(src1 + nlanes * 4)); + auto a6 = op(LoadU(src1 + nlanes * 5)); + auto a7 = op(LoadU(src1 + nlanes * 6)); + auto a8 = op(LoadU(src1 + nlanes * 7)); + auto m5 = op(a5, b1); + auto m6 = op(a6, b1); + auto m7 = op(a7, b1); + auto m8 = op(a8, b1); + auto m5_vec = VecFromMask(m5); + auto m6_vec = VecFromMask(m6); + auto m7_vec = VecFromMask(m7); + auto m8_vec = VecFromMask(m8); + ret = simd_pack_b8_b64( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec), + BitCast(m5_vec), + BitCast(m6_vec), + BitCast(m7_vec), + BitCast(m8_vec) + ); + } + else { + ret = simd_pack_b8_b32( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec) + ); + } + } + else { + ret = simd_pack_b8_b16(BitCast(m1_vec), BitCast(m2_vec)); + } + } + else { + ret = BitCast(m1_vec); + } + StoreU(And(ret, truemask), dst); + } + } +#endif + const auto b = op(*src2); + for (; len > 0; --len, ++src1, ++dst) { + const auto a = op(*src1); + *dst = op(a, b); + } +} + +template +static void cmp_binary_branch(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + +#if NPY_HWY + if (!is_mem_overlap(ip1, is1, op1, os1, n) && + 
!is_mem_overlap(ip2, is2, op1, os1, n)) { + assert(n >= 0); + size_t len = static_cast(n); + // argument one scalar + if (is1 == 0 && is2 == sizeof(T) && os1 == sizeof(npy_bool)) { + binary_scalar1(args, len); + return; + } + // argument two scalar + if ((is1 == sizeof(T) && is2 == 0 && os1 == sizeof(npy_bool))) { + binary_scalar2(args, len); + return; + } + if (is1 == sizeof(T) && is2 == sizeof(T) && os1 == sizeof(npy_bool)) { + binary(args, len); + return; + } + } +#endif + + OP op; + for (npy_intp i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { + const auto a = op(reinterpret_cast(ip1)[0]); + const auto b = op(reinterpret_cast(ip2)[0]); + reinterpret_cast(op1)[0] = op(a, b); + } +} + +template typename OP> +inline void cmp_binary(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + /* + * In order to reduce the size of the binary generated from this source, the + * following rules are applied: 1) each data type implements its function + * 'greater' as a call to the function 'less' but with the arguments swapped, + * the same applies to the function 'greater_equal', which is implemented + * with a call to the function 'less_equal', and 2) for the integer datatypes + * of the same size (eg 8-bit), a single kernel of the functions 'equal' and + * 'not_equal' is used to implement both signed and unsigned types. + */ + constexpr bool kSwapToUnsigned_ = std::is_integral_v && ( + std::is_same_v, OpEq> || + std::is_same_v, OpNe> + ); + using SwapUnsigned_ = std::make_unsigned_t< + std::conditional_t + >; + using TLane_ = std::conditional_t; + using TLaneFixed_ = typename np::meta::FixedWidth::Type; + + using TOperation_ = OP; + using SwapOperationGt_ = std::conditional_t< + std::is_same_v>, + OpLt, TOperation_ + >; + using SwapOperationGe_ = std::conditional_t< + std::is_same_v>, + OpLe, SwapOperationGt_ + >; + using SwapOperationGtBool_ = std::conditional_t< + std::is_same_v>, + OpLtBool, SwapOperationGe_ + >; + using SwapOperation_ = std::conditional_t< + std::is_same_v>, + OpLeBool, SwapOperationGtBool_ + >; + + if constexpr (std::is_same_v) { + cmp_binary_branch(args, dimensions, steps); + } + else { + char *nargs[] = {args[1], args[0], args[2]}; + npy_intp nsteps[] = {steps[1], steps[0], steps[2]}; + cmp_binary_branch(nargs, dimensions, nsteps); + } + + if constexpr (std::is_same_v || + std::is_same_v) { + // clear any FP exceptions + np::FloatStatus(); + } +} +} // namespace anonymous + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +#define UMATH_IMPL_CMP_UFUNC(TYPE, NAME, T, OP) \ + void NPY_CPU_DISPATCH_CURFX(TYPE##_##NAME)(char **args, npy_intp const *dimensions, \ + npy_intp const *steps, void*) \ + { \ + cmp_binary(args, dimensions, steps); \ + } + +#define UMATH_IMPL_CMP_UFUNC_TYPES(NAME, OP, BOOL_OP) \ + UMATH_IMPL_CMP_UFUNC(BOOL, NAME, npy_bool, BOOL_OP) \ + UMATH_IMPL_CMP_UFUNC(UBYTE, NAME, npy_ubyte, OP) \ + UMATH_IMPL_CMP_UFUNC(BYTE, NAME, npy_byte, OP) \ + UMATH_IMPL_CMP_UFUNC(USHORT, NAME, npy_ushort, OP) \ + UMATH_IMPL_CMP_UFUNC(SHORT, NAME, npy_short, OP) \ + UMATH_IMPL_CMP_UFUNC(UINT, NAME, npy_uint, OP) \ + UMATH_IMPL_CMP_UFUNC(INT, NAME, npy_int, OP) \ + UMATH_IMPL_CMP_UFUNC(ULONG, NAME, npy_ulong, OP) \ + UMATH_IMPL_CMP_UFUNC(LONG, NAME, npy_long, OP) \ + UMATH_IMPL_CMP_UFUNC(ULONGLONG, NAME, npy_ulonglong, OP) \ + UMATH_IMPL_CMP_UFUNC(LONGLONG, NAME, npy_longlong, OP) \ + 
UMATH_IMPL_CMP_UFUNC(FLOAT, NAME, npy_float, OP) \ + UMATH_IMPL_CMP_UFUNC(DOUBLE, NAME, npy_double, OP) + +UMATH_IMPL_CMP_UFUNC_TYPES(equal, OpEq, OpEqBool) +UMATH_IMPL_CMP_UFUNC_TYPES(not_equal, OpNe, OpNeBool) +UMATH_IMPL_CMP_UFUNC_TYPES(less, OpLt, OpLtBool) +UMATH_IMPL_CMP_UFUNC_TYPES(less_equal, OpLe, OpLeBool) +UMATH_IMPL_CMP_UFUNC_TYPES(greater, OpGt, OpGtBool) +UMATH_IMPL_CMP_UFUNC_TYPES(greater_equal, OpGe, OpGeBool) \ No newline at end of file
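/*
 * Standalone scalar model (plain C++ with std::array -- not the Highway
 * OrderedDemote2To calls used by simd_pack_b8_b16/b32/b64 above) of the
 * mask-packing step: comparison masks produced on wide lanes are narrowed,
 * in lane order, until every boolean occupies one byte; the real kernels
 * then AND the packed bytes with 0x01 so the stored npy_bool is 0 or 1.
 * This sketch collapses straight to bytes instead of demoting in stages.
 */
#include <array>
#include <cassert>
#include <cstdint>

// Narrow two blocks of N wide masks into one block of 2*N byte masks,
// preserving order -- a scalar analogue of the ordered demote.
template <typename W, std::size_t N>
std::array<std::uint8_t, 2 * N> pack(const std::array<W, N> &lo,
                                     const std::array<W, N> &hi)
{
    std::array<std::uint8_t, 2 * N> out{};
    for (std::size_t i = 0; i < N; ++i) {
        out[i]     = lo[i] ? 0xFF : 0x00;
        out[N + i] = hi[i] ? 0xFF : 0x00;
    }
    return out;
}

int main() {
    // Two "vectors" of 64-bit comparison masks (2 lanes each on a 128-bit
    // register) collapse into one 4-lane byte mask, keeping lane order.
    std::array<std::uint64_t, 2> c1{~0ull, 0}, c2{0, ~0ull};
    const auto bytes = pack(c1, c2);
    assert(bytes[0] == 0xFF && bytes[1] == 0x00);
    assert(bytes[2] == 0x00 && bytes[3] == 0xFF);
    return 0;
}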
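/*
 * Standalone check (plain C++ on scalars -- not the SIMD code itself) of
 * the boolean identities that both the old npyv b8 kernels and the new
 * Op*Bool functors rely on. Each lane is first reduced to a zero-mask
 * a = (x == 0), b = (y == 0); comparing the logical values (x != 0) and
 * (y != 0) then becomes pure bitwise logic on those masks:
 *   ==  ->  XNOR(a, b)     !=  ->  XOR(a, b)
 *   <   ->  a AND NOT b    <=  ->  a OR NOT b
 */
#include <cassert>

int main() {
    for (int x : {0, 1, 7}) {           // any non-zero value counts as true
        for (int y : {0, 1, 7}) {
            const bool a = (x == 0), b = (y == 0);    // zero-masks
            const bool lx = (x != 0), ly = (y != 0);  // logical lane values
            assert((lx == ly) == !(a ^ b));   // equal      -> xnor
            assert((lx != ly) ==  (a ^ b));   // not_equal  -> xor
            assert((lx <  ly) == (a && !b));  // less       -> and-not
            assert((lx <= ly) == (a || !b));  // less_equal -> or-not
        }
    }
    return 0;
}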
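/*
 * Standalone model (plain C++ with fixed strides -- not the ufunc stride
 * machinery or cmp_binary_branch itself) of how the dispatcher picks a
 * kernel: a step of 0 on an input means that operand is a broadcast scalar
 * (binary_scalar1 / binary_scalar2), a step equal to sizeof(T) on every
 * array means fully contiguous data (binary), and anything else falls
 * through to the generic strided loop. pick_kernel is a hypothetical name.
 */
#include <cstddef>
#include <cstdio>

enum class Kernel { Scalar1, Scalar2, Contiguous, Strided };

template <typename T>
Kernel pick_kernel(std::ptrdiff_t is1, std::ptrdiff_t is2, std::ptrdiff_t os1)
{
    const std::ptrdiff_t elem  = sizeof(T);
    const std::ptrdiff_t obool = sizeof(unsigned char);  // npy_bool output
    if (is1 == 0    && is2 == elem && os1 == obool) return Kernel::Scalar1;
    if (is1 == elem && is2 == 0    && os1 == obool) return Kernel::Scalar2;
    if (is1 == elem && is2 == elem && os1 == obool) return Kernel::Contiguous;
    return Kernel::Strided;
}

int main() {
    // e.g. `arr < 3.0f` on a contiguous float array: operand two is scalar.
    std::printf("%d\n", static_cast<int>(pick_kernel<float>(4, 0, 1)));  // Scalar2
    std::printf("%d\n", static_cast<int>(pick_kernel<float>(4, 4, 1)));  // Contiguous
    std::printf("%d\n", static_cast<int>(pick_kernel<float>(8, 4, 1)));  // Strided
    return 0;
}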
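/*
 * Standalone sketch (plain C++ on scalars) of the two binary-size-reduction
 * rules stated in the comment block above cmp_binary():
 *   1) greater / greater_equal get no dedicated kernels; the dispatcher
 *      swaps the two operands and calls less / less_equal instead.
 *   2) equal / not_equal on signed integers reuse the unsigned kernel of
 *      the same width, because equality depends only on the bit pattern.
 * The helper names below are illustrative, not NumPy functions.
 */
#include <cassert>
#include <cstdint>
#include <cstring>

static bool less_u32(std::uint32_t a, std::uint32_t b)       { return a <  b; }
static bool less_equal_u32(std::uint32_t a, std::uint32_t b) { return a <= b; }

int main() {
    // Rule 1: a > b  <=>  b < a, and a >= b  <=>  b <= a (argument swap).
    std::uint32_t a = 5, b = 3;
    assert((a >  b) == less_u32(b, a));
    assert((a >= b) == less_equal_u32(b, a));

    // Rule 2: signed (in)equality matches unsigned (in)equality of the
    // same bit pattern, so one u32 kernel also serves s32.
    std::int32_t  sa = -1, sb = -1, sc = 7;
    std::uint32_t ua, ub, uc;
    std::memcpy(&ua, &sa, sizeof ua);
    std::memcpy(&ub, &sb, sizeof ub);
    std::memcpy(&uc, &sc, sizeof uc);
    assert((sa == sb) == (ua == ub));
    assert((sa != sc) == (ua != uc));
    return 0;
}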