diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index a4d2050122c6..e4ee9523c61b 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -942,13 +942,14 @@ foreach gen_mtargets : [ ], [ 'loops_comparison.dispatch.h', - src_file.process('src/umath/loops_comparison.dispatch.c.src'), + 'src/umath/loops_comparison.dispatch.cpp', [ - AVX512_SKX, AVX512F, AVX2, SSE42, SSE2, + AVX512_SKX, AVX2, SSE42, SSE2, VSX3, VSX2, NEON, - VXE, VX, + VXE, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/umath/loops_comparison.dispatch.c.src b/numpy/_core/src/umath/loops_comparison.dispatch.c.src deleted file mode 100644 index 6450bed962b1..000000000000 --- a/numpy/_core/src/umath/loops_comparison.dispatch.c.src +++ /dev/null @@ -1,449 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -/******************************************************************************** - ** Defining the SIMD kernels - ********************************************************************************/ -/**begin repeat - * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# - * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# - * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# - * #VECTOR = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64# - */ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal# - * #eq = 1, 0, 0, 0# - * #neq = 0, 1, 0, 0# - * #OP = ==, !=, <, <=# - * #VOP = cmpeq, cmpneq, cmplt, cmple# - */ -#if @VECTOR@ && !((@eq@ || @neq@) && @signed@) -static void simd_binary_@kind@_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - // Unroll the loop to get a resultant vector with 'vsteps' elements. 
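/*
 * Standalone sketch of the unroll arithmetic behind the comment above
 * (plain C++, hypothetical 128-bit register width -- not the npyv API):
 * one u8 result vector covers width/8 booleans, while a vector of the
 * compared type T only covers width/(8*sizeof(T)) lanes, so each loop
 * iteration has to consume sizeof(T) input vectors (1, 2, 4 or 8) to
 * fill exactly one output store.
 */
#include <cassert>
#include <cstdio>

int main() {
    const int width_bits = 128;             // assumed register width
    const int u8_lanes   = width_bits / 8;  // booleans written per store
    for (int sizeof_T : {1, 2, 4, 8}) {     // u8/s8 ... u64/s64/f64
        int t_lanes = width_bits / (8 * sizeof_T); // lanes per T vector
        int unroll  = u8_lanes / t_lanes;          // T vectors per iteration
        assert(unroll == sizeof_T);
        std::printf("sizeof(T)=%d -> unroll x%d\n", sizeof_T, unroll);
    }
    return 0;
}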
- for (; len >= vstep; - len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { -#if @len@ >= 8 - npyv_@sfx@ a1 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 0); - npyv_@sfx@ b1 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 0); - npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b1); -#if @len@ >= 16 - npyv_@sfx@ a2 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 1); - npyv_@sfx@ b2 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 1); - npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b2); -#if @len@ >= 32 - npyv_@sfx@ a3 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ b3 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ a4 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 3); - npyv_@sfx@ b4 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 3); - npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b3); - npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b4); -#if @len@ == 64 - npyv_@sfx@ a5 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ b5 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ a6 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ b6 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ a7 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ b7 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ a8 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 7); - npyv_@sfx@ b8 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 7); - npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b5); - npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b6); - npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b7); - npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b8); -#endif // @len@ >= 64 -#endif // @len@ >= 32 -#endif // @len@ >= 16 -#endif // @len@ >= 8 - - // Pack the 'c' vectors into a single vector 'r' -#if @len@ == 8 - npyv_u8 r = npyv_cvt_u8_b8(c1); -#elif @len@ == 16 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); -#elif @len@ == 32 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); -#elif @len@ == 64 - npyv_u8 r = - npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); -#endif - npyv_store_u8(dst, npyv_and_u8(r, truemask)); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - *dst = a @OP@ b; - } -} - -static void simd_binary_scalar1_@kind@_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_@sfx@ a = npyv_setall_@sfx@(scalar); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { -#if @len@ >= 8 - npyv_@sfx@ b1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); - npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a, b1); -#if @len@ >= 16 - npyv_@sfx@ b2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); - npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a, b2); -#if @len@ >= 32 - npyv_@sfx@ b3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ b4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); - npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a, b3); - npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a, b4); -#if @len@ == 64 - npyv_@sfx@ b5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ b6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ b7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ b8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); - npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a, b5); - npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a, b6); - npyv_b@len@ c7 = 
npyv_@VOP@_@sfx@(a, b7); - npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a, b8); -#endif // @len@ >= 64 -#endif // @len@ >= 32 -#endif // @len@ >= 16 -#endif // @len@ >= 8 - -#if @len@ == 8 - npyv_u8 r = npyv_cvt_u8_b8(c1); -#elif @len@ == 16 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); -#elif @len@ == 32 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); -#elif @len@ == 64 - npyv_u8 r = - npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); -#endif - npyv_store_u8(dst, npyv_and_u8(r, truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ b = *src; - *dst = scalar @OP@ b; - } -} - -static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_@sfx@ b = npyv_setall_@sfx@(scalar); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { -#if @len@ >= 8 - npyv_@sfx@ a1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); - npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b); -#if @len@ >= 16 - npyv_@sfx@ a2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); - npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b); -#if @len@ >= 32 - npyv_@sfx@ a3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); - npyv_@sfx@ a4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); - npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b); - npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b); -#if @len@ == 64 - npyv_@sfx@ a5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); - npyv_@sfx@ a6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); - npyv_@sfx@ a7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); - npyv_@sfx@ a8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); - npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b); - npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b); - npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b); - npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b); -#endif // @len@ >= 64 -#endif // @len@ >= 32 -#endif // @len@ >= 16 -#endif // @len@ >= 8 - -#if @len@ == 8 - npyv_u8 r = npyv_cvt_u8_b8(c1); -#elif @len@ == 16 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); -#elif @len@ == 32 - npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); -#elif @len@ == 64 - npyv_u8 r = - npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); -#endif - npyv_store_u8(dst, npyv_and_u8(r, truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - *dst = a @OP@ scalar; - } -} -#endif - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #kind = equal, not_equal, less, less_equal# - * #eq = 1, 0, 0, 0# - * #neq = 0, 1, 0, 0# - * #OP = ==, !=, <, <=# - * #VOP = xnor, xor, andc, orc# - */ - -#if NPY_SIMD -static void simd_binary_@kind@_b8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 truemask = npyv_setall_u8(0x1); - const npyv_u8 vzero = npyv_setall_u8(0x0); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; - len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { - // Whatever element in src != 0x0 is converted to 0xFF - npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero); - npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero); - npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, 
npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst) { - const npyv_lanetype_u8 a = *src1 != 0; - const npyv_lanetype_u8 b = *src2 != 0; - *dst = a @OP@ b; - } -} - -static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) -{ - npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 vzero = npyv_setall_u8(0x0); - const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_u8 b = *src != 0; - *dst = scalar @OP@ b; - } -} - -static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 vzero = npyv_setall_u8(0x0); - const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero); - const npyv_u8 truemask = npyv_setall_u8(0x1); - const int vstep = npyv_nlanes_u8; - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_u8 a = *src != 0; - *dst = a @OP@ scalar; - } -} -#endif -/**end repeat**/ - -/**begin repeat - * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, - npy_ulonglong, npy_longlong, npy_float, npy_double# - * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# - * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# - * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# - * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# - * #VECTOR = NPY_SIMD*9, NPY_SIMD_F32, NPY_SIMD_F64# - */ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal# - * #eq = 1, 0, 0, 0# - * #neq = 0, 1, 0, 0# - * #OP = ==, !=, <, <=# - */ -#if !((@eq@ || @neq@) && @signed@) -static inline void -run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @VECTOR@ - if (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) && - !is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]) - ) { - /* argument one scalar */ - if (IS_BINARY_CONT_S1(@type@, npy_bool)) { - simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); - return; - } - /* argument two scalar */ - else if (IS_BINARY_CONT_S2(@type@, npy_bool)) { - simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); - return; - } - else if (IS_BINARY_CONT(@type@, npy_bool)) { - simd_binary_@kind@_@sfx@(args, dimensions[0]); - return; - } - } -#endif - - BINARY_LOOP { -#if @bool@ - npy_bool in1 = *((npy_bool *)ip1) != 0; - npy_bool in2 = *((npy_bool *)ip2) != 0; -#else - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; -#endif - *((npy_bool *)op1) = in1 @OP@ in2; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/******************************************************************************** - ** Defining ufunc inner 
functions - ********************************************************************************/ - -/* - * In order to reduce the size of the binary generated from this source, the - * following rules are applied: 1) each data type implements its function - * 'greater' as a call to the function 'less' but with the arguments swapped, - * the same applies to the function 'greater_equal', which is implemented - * with a call to the function 'less_equal', and 2) for the integer datatypes - * of the same size (eg 8-bit), a single kernel of the functions 'equal' and - * 'not_equal' is used to implement both signed and unsigned types. - */ - -/**begin repeat - * Signed and Unsigned types - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG# - * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG# - * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1# - */ -#undef TO_SIMD_SFX -#undef TO_SIMD_UTYPE -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@STYPE@ == @len@ - #define TO_SIMD_UTYPE(X) X##_u@len@ - #if @signed@ - #define TO_SIMD_SFX(X) X##_s@len@ - #else - #define TO_SIMD_SFX(X) X##_u@len@ - #endif -/**end repeat1**/ -#endif - -/**begin repeat1 - * #kind = greater, greater_equal# - * #kind_to = less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - char *nargs[3] = {args[1], args[0], args[2]}; - npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; - TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps); -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps); -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = equal, not_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps); -} -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * Boolean & Float types - * #TYPE = BOOL, FLOAT, DOUBLE# - * #sfx = b8, f32, f64# - * #fp = 0, 1, 1# - */ -/**begin repeat1 - * #kind = greater, greater_equal# - * #kind_to = less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - char *nargs[3] = {args[1], args[0], args[2]}; - npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; - run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps); -#if @fp@ - npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -/**end repeat1**/ - -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - run_binary_simd_@kind@_@sfx@(args, dimensions, steps); -#if @fp@ - npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -/**end repeat1**/ -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_comparison.dispatch.cpp b/numpy/_core/src/umath/loops_comparison.dispatch.cpp new file mode 100644 index 000000000000..d37b24a366d9 --- /dev/null +++ b/numpy/_core/src/umath/loops_comparison.dispatch.cpp @@ -0,0 +1,578 @@ +#include 
"numpy/npy_common.h" +#include "common.hpp" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "simd/simd.hpp" +#include + +namespace { + +using namespace np::simd; + +template +struct OpEq { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Eq(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a == b; } +}; + +template +struct OpNe { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Ne(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a != b; } +}; + +template +struct OpLt { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Lt(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a < b; } +}; + +template +struct OpLe { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { return v; } + + template >> + HWY_INLINE auto operator()(const V &a, const V &b) + { return hn::Le(a, b); } +#endif + HWY_INLINE T operator()(T a) + { return a; } + + HWY_INLINE npy_bool operator()(T a, T b) + { return a <= b; } +}; + +// as tags only +template +struct OpGt {}; +template +struct OpGe {}; + +template +struct OpEqBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::Not(hn::Xor(a, b)); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { return a == b; } +}; + +template +struct OpNeBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::Xor(a, b); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { + return a != b; + } +}; + +template +struct OpLtBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::AndNot(b, a); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { return a < b; } +}; + +template +struct OpLeBool { +#if NPY_HWY + template >> + HWY_INLINE auto operator()(const V &v) + { + const auto zero = Zero(); + return hn::Eq(v, zero); + } + + template >> + HWY_INLINE auto operator()(const M &a, const M &b) + { return hn::Or(a, hn::Not(b)); } +#endif + HWY_INLINE bool operator()(T v) + { return v != 0; } + + HWY_INLINE npy_bool operator()(bool a, bool b) + { return a <= b; } +}; + +// as tags only +template +struct OpGtBool {}; +template +struct OpGeBool {}; + +#if NPY_HWY +HWY_INLINE Vec simd_pack_b8_b16(Vec a, Vec b) { + return hn::OrderedDemote2To(_Tag(), a, b); +} + +HWY_INLINE Vec simd_pack_b8_b32(Vec a, Vec b, Vec c, Vec d) { + auto ab = hn::OrderedDemote2To(_Tag(), a, b); + auto cd = hn::OrderedDemote2To(_Tag(), c, 
d); + return simd_pack_b8_b16(ab, cd); +} + +HWY_INLINE Vec simd_pack_b8_b64(Vec a, Vec b, Vec c, Vec d, + Vec e, Vec f, Vec g, Vec h) { + auto ab = hn::OrderedDemote2To(_Tag(), a, b); + auto cd = hn::OrderedDemote2To(_Tag(), c, d); + auto ef = hn::OrderedDemote2To(_Tag(), e, f); + auto gh = hn::OrderedDemote2To(_Tag(), g, h); + return simd_pack_b8_b32(ab, cd, ef, gh); +} +#endif + +template +inline void binary(char **args, size_t len) +{ + OP op; + const T *src1 = reinterpret_cast(args[0]); + const T *src2 = reinterpret_cast(args[1]); + npy_bool *dst = reinterpret_cast(args[2]); +#if NPY_HWY + if constexpr (kSupportLane && sizeof(npy_bool) == sizeof(uint8_t)) { + const int vstep = Lanes(); + const size_t nlanes = Lanes(); + const Vec truemask = Set(uint8_t(0x1)); + Vec ret = Undefined(); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { + auto a1 = op(LoadU(src1 + nlanes * 0)); + auto b1 = op(LoadU(src2 + nlanes * 0)); + auto m1 = op(a1, b1); + auto m1_vec = VecFromMask(m1); + if constexpr (sizeof(T) >= 2) { + auto a2 = op(LoadU(src1 + nlanes * 1)); + auto b2 = op(LoadU(src2 + nlanes * 1)); + auto m2 = op(a2, b2); + auto m2_vec = VecFromMask(m2); + if constexpr (sizeof(T) >= 4) { + auto a3 = op(LoadU(src1 + nlanes * 2)); + auto b3 = op(LoadU(src2 + nlanes * 2)); + auto a4 = op(LoadU(src1 + nlanes * 3)); + auto b4 = op(LoadU(src2 + nlanes * 3)); + auto m3 = op(a3, b3); + auto m4 = op(a4, b4); + auto m3_vec = VecFromMask(m3); + auto m4_vec = VecFromMask(m4); + if constexpr (sizeof(T) == 8) { + auto a5 = op(LoadU(src1 + nlanes * 4)); + auto b5 = op(LoadU(src2 + nlanes * 4)); + auto a6 = op(LoadU(src1 + nlanes * 5)); + auto b6 = op(LoadU(src2 + nlanes * 5)); + auto a7 = op(LoadU(src1 + nlanes * 6)); + auto b7 = op(LoadU(src2 + nlanes * 6)); + auto a8 = op(LoadU(src1 + nlanes * 7)); + auto b8 = op(LoadU(src2 + nlanes * 7)); + auto m5 = op(a5, b5); + auto m6 = op(a6, b6); + auto m7 = op(a7, b7); + auto m8 = op(a8, b8); + auto m5_vec = VecFromMask(m5); + auto m6_vec = VecFromMask(m6); + auto m7_vec = VecFromMask(m7); + auto m8_vec = VecFromMask(m8); + ret = simd_pack_b8_b64( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec), + BitCast(m5_vec), + BitCast(m6_vec), + BitCast(m7_vec), + BitCast(m8_vec) + ); + } + else { + ret = simd_pack_b8_b32( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec) + ); + } + } + else { + ret = simd_pack_b8_b16(BitCast(m1_vec), BitCast(m2_vec)); + } + } + else { + ret = BitCast(m1_vec); + } + StoreU(And(ret, truemask), dst); + } + } +#endif + for (; len > 0; --len, ++src1, ++src2, ++dst) { + const auto a = op(*src1); + const auto b = op(*src2); + *dst = op(a, b); + } +} + +template +inline void binary_scalar1(char **args, size_t len) +{ + OP op; + const T *src1 = reinterpret_cast(args[0]); + const T *src2 = reinterpret_cast(args[1]); + npy_bool *dst = reinterpret_cast(args[2]); +#if NPY_HWY + if constexpr (kSupportLane && sizeof(npy_bool) == sizeof(uint8_t)) { + const int vstep = Lanes(); + const size_t nlanes = Lanes(); + const Vec truemask = Set(uint8_t(0x1)); + const auto a1 = op(Set(T(*src1) )); + Vec ret = Undefined(); + + for (; len >= vstep; len -= vstep, src2 += vstep, dst += vstep) { + auto b1 = op(LoadU(src2 + nlanes * 0)); + auto m1 = op(a1, b1); + auto m1_vec = VecFromMask(m1); + if constexpr (sizeof(T) >= 2) { + auto b2 = op(LoadU(src2 + nlanes * 1)); + auto m2 = op(a1, b2); + auto m2_vec = VecFromMask(m2); + if constexpr (sizeof(T) >= 4) { + auto b3 = op(LoadU(src2 + 
nlanes * 2)); + auto b4 = op(LoadU(src2 + nlanes * 3)); + auto m3 = op(a1, b3); + auto m4 = op(a1, b4); + auto m3_vec = VecFromMask(m3); + auto m4_vec = VecFromMask(m4); + if constexpr (sizeof(T) == 8) { + auto b5 = op(LoadU(src2 + nlanes * 4)); + auto b6 = op(LoadU(src2 + nlanes * 5)); + auto b7 = op(LoadU(src2 + nlanes * 6)); + auto b8 = op(LoadU(src2 + nlanes * 7)); + auto m5 = op(a1, b5); + auto m6 = op(a1, b6); + auto m7 = op(a1, b7); + auto m8 = op(a1, b8); + auto m5_vec = VecFromMask(m5); + auto m6_vec = VecFromMask(m6); + auto m7_vec = VecFromMask(m7); + auto m8_vec = VecFromMask(m8); + ret = simd_pack_b8_b64( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec), + BitCast(m5_vec), + BitCast(m6_vec), + BitCast(m7_vec), + BitCast(m8_vec) + ); + } + else { + ret = simd_pack_b8_b32( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec) + ); + } + } + else { + ret = simd_pack_b8_b16(BitCast(m1_vec), BitCast(m2_vec)); + } + } + else { + ret = BitCast(m1_vec); + } + StoreU(And(ret, truemask), dst); + } + } +#endif + const auto a = op(*src1); + for (; len > 0; --len, ++src2, ++dst) { + const auto b = op(*src2); + *dst = op(a, b); + } +} + +template +inline void binary_scalar2(char **args, size_t len) +{ + OP op; + const T *src1 = reinterpret_cast(args[0]); + const T *src2 = reinterpret_cast(args[1]); + npy_bool *dst = reinterpret_cast(args[2]); +#if NPY_HWY + if constexpr (kSupportLane && sizeof(npy_bool) == sizeof(uint8_t)) { + const int vstep = Lanes(); + const size_t nlanes = Lanes(); + const Vec truemask = Set(uint8_t(0x1)); + const auto b1 = op(Set(T(*src2) )); + Vec ret = Undefined(); + + for (; len >= vstep; len -= vstep, src1 += vstep, dst += vstep) { + auto a1 = op(LoadU(src1 + nlanes * 0)); + auto m1 = op(a1, b1); + auto m1_vec = VecFromMask(m1); + if constexpr (sizeof(T) >= 2) { + auto a2 = op(LoadU(src1 + nlanes * 1)); + auto m2 = op(a2, b1); + auto m2_vec = VecFromMask(m2); + if constexpr (sizeof(T) >= 4) { + auto a3 = op(LoadU(src1 + nlanes * 2)); + auto a4 = op(LoadU(src1 + nlanes * 3)); + auto m3 = op(a3, b1); + auto m4 = op(a4, b1); + auto m3_vec = VecFromMask(m3); + auto m4_vec = VecFromMask(m4); + if constexpr (sizeof(T) == 8) { + auto a5 = op(LoadU(src1 + nlanes * 4)); + auto a6 = op(LoadU(src1 + nlanes * 5)); + auto a7 = op(LoadU(src1 + nlanes * 6)); + auto a8 = op(LoadU(src1 + nlanes * 7)); + auto m5 = op(a5, b1); + auto m6 = op(a6, b1); + auto m7 = op(a7, b1); + auto m8 = op(a8, b1); + auto m5_vec = VecFromMask(m5); + auto m6_vec = VecFromMask(m6); + auto m7_vec = VecFromMask(m7); + auto m8_vec = VecFromMask(m8); + ret = simd_pack_b8_b64( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec), + BitCast(m5_vec), + BitCast(m6_vec), + BitCast(m7_vec), + BitCast(m8_vec) + ); + } + else { + ret = simd_pack_b8_b32( + BitCast(m1_vec), + BitCast(m2_vec), + BitCast(m3_vec), + BitCast(m4_vec) + ); + } + } + else { + ret = simd_pack_b8_b16(BitCast(m1_vec), BitCast(m2_vec)); + } + } + else { + ret = BitCast(m1_vec); + } + StoreU(And(ret, truemask), dst); + } + } +#endif + const auto b = op(*src2); + for (; len > 0; --len, ++src1, ++dst) { + const auto a = op(*src1); + *dst = op(a, b); + } +} + +template +static void cmp_binary_branch(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + +#if NPY_HWY + if (!is_mem_overlap(ip1, is1, op1, os1, n) && + 
!is_mem_overlap(ip2, is2, op1, os1, n)) { + assert(n >= 0); + size_t len = static_cast(n); + // argument one scalar + if (is1 == 0 && is2 == sizeof(T) && os1 == sizeof(npy_bool)) { + binary_scalar1(args, len); + return; + } + // argument two scalar + if ((is1 == sizeof(T) && is2 == 0 && os1 == sizeof(npy_bool))) { + binary_scalar2(args, len); + return; + } + if (is1 == sizeof(T) && is2 == sizeof(T) && os1 == sizeof(npy_bool)) { + binary(args, len); + return; + } + } +#endif + + OP op; + for (npy_intp i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { + const auto a = op(reinterpret_cast(ip1)[0]); + const auto b = op(reinterpret_cast(ip2)[0]); + reinterpret_cast(op1)[0] = op(a, b); + } +} + +template typename OP> +inline void cmp_binary(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + /* + * In order to reduce the size of the binary generated from this source, the + * following rules are applied: 1) each data type implements its function + * 'greater' as a call to the function 'less' but with the arguments swapped, + * the same applies to the function 'greater_equal', which is implemented + * with a call to the function 'less_equal', and 2) for the integer datatypes + * of the same size (eg 8-bit), a single kernel of the functions 'equal' and + * 'not_equal' is used to implement both signed and unsigned types. + */ + constexpr bool kSwapToUnsigned_ = std::is_integral_v && ( + std::is_same_v, OpEq> || + std::is_same_v, OpNe> + ); + using SwapUnsigned_ = std::make_unsigned_t< + std::conditional_t + >; + using TLane_ = std::conditional_t; + using TLaneFixed_ = typename np::meta::FixedWidth::Type; + + using TOperation_ = OP; + using SwapOperationGt_ = std::conditional_t< + std::is_same_v>, + OpLt, TOperation_ + >; + using SwapOperationGe_ = std::conditional_t< + std::is_same_v>, + OpLe, SwapOperationGt_ + >; + using SwapOperationGtBool_ = std::conditional_t< + std::is_same_v>, + OpLtBool, SwapOperationGe_ + >; + using SwapOperation_ = std::conditional_t< + std::is_same_v>, + OpLeBool, SwapOperationGtBool_ + >; + + if constexpr (std::is_same_v) { + cmp_binary_branch(args, dimensions, steps); + } + else { + char *nargs[] = {args[1], args[0], args[2]}; + npy_intp nsteps[] = {steps[1], steps[0], steps[2]}; + cmp_binary_branch(nargs, dimensions, nsteps); + } + + if constexpr (std::is_same_v || + std::is_same_v) { + // clear any FP exceptions + np::FloatStatus(); + } +} +} // namespace anonymous + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +#define UMATH_IMPL_CMP_UFUNC(TYPE, NAME, T, OP) \ + void NPY_CPU_DISPATCH_CURFX(TYPE##_##NAME)(char **args, npy_intp const *dimensions, \ + npy_intp const *steps, void*) \ + { \ + cmp_binary(args, dimensions, steps); \ + } + +#define UMATH_IMPL_CMP_UFUNC_TYPES(NAME, OP, BOOL_OP) \ + UMATH_IMPL_CMP_UFUNC(BOOL, NAME, npy_bool, BOOL_OP) \ + UMATH_IMPL_CMP_UFUNC(UBYTE, NAME, npy_ubyte, OP) \ + UMATH_IMPL_CMP_UFUNC(BYTE, NAME, npy_byte, OP) \ + UMATH_IMPL_CMP_UFUNC(USHORT, NAME, npy_ushort, OP) \ + UMATH_IMPL_CMP_UFUNC(SHORT, NAME, npy_short, OP) \ + UMATH_IMPL_CMP_UFUNC(UINT, NAME, npy_uint, OP) \ + UMATH_IMPL_CMP_UFUNC(INT, NAME, npy_int, OP) \ + UMATH_IMPL_CMP_UFUNC(ULONG, NAME, npy_ulong, OP) \ + UMATH_IMPL_CMP_UFUNC(LONG, NAME, npy_long, OP) \ + UMATH_IMPL_CMP_UFUNC(ULONGLONG, NAME, npy_ulonglong, OP) \ + UMATH_IMPL_CMP_UFUNC(LONGLONG, NAME, npy_longlong, OP) \ + 
UMATH_IMPL_CMP_UFUNC(FLOAT, NAME, npy_float, OP) \ + UMATH_IMPL_CMP_UFUNC(DOUBLE, NAME, npy_double, OP) + +UMATH_IMPL_CMP_UFUNC_TYPES(equal, OpEq, OpEqBool) +UMATH_IMPL_CMP_UFUNC_TYPES(not_equal, OpNe, OpNeBool) +UMATH_IMPL_CMP_UFUNC_TYPES(less, OpLt, OpLtBool) +UMATH_IMPL_CMP_UFUNC_TYPES(less_equal, OpLe, OpLeBool) +UMATH_IMPL_CMP_UFUNC_TYPES(greater, OpGt, OpGtBool) +UMATH_IMPL_CMP_UFUNC_TYPES(greater_equal, OpGe, OpGeBool) \ No newline at end of file
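/*
 * Standalone scalar model (plain C++ with std::array -- not the Highway
 * OrderedDemote2To calls used by simd_pack_b8_b16/b32/b64 above) of the
 * mask-packing step: comparison masks produced on wide lanes are narrowed,
 * in lane order, until every boolean occupies one byte; the real kernels
 * then AND the packed bytes with 0x01 so the stored npy_bool is 0 or 1.
 * This sketch collapses straight to bytes instead of demoting in stages.
 */
#include <array>
#include <cassert>
#include <cstdint>

// Narrow two blocks of N wide masks into one block of 2*N byte masks,
// preserving order -- a scalar analogue of the ordered demote.
template <typename W, std::size_t N>
std::array<std::uint8_t, 2 * N> pack(const std::array<W, N> &lo,
                                     const std::array<W, N> &hi)
{
    std::array<std::uint8_t, 2 * N> out{};
    for (std::size_t i = 0; i < N; ++i) {
        out[i]     = lo[i] ? 0xFF : 0x00;
        out[N + i] = hi[i] ? 0xFF : 0x00;
    }
    return out;
}

int main() {
    // Two "vectors" of 64-bit comparison masks (2 lanes each on a 128-bit
    // register) collapse into one 4-lane byte mask, keeping lane order.
    std::array<std::uint64_t, 2> c1{~0ull, 0}, c2{0, ~0ull};
    const auto bytes = pack(c1, c2);
    assert(bytes[0] == 0xFF && bytes[1] == 0x00);
    assert(bytes[2] == 0x00 && bytes[3] == 0xFF);
    return 0;
}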
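/*
 * Standalone check (plain C++ on scalars -- not the SIMD code itself) of
 * the boolean identities that both the old npyv b8 kernels and the new
 * Op*Bool functors rely on. Each lane is first reduced to a zero-mask
 * a = (x == 0), b = (y == 0); comparing the logical values (x != 0) and
 * (y != 0) then becomes pure bitwise logic on those masks:
 *   ==  ->  XNOR(a, b)     !=  ->  XOR(a, b)
 *   <   ->  a AND NOT b    <=  ->  a OR NOT b
 */
#include <cassert>

int main() {
    for (int x : {0, 1, 7}) {           // any non-zero value counts as true
        for (int y : {0, 1, 7}) {
            const bool a = (x == 0), b = (y == 0);    // zero-masks
            const bool lx = (x != 0), ly = (y != 0);  // logical lane values
            assert((lx == ly) == !(a ^ b));   // equal      -> xnor
            assert((lx != ly) ==  (a ^ b));   // not_equal  -> xor
            assert((lx <  ly) == (a && !b));  // less       -> and-not
            assert((lx <= ly) == (a || !b));  // less_equal -> or-not
        }
    }
    return 0;
}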
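/*
 * Standalone model (plain C++ with fixed strides -- not the ufunc stride
 * machinery or cmp_binary_branch itself) of how the dispatcher picks a
 * kernel: a step of 0 on an input means that operand is a broadcast scalar
 * (binary_scalar1 / binary_scalar2), a step equal to sizeof(T) on every
 * array means fully contiguous data (binary), and anything else falls
 * through to the generic strided loop. pick_kernel is a hypothetical name.
 */
#include <cstddef>
#include <cstdio>

enum class Kernel { Scalar1, Scalar2, Contiguous, Strided };

template <typename T>
Kernel pick_kernel(std::ptrdiff_t is1, std::ptrdiff_t is2, std::ptrdiff_t os1)
{
    const std::ptrdiff_t elem  = sizeof(T);
    const std::ptrdiff_t obool = sizeof(unsigned char);  // npy_bool output
    if (is1 == 0    && is2 == elem && os1 == obool) return Kernel::Scalar1;
    if (is1 == elem && is2 == 0    && os1 == obool) return Kernel::Scalar2;
    if (is1 == elem && is2 == elem && os1 == obool) return Kernel::Contiguous;
    return Kernel::Strided;
}

int main() {
    // e.g. `arr < 3.0f` on a contiguous float array: operand two is scalar.
    std::printf("%d\n", static_cast<int>(pick_kernel<float>(4, 0, 1)));  // Scalar2
    std::printf("%d\n", static_cast<int>(pick_kernel<float>(4, 4, 1)));  // Contiguous
    std::printf("%d\n", static_cast<int>(pick_kernel<float>(8, 4, 1)));  // Strided
    return 0;
}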
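/*
 * Standalone sketch (plain C++ on scalars) of the two binary-size-reduction
 * rules stated in the comment block above cmp_binary():
 *   1) greater / greater_equal get no dedicated kernels; the dispatcher
 *      swaps the two operands and calls less / less_equal instead.
 *   2) equal / not_equal on signed integers reuse the unsigned kernel of
 *      the same width, because equality depends only on the bit pattern.
 * The helper names below are illustrative, not NumPy functions.
 */
#include <cassert>
#include <cstdint>
#include <cstring>

static bool less_u32(std::uint32_t a, std::uint32_t b)       { return a <  b; }
static bool less_equal_u32(std::uint32_t a, std::uint32_t b) { return a <= b; }

int main() {
    // Rule 1: a > b  <=>  b < a, and a >= b  <=>  b <= a (argument swap).
    std::uint32_t a = 5, b = 3;
    assert((a >  b) == less_u32(b, a));
    assert((a >= b) == less_equal_u32(b, a));

    // Rule 2: signed (in)equality matches unsigned (in)equality of the
    // same bit pattern, so one u32 kernel also serves s32.
    std::int32_t  sa = -1, sb = -1, sc = 7;
    std::uint32_t ua, ub, uc;
    std::memcpy(&ua, &sa, sizeof ua);
    std::memcpy(&ub, &sb, sizeof ub);
    std::memcpy(&uc, &sc, sizeof uc);
    assert((sa == sb) == (ua == ub));
    assert((sa != sc) == (ua != uc));
    return 0;
}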