ENH: use an indexed loop if possible in ufunc_at

numpy · seberg · Feb 7, 2023 · Jan 18, 2023 · Jan 22, 2023 · Jan 23, 2023
commit eb21b25093c79ac0bc135aed3d192c06079c5bbf
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
@@ -914,8 +914,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
     int nin = ufunc->nin, nargs = ufunc->nargs;
 
     /*
-     * Get the actual DTypes we operate with by mixing the operand array
-     * ones with the passed signature.
+     * Get the actual DTypes we operate with by setting op_dtypes[i] from
+     * signature[i].
      */
     for (int i = 0; i < nargs; i++) {
         if (signature[i] != NULL) {

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -547,7 +547,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
     char *ip1 = args[0];
     npy_intp *indx = (npy_intp *)args[1];
     char *value = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2];
     npy_intp n = dimensions[0];
     npy_intp i;
     @type@ *indexed;
@@ -1559,7 +1559,7 @@ LONGDOUBLE_@kind@_indexed(char **args, npy_intp const *dimensions, npy_intp cons
     char *ip1 = args[0];
     npy_intp *indx = (npy_intp *)args[1];
     char *value = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2];
     npy_intp n = dimensions[0];
     npy_intp i;
     npy_longdouble *indexed;
@@ -1681,7 +1681,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context), char **args, npy_intp const *dime
     char *ip1 = args[0];
     npy_intp *indx = (npy_intp *)args[1];
     char *value = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2];
     npy_intp n = dimensions[0];
     npy_intp i;
     npy_half *indexed;

diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -553,7 +553,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
     char *ip1 = args[0];
     npy_intp *indx = (npy_intp *)args[1];
     char *value = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2];
     npy_intp n = dimensions[0];
     npy_intp i;
     @type@ *indexed;

diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -402,7 +402,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
     char *ip1 = args[0];
     npy_intp *indx = (npy_intp *)args[1];
     char *value = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2];
     npy_intp n = dimensions[0];
     npy_intp i;
     @type@ *indexed;
@@ -488,7 +488,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
     char *ip1 = args[0];
     npy_intp *indx = (npy_intp *)args[1];
     char *value = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2];
     npy_intp n = dimensions[0];
     npy_intp i;
     @type@ *indexed;

diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
@@ -453,7 +453,7 @@ static inline int
 @name@_ctype_divide(@type@ a, @type@ b, @type@ *out)
 {
     char *args[3] = {(char *)&a, (char *)&b, (char *)out};
-    npy_intp steps[3];
+    npy_intp steps[3] = {0, 0, 0};
     npy_intp size = 1;
     @TYPE@_divide(args, &size, steps, NULL);
     return 0;

diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
@@ -5893,13 +5893,45 @@ new_array_op(PyArrayObject *op_array, char *data)
     return (PyArrayObject *)r;
 }
 
+/*
+ * If operands are 1D we can use the indexed loop to do the work
+ * Returns 0 if successful, -1 otherwise
+ */
+static int
+try_trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags,
+                    PyArrayMapIterObject *iter,
+                    PyArrayObject *op1_array, PyArrayObject *op2_array,
+                    PyArrayMethod_Context *context)
+{
+    if (PyArray_NDIM(op1_array) != 1) {
+        return -1;
+    }
+    /* check that nop == 3 */
+    if (ufuncimpl->nin >1 && PyArray_NDIM(op2_array) != 1) {
+        return -1;
+    }
+    if (iter->numiter > 1) {
+        return -1;
+    }
+    char *args[3];
+    npy_intp dimensions = iter->size;
+    npy_intp steps[3];
+    args[0] = (char *) iter->baseoffset;
+    args[1] = (char *) iter->outer_ptrs[0];
+    args[2] = (char *)PyArray_DATA(op2_array);
+    steps[0] = iter->fancy_strides[0];
+    steps[1] = iter->outer_strides[0];
+    steps[2] = PyArray_STRIDE(op2_array, 0);
+    ufuncimpl->contiguous_indexed_loop(context, args, &dimensions, steps, NULL);
+    return 0;
+}
+
 static int
 ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
                     PyArrayMapIterObject *iter, PyArrayIterObject *iter2,
                     PyArrayObject *op1_array, PyArrayObject *op2_array,
                     PyArrayMethod_StridedLoop *strided_loop,
                     PyArrayMethod_Context *context,
-                    npy_intp strides[3],
                     NpyAuxData *auxdata
                     )
 {
@@ -5921,6 +5953,7 @@ ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
         NPY_BEGIN_THREADS;
     }
 
+    npy_intp strides[3] = {0, 0, 0};
     /*
      * Iterate over first and second operands and call ufunc
      * for each pair of inputs
@@ -5972,7 +6005,6 @@ ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
                     PyArray_Descr *operation_descrs[3],
                     PyArrayMethod_StridedLoop *strided_loop,
                     PyArrayMethod_Context *context,
-                    npy_intp strides[3],
                     NpyAuxData *auxdata
                     )
 {
@@ -6070,6 +6102,7 @@ ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
         npy_clear_floatstatus_barrier((char*)&iter);
     }
 
+    npy_intp strides[3] = {0, 0, 0};
     if (!needs_api) {
         NPY_BEGIN_THREADS;
     }
@@ -6305,7 +6338,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
         }
     }
 
-    /* 
+    /*
      * Create a map iterator (there is no multi-mapiter, so we need at least 2
      * iterators: one for the mapping, and another for the second operand)
      */
@@ -6383,11 +6416,25 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
         }
     }
     if (fast_path == 1) {
-    res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array, op2_array,
-                                  strided_loop, &context, strides, auxdata);
+        /*
+         * Try to use trivial loop (1d, no casting, aligned) if
+         * - the matching info has a indexed loop
+         * - all operands are 1d
+         */
+        if (ufuncimpl->contiguous_indexed_loop != NULL) {
+            res = try_trivial_at_loop(ufuncimpl, flags, iter, op1_array,
+                        op2_array, &context);
+
+        }
+        if (res == -1) {
+            /* Couldn't use the fastest path, use the faster path */
+            res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array,
+                        op2_array, strided_loop, &context, auxdata);
+        }
     } else {
-        res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array, op2_array,
-                                  operation_descrs, strided_loop, &context, strides, auxdata);
+        res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array,
+                        op2_array, operation_descrs, strided_loop, &context,
+                        auxdata);
     }
 
 fail: