ENH: add indexed lower loops and connect them to ufuncs

numpy · seberg · Feb 7, 2023 · Jan 18, 2023 · Jan 22, 2023 · Jan 23, 2023
commit 7910e2bf14b8bef95c3180fbf667458e29426a0e
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -1,10 +1,16 @@
+"""
+Generate the code to build all the internal ufuncs. At the base is the defdict:
+a dictionary ofUfunc classes. This is fed to make_code to generate
+__umath_generated.c
+"""
 import os
 import re
 import struct
 import sys
 import textwrap
 import argparse
 
+# identity objects
 Zero = "PyLong_FromLong(0)"
 One = "PyLong_FromLong(1)"
 True_ = "(Py_INCREF(Py_True), Py_True)"
@@ -125,7 +131,7 @@ def check_td_order(tds):
         _check_order(signatures[prev_i], sign)
 
 
-_fdata_map = dict(
+_floatformat_map = dict(
     e='npy_%sf',
     f='npy_%sf',
     d='npy_%s',
@@ -136,11 +142,13 @@ def check_td_order(tds):
 )
 
 def build_func_data(types, f):
-    func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
+    func_data = [_floatformat_map.get(t, '%s') % (f,) for t in types]
     return func_data
 
 def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
        simd=None, dispatch=None):
+    """Generate a TypeDescription instance for each item in types
+    """
     if f is not None:
         if isinstance(f, str):
             func_data = build_func_data(types, f)
@@ -188,12 +196,15 @@ class Ufunc:
     ----------
     nin : number of input arguments
     nout : number of output arguments
-    identity : identity element for a two-argument function
+    identity : identity element for a two-argument function (like Zero)
     docstring : docstring for the ufunc
-    type_descriptions : list of TypeDescription objects
+    typereso: type resolver function of type PyUFunc_TypeResolutionFunc
+    type_descriptions : TypeDescription objects
+    signature: a generalized ufunc signature (like for matmul)
+    indexed: add indexed loops (ufunc.at) for these type characters
     """
     def __init__(self, nin, nout, identity, docstring, typereso,
-                 *type_descriptions, signature=None):
+                 *type_descriptions, signature=None, indexed=''):
         self.nin = nin
         self.nout = nout
         if identity is None:
@@ -203,6 +214,7 @@ def __init__(self, nin, nout, identity, docstring, typereso,
         self.typereso = typereso
         self.type_descriptions = []
         self.signature = signature
+        self.indexed = indexed
         for td in type_descriptions:
             self.type_descriptions.extend(td)
         for td in self.type_descriptions:
@@ -347,6 +359,7 @@ def english_upper(s):
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
           ],
           TD(O, f='PyNumber_Add'),
+          indexed=flts + ints
           ),
 'subtract':
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
@@ -359,6 +372,7 @@ def english_upper(s):
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
           ],
           TD(O, f='PyNumber_Subtract'),
+          indexed=flts + ints
           ),
 'multiply':
     Ufunc(2, 1, One,
@@ -374,6 +388,7 @@ def english_upper(s):
            TypeDescription('m', FullTypeDescr, 'dm', 'm'),
           ],
           TD(O, f='PyNumber_Multiply'),
+          indexed=flts + ints
           ),
 #'true_divide' : aliased to divide in umathmodule.c:initumath
 'floor_divide':
@@ -388,6 +403,7 @@ def english_upper(s):
            TypeDescription('m', FullTypeDescr, 'mm', 'q'),
           ],
           TD(O, f='PyNumber_FloorDivide'),
+          indexed=flts + ints
           ),
 'divide':
     Ufunc(2, 1, None, # One is only a unit to the right, not the left
@@ -1255,6 +1271,40 @@ def make_ufuncs(funcdict):
         if uf.typereso is not None:
             mlist.append(
                 r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso)
+        for c in uf.indexed:
+            # Handle indexed loops by getting the underlying ArrayMethodObject
+            # from the list in f._loops and setting its field appropriately
+            fmt = textwrap.dedent("""
+            {{
+                PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum({typenum});
+                PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, {count});
+                if (info == NULL) {{
+                    return -1;
+                }}
+                if (info == Py_None) {{
+                    PyErr_SetString(PyExc_RuntimeError,
+                        "cannot add indexed loop to ufunc {name} with {typenum}");
+                    return -1;
+                }}
+                if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {{
+                    PyErr_SetString(PyExc_RuntimeError,
+                        "Not a PyArrayMethodObject in ufunc {name} with {typenum}");
+                }}
+                ((PyArrayMethodObject*)info)->contiguous_indexed_loop = {funcname};
+                /* info is borrowed, no need to decref*/
+            }}    
+            """)
+            cname = name
+            if cname == "floor_divide":
+                # XXX there must be a better way...
+                cname = "divide"
+            mlist.append(fmt.format(
+                typenum=f"NPY_{english_upper(chartoname[c])}",
+                count=uf.nin+uf.nout,
+                name=name,
+                funcname = f"{english_upper(chartoname[c])}_{cname}_indexed",
+            ))
+
         mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name)
         mlist.append(r"""Py_DECREF(f);""")
         code3list.append('\n'.join(mlist))
@@ -1277,8 +1327,46 @@ def make_code(funcdict, filename):
     #include "loops.h"
     #include "matmul.h"
     #include "clip.h"
+    #include "dtypemeta.h"
     #include "_umath_doc_generated.h"
+
     %s
+    /*
+     * Return the PyArrayMethodObject or PyCapsule that matches a registered
+     * tuple of identical dtypes. Return a borrowed ref of the first match.
+     */
+    static PyObject *
+    get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, int ndtypes)
+    {
+
+        PyObject *t_dtypes = PyTuple_New(ndtypes);
+        if (t_dtypes == NULL) {
+            return NULL;
+        }
+        for (int i=0; i < ndtypes; i++) {
+            PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype);
+        }
+        PyObject *loops = ufunc->_loops;
+        Py_ssize_t length = PyList_Size(loops);
+        for (Py_ssize_t i = 0; i < length; i++) {
+            PyObject *item = PyList_GetItem(loops, i);
+            PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0);
+            int cmp = PyObject_RichCompareBool(cur_DType_tuple, t_dtypes, Py_EQ);
+            if (cmp < 0) {
+                Py_DECREF(t_dtypes);
+                return NULL;
+            }
+            if (cmp == 0) {
+                continue;
+            }
+            /* Got the match */
+            Py_DECREF(t_dtypes);
+            return PyTuple_GetItem(item, 1); 
+        }
+        Py_DECREF(t_dtypes);
+        Py_RETURN_NONE;
+    }
+
 
     static int
     InitOperators(PyObject *dictionary) {

diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src
@@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
         npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b);
         npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d);
         npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd));
-        if (nnan != ((1LL << vstep) - 1)) {
+        if ((long long int)nnan != ((1LL << vstep) - 1)) {
             npy_uint64 nnan_4[4];
             nnan_4[0] = npyv_tobits_@bsfx@(nnan_a);
             nnan_4[1] = npyv_tobits_@bsfx@(nnan_b);
@@ -219,7 +219,7 @@ simd_@func@_@sfx@(npy
F438
v_lanetype_@sfx@ *ip, npy_intp len)
     #if @is_fp@
         npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
         npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a);
-        if (nnan != ((1LL << vstep) - 1)) {
+        if ((long long int)nnan != ((1LL << vstep) - 1)) {
             for (int vi = 0; vi < vstep; ++vi) {
                 if (!((nnan >> vi) & 1)) {
                     return i + vi;

diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c
@@ -291,6 +291,9 @@ fill_arraymethod_from_slots(
             case NPY_METH_get_reduction_initial:
                 meth->get_reduction_initial = slot->pfunc;
                 continue;
+            case NPY_METH_contiguous_indexed_loop:
+                meth->contiguous_indexed_loop = slot->pfunc;
+                continue;
             default:
                 break;
         }
@@ -891,7 +894,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context,
 /*
  * Fetches a strided-loop function that supports a boolean mask as additional
  * (last) operand to the strided-loop.  It is otherwise largely identical to
- * the `get_loop` method which it wraps.
+ * the `get_strided_loop` method which it wraps.
  * This is the core implementation for the ufunc `where=...` keyword argument.
  *
  * NOTE: This function does not support `move_references` or inner dimensions.

diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
@@ -73,7 +73,7 @@ struct PyArrayMethodObject_tag;
  * TODO: Before making this public, we should review which information should
  *       be stored on the Context/BoundArrayMethod vs. the ArrayMethod.
  */
-typedef struct {
+typedef struct PyArrayMethod_Context_t
10000
ag {
     PyObject *caller;  /* E.g. the original ufunc, may be NULL */
     struct PyArrayMethodObject_tag *method;
 
@@ -223,6 +223,7 @@ typedef struct PyArrayMethodObject_tag {
     PyArrayMethod_StridedLoop *contiguous_loop;
     PyArrayMethod_StridedLoop *unaligned_strided_loop;
     PyArrayMethod_StridedLoop *unaligned_contiguous_loop;
+    PyArrayMethod_StridedLoop *contiguous_indexed_loop;
     /* Chunk only used for wrapping array method defined in umath */
     struct PyArrayMethodObject_tag *wrapped_meth;
     PyArray_DTypeMeta **wrapped_dtypes;
@@ -266,6 +267,7 @@ extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
 #define NPY_METH_contiguous_loop 5
 #define NPY_METH_unaligned_strided_loop 6
 #define NPY_METH_unaligned_contiguous_loop 7
+#define NPY_METH_contiguous_indexed_loop 8
 
 
 /*

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -541,15 +541,21 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
     }
 }
 
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@_indexed(char const *ip1, npy_intp const *indx, char *value, 
-                      npy_intp const *dimensions, npy_intp const *steps) {
+NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
+@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    npy_intp *indx = (npy_intp *)args[1];
+    char *value = args[2];
     npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
     npy_intp n = dimensions[0];
     npy_intp i;
+    @type@ *indexed;
     for(i = 0; i < n; i++, indx += is2, value += os1) {
-        @type@ op1 = *(@type@ *)(ip1 + is1 * indx[0]) +  value[0];
+        indexed = (@type@ *)(ip1 + is1 * indx[0]);
+        indexed[0] = indexed[0] @OP@ *(@type@ *)value;
     }
+    return 0;
 }
 
 #endif
@@ -1546,6 +1552,23 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps
         }
     }
 }
+
+NPY_NO_EXPORT void
+LONGDOUBLE_@kind@_indexed(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    npy_intp *indx = (npy_intp *)args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_longdouble *indexed;
+    for(i = 0; i < n; i++, indx += is2, value += os1) {
+        indexed = (npy_longdouble *)(ip1 + is1 * indx[0]);
+        indexed[0] = indexed[0] @OP@ *(npy_longdouble *)value;
+    }
+}
+
 /**end repeat**/
 
 /**begin repeat
@@ -1651,6 +1674,24 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
         }
     }
 }
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(void *NPY_UNUSED(context), char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    npy_intp *indx = (npy_intp *)args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_half *indexed;
+    for(i = 0; i < n; i++, indx += is2, value += os1) {
+        indexed = (npy_half *)(ip1 + is1 * indx[0]);
+        const float v = npy_half_to_float(*(npy_half *)value);
+        indexed[0] = npy_float_to_half(npy_half_to_float(indexed[0]) @OP@ v);
+    }
+    return 0; 
+}
 /**end repeat**/
 
 #define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))

diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
@@ -33,6 +33,9 @@
 // #define BOOL_fmax BOOL_maximum
 // #define BOOL_fmin BOOL_minimum
 
+typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context;
+typedef struct NpyAuxData_tag NpyAuxData;
+
 #ifndef NPY_DISABLE_OPTIMIZATION
     #include "loops_comparison.dispatch.h"
 #endif
@@ -81,6 +84,11 @@ BOOL_@kind@(char **args
57AE
, npy_intp const *dimensions, npy_intp const *steps, void
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
 /**end repeat**/
 
 #ifndef NPY_DISABLE_OPTIMIZATION
@@ -163,6 +171,9 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
+NPY_NO_EXPORT int
+@S@@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
 /**end repeat3**/
 
 /**begin repeat3
@@ -285,10 +296,13 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
 /**begin repeat1
  * Arithmetic
  * # kind = add, subtract, multiply, divide#
-
8C24
 * # OP = +, -, *, /#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
 /**end repeat1**/
 /**end repeat**/
 
@@ -424,6 +438,11 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
  */
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat1**/
 /**end repeat1**/
 
 /**begin repeat1