numpy · seberg · Feb 7, 2023 · Jan 18, 2023 · Jan 22, 2023 · Jan 23, 2023
diff --git a/benchmarks/benchmarks/bench_ufunc_strides.py b/benchmarks/benchmarks/bench_ufunc_strides.py
@@ -150,7 +150,7 @@ def g(self,z,c):
         return np.sum(np.multiply(z,z) + c)
 
     def mandelbrot_numpy(self, c, maxiter):
-        output = np.zeros(c.shape, np.int)
+        output = np.zeros(c.shape, np.int32)
         z = np.empty(c.shape, np.complex64)
         for it in range(maxiter):
             notdone = self.f(z)

diff --git a/doc/release/upcoming_changes/23136.performance.rst b/doc/release/upcoming_changes/23136.performance.rst
@@ -0,0 +1,17 @@
+``ufunc.at`` can be much faster
+-------------------------------
+Generic ``ufunc.at`` can be up to 9x faster. The conditions for this speedup:
+
+- operands are aligned
+- no casting
+
+If ufuncs with appropriate indexed loops on 1d arguments with the above
+conditions, ``ufunc.at`` can be up to 60x faster (an additional 7x speedup).
+Appropriate indexed loops have been added to ``add``, ``subtract``,
+``multiply``, ``divide`` (and ``floor_divide``)
+
+The internal logic is similar to the logic used for regular ufuncs, which also
+have fast paths.
+
+Thanks to the `D. E. Shaw group <https://deshaw.com/>`_ for sponsoring this
+work.
diff --git a/numpy/conftest.py b/numpy/conftest.py
@@ -40,6 +40,8 @@
     "numpy-profile" if os.path.isfile(_pytest_ini) else "np.test() profile"
 )
 
+# The experimentalAPI is used in _umath_tests
+os.environ["NUMPY_EXPERIMENTAL_DTYPE_API"] = "1"
 
 def pytest_configure(config):
     config.addinivalue_line("markers",

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -1,10 +1,16 @@
+"""
+Generate the code to build all the internal ufuncs. At the base is the defdict:
+a dictionary ofUfunc classes. This is fed to make_code to generate
+__umath_generated.c
+"""
 import os
 import re
 import struct
 import sys
 import textwrap
 import argparse
 
+# identity objects
 Zero = "PyLong_FromLong(0)"
 One = "PyLong_FromLong(1)"
 True_ = "(Py_INCREF(Py_True), Py_True)"
@@ -121,11 +127,11 @@ def check_td_order(tds):
     for prev_i, sign in enumerate(signatures[1:]):
         if sign in signatures[:prev_i+1]:
             continue  # allow duplicates...
-        
+
         _check_order(signatures[prev_i], sign)
 
 
-_fdata_map = dict(
+_floatformat_map = dict(
     e='npy_%sf',
     f='npy_%sf',
     d='npy_%s',
@@ -136,11 +142,13 @@ def check_td_order(tds):
 )
 
 def build_func_data(types, f):
-    func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
+    func_data = [_floatformat_map.get(t, '%s') % (f,) for t in types]
     return func_data
 
 def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
        simd=None, dispatch=None):
+    """Generate a TypeDescription instance for each item in types
+    """
     if f is not None:
         if isinstance(f, str):
             func_data = build_func_data(types, f)
@@ -188,12 +196,15 @@ class Ufunc:
     ----------
     nin : number of input arguments
     nout : number of output arguments
-    identity : identity element for a two-argument function
+    identity : identity element for a two-argument function (like Zero)
     docstring : docstring for the ufunc
-    type_descriptions : list of TypeDescription objects
+    typereso: type resolver function of type PyUFunc_TypeResolutionFunc
+    type_descriptions : TypeDescription objects
+    signature: a generalized ufunc signature (like for matmul)
+    indexed: add indexed loops (ufunc.at) for these type characters
     """
     def __init__(self, nin, nout, identity, docstring, typereso,
-                 *type_descriptions, signature=None):
+                 *type_descriptions, signature=None, indexed=''):
         self.nin = nin
         self.nout = nout
         if identity is None:
@@ -203,6 +214,7 @@ def __init__(self, nin, nout, identity, docstring, typereso,
         self.typereso = typereso
         self.type_descriptions = []
         self.signature = signature
+        self.indexed = indexed
         for td in type_descriptions:
             self.type_descriptions.extend(td)
         for td in self.type_descriptions:
@@ -347,6 +359,7 @@ def english_upper(s):
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
           ],
           TD(O, f='PyNumber_Add'),
+          indexed=flts + ints
           ),
 'subtract':
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
@@ -359,6 +372,7 @@ def english_upper(s):
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
           ],
           TD(O, f='PyNumber_Subtract'),
+          indexed=flts + ints
           ),
 'multiply':
     Ufunc(2, 1, One,
@@ -374,6 +388,7 @@ def english_upper(s):
            TypeDescription('m', FullTypeDescr, 'dm', 'm'),
           ],
           TD(O, f='PyNumber_Multiply'),
+          indexed=flts + ints
           ),
 #'true_divide' : aliased to divide in umathmodule.c:initumath
 'floor_divide':
@@ -388,6 +403,7 @@ def english_upper(s):
            TypeDescription('m', FullTypeDescr, 'mm', 'q'),
           ],
           TD(O, f='PyNumber_FloorDivide'),
+          indexed=flts + ints
           ),
 'divide':
     Ufunc(2, 1, None, # One is only a unit to the right, not the left
@@ -399,6 +415,7 @@ def english_upper(s):
            TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
           ],
           TD(O, f='PyNumber_TrueDivide'),
+          indexed=flts
           ),
 'conjugate':
     Ufunc(1, 1, None,
@@ -1255,6 +1272,40 @@ def make_ufuncs(funcdict):
         if uf.typereso is not None:
             mlist.append(
                 r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso)
+        for c in uf.indexed:
+            # Handle indexed loops by getting the underlying ArrayMethodObject
+            # from the list in f._loops and setting its field appropriately
+            fmt = textwrap.dedent("""
+            {{
+                PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum({typenum});
+                PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                                   dtype, {count});
+                if (info == NULL) {{
+                    return -1;
+                }}
+                if (info == Py_None) {{
+                    PyErr_SetString(PyExc_RuntimeError,
+                        "cannot add indexed loop to ufunc "
+                        "{name} with {typenum}");
+                    return -1;
+                }}
+                if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {{
+                    PyErr_SetString(PyExc_RuntimeError,
+                        "Not a PyArrayMethodObject in ufunc "
+                        "{name} with {typenum}");
+                }}
+                ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                                 {funcname};
+                /* info is borrowed, no need to decref*/
+            }}
+            """)
+            mlist.append(fmt.format(
+                typenum=f"NPY_{english_upper(chartoname[c])}",
+                count=uf.nin+uf.nout,
+                name=name,
+                funcname = f"{english_upper(chartoname[c])}_{name}_indexed",
+            ))
+
         mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name)
         mlist.append(r"""Py_DECREF(f);""")
         code3list.append('\n'.join(mlist))
@@ -1277,8 +1328,14 @@ def make_code(funcdict, filename):
     #include "loops.h"
     #include "matmul.h"
     #include "clip.h"
+    #include "dtypemeta.h"
     #include "_umath_doc_generated.h"
+
     %s
+    /* Returns a borrowed ref of the second value in the matching info tuple */
+    PyObject *
+    get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
+                     int ndtypes);
 
     static int
     InitOperators(PyObject *dictionary) {

diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h
@@ -311,6 +311,7 @@ typedef NPY_CASTING (resolve_descriptors_function)(
 #define NPY_METH_contiguous_loop 5
 #define NPY_METH_unaligned_strided_loop 6
 #define NPY_METH_unaligned_contiguous_loop 7
+#define NPY_METH_contiguous_indexed_loop 8
 
 
 typedef struct {
@@ -488,7 +489,7 @@ PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType)
  */
 #if !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY)
 
-#define __EXPERIMENTAL_DTYPE_VERSION 6
+#define __EXPERIMENTAL_DTYPE_VERSION 7
 
 static int
 import_experimental_dtype_api(int version)

diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src
@@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
         npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b);
         npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d);
         npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd));
-        if (nnan != ((1LL << vstep) - 1)) {
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
             npy_uint64 nnan_4[4];
             nnan_4[0] = npyv_tobits_@bsfx@(nnan_a);
             nnan_4[1] = npyv_tobits_@bsfx@(nnan_b);
@@ -219,7 +219,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
     #if @is_fp@
         npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
         npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a);
-        if (nnan != ((1LL << vstep) - 1)) {
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
             for (int vi = 0; vi < vstep; ++vi) {
                 if (!((nnan >> vi) & 1)) {
                     return i + vi;

diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c
@@ -291,6 +291,9 @@ fill_arraymethod_from_slots(
             case NPY_METH_get_reduction_initial:
                 meth->get_reduction_initial = slot->pfunc;
                 continue;
+            case NPY_METH_contiguous_indexed_loop:
+                meth->contiguous_indexed_loop = slot->pfunc;
+                continue;
             default:
                 break;
         }
@@ -891,7 +894,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context,
 /*
  * Fetches a strided-loop function that supports a boolean mask as additional
  * (last) operand to the strided-loop.  It is otherwise largely identical to
- * the `get_loop` method which it wraps.
+ * the `get_strided_loop` method which it wraps.
  * This is the core implementation for the ufunc `where=...` keyword argument.
  *
  * NOTE: This function does not support `move_references` or inner dimensions.

diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
@@ -73,7 +73,7 @@ struct PyArrayMethodObject_tag;
  * TODO: Before making this public, we should review which information should
  *       be stored on the Context/BoundArrayMethod vs. the ArrayMethod.
  */
-typedef struct {
+typedef struct PyArrayMethod_Context_tag {
     PyObject *caller;  /* E.g. the original ufunc, may be NULL */
     struct PyArrayMethodObject_tag *method;
 
@@ -223,6 +223,7 @@ typedef struct PyArrayMethodObject_tag {
     PyArrayMethod_StridedLoop *contiguous_loop;
     PyArrayMethod_StridedLoop *unaligned_strided_loop;
     PyArrayMethod_StridedLoop *unaligned_contiguous_loop;
+    PyArrayMethod_StridedLoop *contiguous_indexed_loop;
     /* Chunk only used for wrapping array method defined in umath */
     struct PyArrayMethodObject_tag *wrapped_meth;
     PyArray_DTypeMeta **wrapped_dtypes;
@@ -266,6 +267,7 @@ extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
 #define NPY_METH_contiguous_loop 5
 #define NPY_METH_unaligned_strided_loop 6
 #define NPY_METH_unaligned_contiguous_loop 7
+#define NPY_METH_contiguous_indexed_loop 8
 
 
 /*

diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c
@@ -16,7 +16,7 @@
 #include "common_dtype.h"
 
 
-#define EXPERIMENTAL_DTYPE_API_VERSION 6
+#define EXPERIMENTAL_DTYPE_API_VERSION 7
 
 
 typedef struct{