From 90243f53f8efa200e22ee13ef1f0a8e1120b1c3a Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 18 Jan 2023 08:17:59 +0200 Subject: [PATCH 01/20] ENH: add indexed loops --- numpy/core/src/umath/loops.c.src | 12 ++++++++++++ numpy/core/src/umath/loops_arithm_fp.dispatch.c.src | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 2b70a4b9a484..085712f380e3 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -540,6 +540,18 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); } } + +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_@kind@@isa@_indexed(char const *ip1, npy_intp const *indx, char *value, + npy_intp const *dimensions, npy_intp const *steps) { + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + for(i = 0; i < n; i++, indx += is2, value += os1) { + @type@ op1 = *(@type@ *)(ip1 + is1 * indx[0]) + value[0]; + } +} + #endif /**end repeat2**/ diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index c1bfaa63a9a7..1436c46f4779 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -546,6 +546,18 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) } } } + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(char const *ip1, npy_intp const *indx, char *value, + npy_intp const *dimensions, npy_intp const *steps) { + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + for(i = 0; i < n; i++, indx += is2, value += os1) { + @type@ op1 = *(@type@ *)(ip1 + is1 * indx[0]) + value[0]; + } +} + /**end repeat1**/ /**end repeat**/ From 3ba259d20ec57052ee8df915eab308370e4a7bae Mon Sep 17 00:00:00 2001 From: mattip Date: Sun, 22 Jan 2023 21:55:07 +0200 Subject: [PATCH 02/20] refactor ufunc_at to first get the inner loop, then create the iterators --- numpy/core/src/umath/ufunc_object.c | 81 ++++++++++++++--------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 792a373a545b..8739a5095996 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -6238,49 +6238,11 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) } } - /* - * Create a map iterator (there is no multi-mapiter, so we need at least 2 - * iterators: one for the mapping, and another for the second operand) - */ - iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap( - op1_array, idx, 1, op2_array); - if (iter == NULL) { - goto fail; - } - op1_array = iter->array; /* May be updateifcopied on overlap */ - - if (op2_array != NULL) { - /* - * May need to swap axes so that second operand is - * iterated over correctly - */ - if ((iter->subspace != NULL) && (iter->consec)) { - PyArray_MapIterSwapAxes(iter, &op2_array, 0); - if (op2_array == NULL) { - /* only on memory allocation failure */ - goto fail; - } - } - - /* - * Create array iter object for second operand that - * "matches" the map iter object for the first operand. - * Then we can just iterate over the first and second - * operands at the same time and not have to worry about - * picking the correct elements from each operand to apply - * the ufunc to. 
- */ - if ((iter2 = (PyArrayIterObject *)\ - PyArray_BroadcastToShape((PyObject *)op2_array, - iter->dimensions, iter->nd))==NULL) { - goto fail; - } - } - PyArrayMethodObject *ufuncimpl = NULL; - { /* Do all the dtype handling and find the correct ufuncimpl */ + Py_INCREF(PyArray_DESCR(op1_array)); + PyArrayObject *tmp_operands[3] = {NULL, NULL, NULL}; PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; @@ -6343,7 +6305,44 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) } } - Py_INCREF(PyArray_DESCR(op1_array)); + /* + * Create a map iterator (there is no multi-mapiter, so we need at least 2 + * iterators: one for the mapping, and another for the second operand) + */ + iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap( + op1_array, idx, 1, op2_array); + if (iter == NULL) { + goto fail; + } + op1_array = iter->array; /* May be updateifcopied on overlap */ + + if (op2_array != NULL) { + /* + * May need to swap axes so that second operand is + * iterated over correctly + */ + if ((iter->subspace != NULL) && (iter->consec)) { + PyArray_MapIterSwapAxes(iter, &op2_array, 0); + if (op2_array == NULL) { + /* only on memory allocation failure */ + goto fail; + } + } + + /* + * Create array iter object for second operand that + * "matches" the map iter object for the first operand. + * Then we can just iterate over the first and second + * operands at the same time and not have to worry about + * picking the correct elements from each operand to apply + * the ufunc to. + */ + if ((iter2 = (PyArrayIterObject *)\ + PyArray_BroadcastToShape((PyObject *)op2_array, + iter->dimensions, iter->nd))==NULL) { + goto fail; + } + } PyArrayMethod_Context context = { .caller = (PyObject *)ufunc, From 7910e2bf14b8bef95c3180fbf667458e29426a0e Mon Sep 17 00:00:00 2001 From: mattip Date: Mon, 23 Jan 2023 21:18:05 +0200 Subject: [PATCH 03/20] ENH: add indexed lower loops and connect them to ufuncs --- numpy/core/code_generators/generate_umath.py | 98 ++++++++++++++++++- .../src/multiarray/argfunc.dispatch.c.src | 4 +- numpy/core/src/multiarray/array_method.c | 5 +- numpy/core/src/multiarray/array_method.h | 4 +- numpy/core/src/umath/loops.c.src | 49 +++++++++- numpy/core/src/umath/loops.h.src | 21 +++- .../src/umath/loops_arithm_fp.dispatch.c.src | 14 ++- .../src/umath/loops_arithmetic.dispatch.c.src | 42 ++++++++ 8 files changed, 219 insertions(+), 18 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index ae1dcee7bf7e..ac95acafb569 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -1,3 +1,8 @@ +""" +Generate the code to build all the internal ufuncs. At the base is the defdict: +a dictionary ofUfunc classes. 
This is fed to make_code to generate +__umath_generated.c +""" import os import re import struct @@ -5,6 +10,7 @@ import textwrap import argparse +# identity objects Zero = "PyLong_FromLong(0)" One = "PyLong_FromLong(1)" True_ = "(Py_INCREF(Py_True), Py_True)" @@ -125,7 +131,7 @@ def check_td_order(tds): _check_order(signatures[prev_i], sign) -_fdata_map = dict( +_floatformat_map = dict( e='npy_%sf', f='npy_%sf', d='npy_%s', @@ -136,11 +142,13 @@ def check_td_order(tds): ) def build_func_data(types, f): - func_data = [_fdata_map.get(t, '%s') % (f,) for t in types] + func_data = [_floatformat_map.get(t, '%s') % (f,) for t in types] return func_data def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, simd=None, dispatch=None): + """Generate a TypeDescription instance for each item in types + """ if f is not None: if isinstance(f, str): func_data = build_func_data(types, f) @@ -188,12 +196,15 @@ class Ufunc: ---------- nin : number of input arguments nout : number of output arguments - identity : identity element for a two-argument function + identity : identity element for a two-argument function (like Zero) docstring : docstring for the ufunc - type_descriptions : list of TypeDescription objects + typereso: type resolver function of type PyUFunc_TypeResolutionFunc + type_descriptions : TypeDescription objects + signature: a generalized ufunc signature (like for matmul) + indexed: add indexed loops (ufunc.at) for these type characters """ def __init__(self, nin, nout, identity, docstring, typereso, - *type_descriptions, signature=None): + *type_descriptions, signature=None, indexed=''): self.nin = nin self.nout = nout if identity is None: @@ -203,6 +214,7 @@ def __init__(self, nin, nout, identity, docstring, typereso, self.typereso = typereso self.type_descriptions = [] self.signature = signature + self.indexed = indexed for td in type_descriptions: self.type_descriptions.extend(td) for td in self.type_descriptions: @@ -347,6 +359,7 @@ def english_upper(s): TypeDescription('M', FullTypeDescr, 'mM', 'M'), ], TD(O, f='PyNumber_Add'), + indexed=flts + ints ), 'subtract': Ufunc(2, 1, None, # Zero is only a unit to the right, not the left @@ -359,6 +372,7 @@ def english_upper(s): TypeDescription('M', FullTypeDescr, 'MM', 'm'), ], TD(O, f='PyNumber_Subtract'), + indexed=flts + ints ), 'multiply': Ufunc(2, 1, One, @@ -374,6 +388,7 @@ def english_upper(s): TypeDescription('m', FullTypeDescr, 'dm', 'm'), ], TD(O, f='PyNumber_Multiply'), + indexed=flts + ints ), #'true_divide' : aliased to divide in umathmodule.c:initumath 'floor_divide': @@ -388,6 +403,7 @@ def english_upper(s): TypeDescription('m', FullTypeDescr, 'mm', 'q'), ], TD(O, f='PyNumber_FloorDivide'), + indexed=flts + ints ), 'divide': Ufunc(2, 1, None, # One is only a unit to the right, not the left @@ -1255,6 +1271,40 @@ def make_ufuncs(funcdict): if uf.typereso is not None: mlist.append( r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso) + for c in uf.indexed: + # Handle indexed loops by getting the underlying ArrayMethodObject + # from the list in f._loops and setting its field appropriately + fmt = textwrap.dedent(""" + {{ + PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum({typenum}); + PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, {count}); + if (info == NULL) {{ + return -1; + }} + if (info == Py_None) {{ + PyErr_SetString(PyExc_RuntimeError, + "cannot add indexed loop to ufunc {name} with {typenum}"); + return -1; + }} + if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {{ + 
PyErr_SetString(PyExc_RuntimeError, + "Not a PyArrayMethodObject in ufunc {name} with {typenum}"); + }} + ((PyArrayMethodObject*)info)->contiguous_indexed_loop = {funcname}; + /* info is borrowed, no need to decref*/ + }} + """) + cname = name + if cname == "floor_divide": + # XXX there must be a better way... + cname = "divide" + mlist.append(fmt.format( + typenum=f"NPY_{english_upper(chartoname[c])}", + count=uf.nin+uf.nout, + name=name, + funcname = f"{english_upper(chartoname[c])}_{cname}_indexed", + )) + mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name) mlist.append(r"""Py_DECREF(f);""") code3list.append('\n'.join(mlist)) @@ -1277,8 +1327,46 @@ def make_code(funcdict, filename): #include "loops.h" #include "matmul.h" #include "clip.h" + #include "dtypemeta.h" #include "_umath_doc_generated.h" + %s + /* + * Return the PyArrayMethodObject or PyCapsule that matches a registered + * tuple of identical dtypes. Return a borrowed ref of the first match. + */ + static PyObject * + get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, int ndtypes) + { + + PyObject *t_dtypes = PyTuple_New(ndtypes); + if (t_dtypes == NULL) { + return NULL; + } + for (int i=0; i < ndtypes; i++) { + PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype); + } + PyObject *loops = ufunc->_loops; + Py_ssize_t length = PyList_Size(loops); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *item = PyList_GetItem(loops, i); + PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0); + int cmp = PyObject_RichCompareBool(cur_DType_tuple, t_dtypes, Py_EQ); + if (cmp < 0) { + Py_DECREF(t_dtypes); + return NULL; + } + if (cmp == 0) { + continue; + } + /* Got the match */ + Py_DECREF(t_dtypes); + return PyTuple_GetItem(item, 1); + } + Py_DECREF(t_dtypes); + Py_RETURN_NONE; + } + static int InitOperators(PyObject *dictionary) { diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src index 1d7753275d7c..88c1028dc2f6 100644 --- a/numpy/core/src/multiarray/argfunc.dispatch.c.src +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b); npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d); npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd)); - if (nnan != ((1LL << vstep) - 1)) { + if ((long long int)nnan != ((1LL << vstep) - 1)) { npy_uint64 nnan_4[4]; nnan_4[0] = npyv_tobits_@bsfx@(nnan_a); nnan_4[1] = npyv_tobits_@bsfx@(nnan_b); @@ -219,7 +219,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) #if @is_fp@ npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a); - if (nnan != ((1LL << vstep) - 1)) { + if ((long long int)nnan != ((1LL << vstep) - 1)) { for (int vi = 0; vi < vstep; ++vi) { if (!((nnan >> vi) & 1)) { return i + vi; diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c index eeebf2d9bfa7..5001a67e9422 100644 --- a/numpy/core/src/multiarray/array_method.c +++ b/numpy/core/src/multiarray/array_method.c @@ -291,6 +291,9 @@ fill_arraymethod_from_slots( case NPY_METH_get_reduction_initial: meth->get_reduction_initial = slot->pfunc; continue; + case NPY_METH_contiguous_indexed_loop: + meth->contiguous_indexed_loop = slot->pfunc; + continue; default: break; } @@ -891,7 +894,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context, /* * Fetches a strided-loop function that supports a boolean mask as 
additional * (last) operand to the strided-loop. It is otherwise largely identical to - * the `get_loop` method which it wraps. + * the `get_strided_loop` method which it wraps. * This is the core implementation for the ufunc `where=...` keyword argument. * * NOTE: This function does not support `move_references` or inner dimensions. diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h index ef4648443f70..138df69fa9f0 100644 --- a/numpy/core/src/multiarray/array_method.h +++ b/numpy/core/src/multiarray/array_method.h @@ -73,7 +73,7 @@ struct PyArrayMethodObject_tag; * TODO: Before making this public, we should review which information should * be stored on the Context/BoundArrayMethod vs. the ArrayMethod. */ -typedef struct { +typedef struct PyArrayMethod_Context_tag { PyObject *caller; /* E.g. the original ufunc, may be NULL */ struct PyArrayMethodObject_tag *method; @@ -223,6 +223,7 @@ typedef struct PyArrayMethodObject_tag { PyArrayMethod_StridedLoop *contiguous_loop; PyArrayMethod_StridedLoop *unaligned_strided_loop; PyArrayMethod_StridedLoop *unaligned_contiguous_loop; + PyArrayMethod_StridedLoop *contiguous_indexed_loop; /* Chunk only used for wrapping array method defined in umath */ struct PyArrayMethodObject_tag *wrapped_meth; PyArray_DTypeMeta **wrapped_dtypes; @@ -266,6 +267,7 @@ extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type; #define NPY_METH_contiguous_loop 5 #define NPY_METH_unaligned_strided_loop 6 #define NPY_METH_unaligned_contiguous_loop 7 +#define NPY_METH_contiguous_indexed_loop 8 /* diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 085712f380e3..d4e246783f66 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -541,15 +541,21 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void } } -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@_indexed(char const *ip1, npy_intp const *indx, char *value, - npy_intp const *dimensions, npy_intp const *steps) { +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int +@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + npy_intp *indx = (npy_intp *)args[1]; + char *value = args[2]; npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; + @type@ *indexed; for(i = 0; i < n; i++, indx += is2, value += os1) { - @type@ op1 = *(@type@ *)(ip1 + is1 * indx[0]) + value[0]; + indexed = (@type@ *)(ip1 + is1 * indx[0]); + indexed[0] = indexed[0] @OP@ *(@type@ *)value; } + return 0; } #endif @@ -1546,6 +1552,23 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps } } } + +NPY_NO_EXPORT void +LONGDOUBLE_@kind@_indexed(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + npy_intp *indx = (npy_intp *)args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_longdouble *indexed; + for(i = 0; i < n; i++, indx += is2, value += os1) { + indexed = (npy_longdouble *)(ip1 + is1 * indx[0]); + indexed[0] = indexed[0] @OP@ *(npy_longdouble *)value; + } +} + /**end repeat**/ /**begin repeat @@ -1651,6 +1674,24 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void } } } + +NPY_NO_EXPORT int +HALF_@kind@_indexed(void *NPY_UNUSED(context), char **args, npy_intp const 
*dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + npy_intp *indx = (npy_intp *)args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += is2, value += os1) { + indexed = (npy_half *)(ip1 + is1 * indx[0]); + const float v = npy_half_to_float(*(npy_half *)value); + indexed[0] = npy_float_to_half(npy_half_to_float(indexed[0]) @OP@ v); + } + return 0; +} /**end repeat**/ #define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b)) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 26742946fcfe..ee69330dd78a 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -33,6 +33,9 @@ // #define BOOL_fmax BOOL_maximum // #define BOOL_fmin BOOL_minimum +typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context; +typedef struct NpyAuxData_tag NpyAuxData; + #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_comparison.dispatch.h" #endif @@ -81,6 +84,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +NPY_NO_EXPORT int +@TYPE@_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + +/**end repeat3**/ /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION @@ -163,6 +171,9 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT int +@S@@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat3**/ /**begin repeat3 @@ -285,10 +296,13 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**begin repeat1 * Arithmetic * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat1**/ /**end repeat**/ @@ -424,6 +438,11 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( */ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + +/**end repeat1**/ /**end repeat1**/ /**begin repeat1 diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index 1436c46f4779..1874573ce6f2 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -547,15 +547,21 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) } } -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) -(char const *ip1, npy_intp const *indx, char *value, - npy_intp const *dimensions, npy_intp const *steps) { 
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + npy_intp *indx = (npy_intp *)args[1]; + char *value = args[2]; npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; + @type@ *indexed; for(i = 0; i < n; i++, indx += is2, value += os1) { - @type@ op1 = *(@type@ *)(ip1 + is1 * indx[0]) + value[0]; + indexed = (@type@ *)(ip1 + is1 * indx[0]); + indexed[0] = indexed[0] @OP@ *(@type@ *)value; } + return 0; } /**end repeat1**/ diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index ea8685002ac1..573590b1faac 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -395,6 +395,24 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) } } } + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + npy_intp *indx = (npy_intp *)args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += is2, value += os1) { + indexed = (@type@ *)(ip1 + is1 * indx[0]); + indexed[0] = floor_div_@TYPE@(indexed[0], *(@type@ *)value); + } + return 0; +} + /**end repeat**/ /**begin repeat @@ -463,4 +481,28 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) } } } + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + npy_intp *indx = (npy_intp *)args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += is2, value += os1) { + indexed = (@type@ *)(ip1 + is1 * indx[0]); + @type@ in2 = *(@type@ *)value; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + indexed[0] = 0; + } else { + indexed[0] = indexed[0] / in2; + } + } + return 0; +} + /**end repeat**/ From eb21b25093c79ac0bc135aed3d192c06079c5bbf Mon Sep 17 00:00:00 2001 From: mattip Date: Mon, 30 Jan 2023 19:54:44 +0200 Subject: [PATCH 04/20] ENH: use an indexed loop if possible in ufunc_at --- numpy/core/src/umath/dispatching.c | 4 +- numpy/core/src/umath/loops.c.src | 6 +- .../src/umath/loops_arithm_fp.dispatch.c.src | 2 +- .../src/umath/loops_arithmetic.dispatch.c.src | 4 +- numpy/core/src/umath/scalarmath.c.src | 2 +- numpy/core/src/umath/ufunc_object.c | 61 ++++++++++++++++--- 6 files changed, 63 insertions(+), 16 deletions(-) diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index 6d6c481fb148..e09568d6994f 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -914,8 +914,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, int nin = ufunc->nin, nargs = ufunc->nargs; /* - * Get the actual DTypes we operate with by mixing the operand array - * ones with the passed signature. + * Get the actual DTypes we operate with by setting op_dtypes[i] from + * signature[i]. 
*/ for (int i = 0; i < nargs; i++) { if (signature[i] != NULL) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index d4e246783f66..cd5a431cd607 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -547,7 +547,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; @@ -1559,7 +1559,7 @@ LONGDOUBLE_@kind@_indexed(char **args, npy_intp const *dimensions, npy_intp cons char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; npy_longdouble *indexed; @@ -1681,7 +1681,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context), char **args, npy_intp const *dime char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; npy_half *indexed; diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index 1874573ce6f2..d37facef8138 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -553,7 +553,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 573590b1faac..cef2adb34678 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -402,7 +402,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; @@ -488,7 +488,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src index 2e6da3cd2415..47d42b899cb8 100644 --- a/numpy/core/src/umath/scalarmath.c.src +++ b/numpy/core/src/umath/scalarmath.c.src @@ -453,7 +453,7 @@ static inline int @name@_ctype_divide(@type@ a, @type@ b, @type@ *out) { char *args[3] = {(char *)&a, (char *)&b, (char *)out}; - npy_intp steps[3]; + npy_intp steps[3] = {0, 0, 0}; npy_intp size = 1; @TYPE@_divide(args, &size, steps, NULL); return 0; diff --git a/numpy/core/src/umath/ufunc_object.c 
b/numpy/core/src/umath/ufunc_object.c index 8739a5095996..d088e22de276 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -5893,13 +5893,45 @@ new_array_op(PyArrayObject *op_array, char *data) return (PyArrayObject *)r; } +/* + * If operands are 1D we can use the indexed loop to do the work + * Returns 0 if successful, -1 otherwise + */ +static int +try_trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, + PyArrayMapIterObject *iter, + PyArrayObject *op1_array, PyArrayObject *op2_array, + PyArrayMethod_Context *context) +{ + if (PyArray_NDIM(op1_array) != 1) { + return -1; + } + /* check that nop == 3 */ + if (ufuncimpl->nin >1 && PyArray_NDIM(op2_array) != 1) { + return -1; + } + if (iter->numiter > 1) { + return -1; + } + char *args[3]; + npy_intp dimensions = iter->size; + npy_intp steps[3]; + args[0] = (char *) iter->baseoffset; + args[1] = (char *) iter->outer_ptrs[0]; + args[2] = (char *)PyArray_DATA(op2_array); + steps[0] = iter->fancy_strides[0]; + steps[1] = iter->outer_strides[0]; + steps[2] = PyArray_STRIDE(op2_array, 0); + ufuncimpl->contiguous_indexed_loop(context, args, &dimensions, steps, NULL); + return 0; +} + static int ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, PyArrayMapIterObject *iter, PyArrayIterObject *iter2, PyArrayObject *op1_array, PyArrayObject *op2_array, PyArrayMethod_StridedLoop *strided_loop, PyArrayMethod_Context *context, - npy_intp strides[3], NpyAuxData *auxdata ) { @@ -5921,6 +5953,7 @@ ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, NPY_BEGIN_THREADS; } + npy_intp strides[3] = {0, 0, 0}; /* * Iterate over first and second operands and call ufunc * for each pair of inputs @@ -5972,7 +6005,6 @@ ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, PyArray_Descr *operation_descrs[3], PyArrayMethod_StridedLoop *strided_loop, PyArrayMethod_Context *context, - npy_intp strides[3], NpyAuxData *auxdata ) { @@ -6070,6 +6102,7 @@ ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, npy_clear_floatstatus_barrier((char*)&iter); } + npy_intp strides[3] = {0, 0, 0}; if (!needs_api) { NPY_BEGIN_THREADS; } @@ -6305,7 +6338,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) } } - /* + /* * Create a map iterator (there is no multi-mapiter, so we need at least 2 * iterators: one for the mapping, and another for the second operand) */ @@ -6383,11 +6416,25 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) } } if (fast_path == 1) { - res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array, op2_array, - strided_loop, &context, strides, auxdata); + /* + * Try to use trivial loop (1d, no casting, aligned) if + * - the matching info has a indexed loop + * - all operands are 1d + */ + if (ufuncimpl->contiguous_indexed_loop != NULL) { + res = try_trivial_at_loop(ufuncimpl, flags, iter, op1_array, + op2_array, &context); + + } + if (res == -1) { + /* Couldn't use the fastest path, use the faster path */ + res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array, + op2_array, strided_loop, &context, auxdata); + } } else { - res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array, op2_array, - operation_descrs, strided_loop, &context, strides, auxdata); + res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array, + op2_array, operation_descrs, strided_loop, &context, + auxdata); } fail: From 888bba25582aaae35bbfdcdd9d632580f1a8be16 Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 31 Jan 2023 18:03:38 +0200 
Subject: [PATCH 05/20] MAINT: fix linting --- numpy/core/code_generators/generate_umath.py | 28 ++++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index ac95acafb569..ff32cf1b5103 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -127,7 +127,7 @@ def check_td_order(tds): for prev_i, sign in enumerate(signatures[1:]): if sign in signatures[:prev_i+1]: continue # allow duplicates... - + _check_order(signatures[prev_i], sign) @@ -1277,22 +1277,26 @@ def make_ufuncs(funcdict): fmt = textwrap.dedent(""" {{ PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum({typenum}); - PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, {count}); + PyObject *info = get_info_no_cast((PyUFuncObject *)f, + dtype, {count}); if (info == NULL) {{ return -1; }} if (info == Py_None) {{ PyErr_SetString(PyExc_RuntimeError, - "cannot add indexed loop to ufunc {name} with {typenum}"); + "cannot add indexed loop to ufunc " + "{name} with {typenum}"); return -1; }} if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {{ PyErr_SetString(PyExc_RuntimeError, - "Not a PyArrayMethodObject in ufunc {name} with {typenum}"); + "Not a PyArrayMethodObject in ufunc " + "{name} with {typenum}"); }} - ((PyArrayMethodObject*)info)->contiguous_indexed_loop = {funcname}; + ((PyArrayMethodObject*)info)->contiguous_indexed_loop = + {funcname}; /* info is borrowed, no need to decref*/ - }} + }} """) cname = name if cname == "floor_divide": @@ -1304,7 +1308,7 @@ def make_ufuncs(funcdict): name=name, funcname = f"{english_upper(chartoname[c])}_{cname}_indexed", )) - + mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name) mlist.append(r"""Py_DECREF(f);""") code3list.append('\n'.join(mlist)) @@ -1336,9 +1340,10 @@ def make_code(funcdict, filename): * tuple of identical dtypes. Return a borrowed ref of the first match. 
*/ static PyObject * - get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, int ndtypes) + get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, + int ndtypes) { - + PyObject *t_dtypes = PyTuple_New(ndtypes); if (t_dtypes == NULL) { return NULL; @@ -1351,7 +1356,8 @@ def make_code(funcdict, filename): for (Py_ssize_t i = 0; i < length; i++) { PyObject *item = PyList_GetItem(loops, i); PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0); - int cmp = PyObject_RichCompareBool(cur_DType_tuple, t_dtypes, Py_EQ); + int cmp = PyObject_RichCompareBool(cur_DType_tuple, + t_dtypes, Py_EQ); if (cmp < 0) { Py_DECREF(t_dtypes); return NULL; @@ -1361,7 +1367,7 @@ def make_code(funcdict, filename): } /* Got the match */ Py_DECREF(t_dtypes); - return PyTuple_GetItem(item, 1); + return PyTuple_GetItem(item, 1); } Py_DECREF(t_dtypes); Py_RETURN_NONE; From be6417316c19eb59ce2d844cb93b331007430a72 Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 31 Jan 2023 18:23:48 +0200 Subject: [PATCH 06/20] ENH: update and sync the public API --- numpy/core/include/numpy/experimental_dtype_api.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h index e33df31564ca..9ecb582d074d 100644 --- a/numpy/core/include/numpy/experimental_dtype_api.h +++ b/numpy/core/include/numpy/experimental_dtype_api.h @@ -311,6 +311,7 @@ typedef NPY_CASTING (resolve_descriptors_function)( #define NPY_METH_contiguous_loop 5 #define NPY_METH_unaligned_strided_loop 6 #define NPY_METH_unaligned_contiguous_loop 7 +#define NPY_METH_contiguous_indexed_loop 8 typedef struct { @@ -488,7 +489,7 @@ PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType) */ #if !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY) -#define __EXPERIMENTAL_DTYPE_VERSION 6 +#define __EXPERIMENTAL_DTYPE_VERSION 7 static int import_experimental_dtype_api(int version) From 8bfaaf35c80a02657c353b641652de9fa4719daa Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 31 Jan 2023 19:42:47 +0200 Subject: [PATCH 07/20] BENCH: fix benchmark --- benchmarks/benchmarks/bench_ufunc_strides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmarks/bench_ufunc_strides.py b/benchmarks/benchmarks/bench_ufunc_strides.py index f80bf90f9fbe..79613a0b9f61 100644 --- a/benchmarks/benchmarks/bench_ufunc_strides.py +++ b/benchmarks/benchmarks/bench_ufunc_strides.py @@ -150,7 +150,7 @@ def g(self,z,c): return np.sum(np.multiply(z,z) + c) def mandelbrot_numpy(self, c, maxiter): - output = np.zeros(c.shape, np.int) + output = np.zeros(c.shape, np.int32) z = np.empty(c.shape, np.complex64) for it in range(maxiter): notdone = self.f(z) From 28706afcbe1bf35413049d0283e6e01ef9abcb1a Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 1 Feb 2023 16:14:23 +0200 Subject: [PATCH 08/20] MAINT, BUG: fixes from review and testing --- numpy/core/code_generators/generate_umath.py | 47 ++--------- numpy/core/src/umath/dispatching.c | 37 +++++++++ numpy/core/src/umath/loops.c.src | 79 +++++++++++++++---- numpy/core/src/umath/loops.h.src | 4 + .../src/umath/loops_arithm_fp.dispatch.c.src | 4 +- .../src/umath/loops_arithmetic.dispatch.c.src | 8 +- numpy/core/src/umath/ufunc_object.c | 45 +++++++---- numpy/core/tests/test_ufunc.py | 34 ++++++++ 8 files changed, 177 insertions(+), 81 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index ff32cf1b5103..b45f89344d69 100644 --- 
a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -415,6 +415,7 @@ def english_upper(s): TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'), ], TD(O, f='PyNumber_TrueDivide'), + indexed=flts ), 'conjugate': Ufunc(1, 1, None, @@ -1298,15 +1299,11 @@ def make_ufuncs(funcdict): /* info is borrowed, no need to decref*/ }} """) - cname = name - if cname == "floor_divide": - # XXX there must be a better way... - cname = "divide" mlist.append(fmt.format( typenum=f"NPY_{english_upper(chartoname[c])}", count=uf.nin+uf.nout, name=name, - funcname = f"{english_upper(chartoname[c])}_{cname}_indexed", + funcname = f"{english_upper(chartoname[c])}_{name}_indexed", )) mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name) @@ -1335,44 +1332,10 @@ def make_code(funcdict, filename): #include "_umath_doc_generated.h" %s - /* - * Return the PyArrayMethodObject or PyCapsule that matches a registered - * tuple of identical dtypes. Return a borrowed ref of the first match. - */ - static PyObject * + /* Returns a borrowed ref of the second value in the matching info tuple */ + PyObject * get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, - int ndtypes) - { - - PyObject *t_dtypes = PyTuple_New(ndtypes); - if (t_dtypes == NULL) { - return NULL; - } - for (int i=0; i < ndtypes; i++) { - PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype); - } - PyObject *loops = ufunc->_loops; - Py_ssize_t length = PyList_Size(loops); - for (Py_ssize_t i = 0; i < length; i++) { - PyObject *item = PyList_GetItem(loops, i); - PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0); - int cmp = PyObject_RichCompareBool(cur_DType_tuple, - t_dtypes, Py_EQ); - if (cmp < 0) { - Py_DECREF(t_dtypes); - return NULL; - } - if (cmp == 0) { - continue; - } - /* Got the match */ - Py_DECREF(t_dtypes); - return PyTuple_GetItem(item, 1); - } - Py_DECREF(t_dtypes); - Py_RETURN_NONE; - } - + int ndtypes); static int InitOperators(PyObject *dictionary) { diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index e09568d6994f..fee77b2d6f03 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -1240,3 +1240,40 @@ install_logical_ufunc_promoter(PyObject *ufunc) return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); } + +/* + * Return the PyArrayMethodObject or PyCapsule that matches a registered + * tuple of identical dtypes. Return a borrowed ref of the first match. 
+ */ +NPY_NO_EXPORT PyObject * +get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, + int ndtypes) +{ + PyObject *t_dtypes = PyTuple_New(ndtypes); + if (t_dtypes == NULL) { + return NULL; + } + for (int i=0; i < ndtypes; i++) { + PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype); + } + PyObject *loops = ufunc->_loops; + Py_ssize_t length = PyList_Size(loops); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *item = PyList_GetItem(loops, i); + PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0); + int cmp = PyObject_RichCompareBool(cur_DType_tuple, + t_dtypes, Py_EQ); + if (cmp < 0) { + Py_DECREF(t_dtypes); + return NULL; + } + if (cmp == 0) { + continue; + } + /* Got the match */ + Py_DECREF(t_dtypes); + return PyTuple_GetItem(item, 1); + } + Py_DECREF(t_dtypes); + Py_RETURN_NONE; +} diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index cd5a431cd607..87ef2fca961a 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -460,6 +460,7 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void */ #define @TYPE@_floor_divide @TYPE@_divide +#define @TYPE@_floor_divide_indexed @TYPE@_divide_indexed #define @TYPE@_fmax @TYPE@_maximum #define @TYPE@_fmin @TYPE@_minimum @@ -531,7 +532,8 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void #if @CHK@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, + npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); @@ -542,17 +544,19 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void } NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int -@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { char *ip1 = args[0]; - npy_intp *indx = (npy_intp *)args[1]; + char *indx = args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; - for(i = 0; i < n; i++, indx += is2, value += os1) { - indexed = (@type@ *)(ip1 + is1 * indx[0]); + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); indexed[0] = indexed[0] @OP@ *(@type@ *)value; } return 0; @@ -1418,6 +1422,25 @@ NPY_NO_EXPORT void } } +NPY_NO_EXPORT int +@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + indexed[0] = npy_floor_divide@c@(indexed[0], *(@type@ *)value); + } + return 0; +} + NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1553,20 +1576,23 @@ LONGDOUBLE_@kind@(char **args, npy_intp const 
*dimensions, npy_intp const *steps } } -NPY_NO_EXPORT void -LONGDOUBLE_@kind@_indexed(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +NPY_NO_EXPORT int +LONGDOUBLE_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) { char *ip1 = args[0]; - npy_intp *indx = (npy_intp *)args[1]; + char *indx = args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; npy_longdouble *indexed; - for(i = 0; i < n; i++, indx += is2, value += os1) { - indexed = (npy_longdouble *)(ip1 + is1 * indx[0]); + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_longdouble *)(ip1 + is1 * *(npy_intp *)indx); indexed[0] = indexed[0] @OP@ *(npy_longdouble *)value; } + return 0; } /**end repeat**/ @@ -1679,14 +1705,14 @@ NPY_NO_EXPORT int HALF_@kind@_indexed(void *NPY_UNUSED(context), char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { char *ip1 = args[0]; - npy_intp *indx = (npy_intp *)args[1]; + char *indx = args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; npy_half *indexed; - for(i = 0; i < n; i++, indx += is2, value += os1) { - indexed = (npy_half *)(ip1 + is1 * indx[0]); + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); const float v = npy_half_to_float(*(npy_half *)value); indexed[0] = npy_float_to_half(npy_half_to_float(indexed[0]) @OP@ v); } @@ -1828,6 +1854,27 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps } } +NPY_NO_EXPORT int +HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + float v = npy_half_to_float(*(npy_half *)value); + float div = npy_floor_dividef(npy_half_to_float(indexed[0]), v); + indexed[0] = npy_float_to_half(div); + } + return 0; +} + NPY_NO_EXPORT void HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index ee69330dd78a..b25f6770f32c 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -134,6 +134,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, */ #define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed #define @S@@TYPE@_fmax @S@@TYPE@_maximum #define @S@@TYPE@_fmin @S@@TYPE@_minimum @@ -491,6 +492,9 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT int +@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData 
*NPY_UNUSED(func)); + NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index d37facef8138..a00ef1e4ba63 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -553,11 +553,11 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1] / sizeof(npy_intp), isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; - for(i = 0; i < n; i++, indx += is2, value += os1) { + for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (@type@ *)(ip1 + is1 * indx[0]); indexed[0] = indexed[0] @OP@ *(@type@ *)value; } diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index cef2adb34678..9f3043f48199 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -402,11 +402,11 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1] / sizeof(npy_intp), isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; - for(i = 0; i < n; i++, indx += is2, value += os1) { + for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (@type@ *)(ip1 + is1 * indx[0]); indexed[0] = floor_div_@TYPE@(indexed[0], *(@type@ *)value); } @@ -488,11 +488,11 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) char *ip1 = args[0]; npy_intp *indx = (npy_intp *)args[1]; char *value = args[2]; - npy_intp is1 = steps[0], is2 = steps[1] / sizeof(npy_intp), os1 = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1] / sizeof(npy_intp), isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; - for(i = 0; i < n; i++, indx += is2, value += os1) { + for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (@type@ *)(ip1 + is1 * indx[0]); @type@ in2 = *(@type@ *)value; if (NPY_UNLIKELY(in2 == 0)) { diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index d088e22de276..5f199129f0c3 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -5894,26 +5894,22 @@ new_array_op(PyArrayObject *op_array, char *data) } /* - * If operands are 1D we can use the indexed loop to do the work - * Returns 0 if successful, -1 otherwise + * Use an indexed loop to do the work + * Returns 0 if successful */ static int -try_trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, +trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, PyArrayMapIterObject *iter, PyArrayObject *op1_array, PyArrayObject *op2_array, PyArrayMethod_Context *context) { - if (PyArray_NDIM(op1_array) != 1) { - return -1; - } - /* check that nop == 3 */ - if (ufuncimpl->nin >1 && PyArray_NDIM(op2_array) != 1) { - return -1; - } - if (iter->numiter > 1) { + int buffersize=0, errormask = 0; + int res; + char *args[3]; + const char * ufunc_name = 
ufunc_get_name_cstr((PyUFuncObject *)context->caller); + if (_get_bufsize_errmask(NULL, ufunc_name, &buffersize, &errormask) < 0) { return -1; } - char *args[3]; npy_intp dimensions = iter->size; npy_intp steps[3]; args[0] = (char *) iter->baseoffset; @@ -5922,8 +5918,20 @@ try_trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, steps[0] = iter->fancy_strides[0]; steps[1] = iter->outer_strides[0]; steps[2] = PyArray_STRIDE(op2_array, 0); - ufuncimpl->contiguous_indexed_loop(context, args, &dimensions, steps, NULL); - return 0; + res = ufuncimpl->contiguous_indexed_loop(context, args, &dimensions, steps, NULL); + /* + * An error should only be possible if `res != 0` is already set. + * But this is not strictly correct since old-style ufuncs (e.g. `power` + * released the GIL but manually set an Exception). + */ + if (PyErr_Occurred()) { + res = -1; + } + + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + res = _check_ufunc_fperr(errormask, NULL, ufunc_name); + } + return res; } static int @@ -6421,12 +6429,15 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) * - the matching info has a indexed loop * - all operands are 1d */ - if (ufuncimpl->contiguous_indexed_loop != NULL) { - res = try_trivial_at_loop(ufuncimpl, flags, iter, op1_array, + if ((ufuncimpl->contiguous_indexed_loop != NULL) && + (PyArray_NDIM(op1_array) == 1) && + (ufuncimpl->nin == 1 || PyArray_NDIM(op2_array) == 1) && + (iter->numiter == 1)) { + res = trivial_at_loop(ufuncimpl, flags, iter, op1_array, op2_array, &context); } - if (res == -1) { + else { /* Couldn't use the fastest path, use the faster path */ res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array, op2_array, strided_loop, &context, auxdata); diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index 55122d6975a0..22370bbd82f7 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -1986,6 +1986,40 @@ def test_ufunc_at_basic(self, a): with pytest.raises(ValueError): # second operand cannot be converted to an array np.add.at(a, [2, 5, 3], [[1, 2], 1]) + + # ufuncs with indexed loops for perfomance in ufunc.at + indexed_ufuncs = [np.add, np.subtract, np.multiply, np.floor_divide, np.divide] + @pytest.mark.parametrize( + "typecode", np.typecodes['AllInteger'] + np.typecodes['Float']) + @pytest.mark.parametrize("ufunc", indexed_ufuncs) + def test_ufunc_at_inner_loops(self, typecode, ufunc): + if ufunc is np.divide and typecode in np.typecodes['AllInteger']: + # Avoid divide-by-zero and inf for integer divide + a = np.ones(100, dtype=typecode) + indx = np.random.randint(100, size=30, dtype=np.intp) + vals = np.arange(1, 31, dtype=typecode) + else: + a = np.ones(1000, dtype=typecode) + indx = np.random.randint(1000, size=3000, dtype=np.intp) + vals = np.arange(3000, dtype=typecode) + atag = a.copy() + # Do the calculation twice and compare the answers + with warnings.catch_warnings(record=True) as w_at: + warnings.simplefilter('always') + ufunc.at(a, indx, vals) + with warnings.catch_warnings(record=True) as w_loop: + warnings.simplefilter('always') + for i,v in zip(indx, vals): + # Make sure all the work happens inside the ufunc + # in order to duplicate error/warning handling + ufunc(atag[i], v, out=atag[i:i+1], casting="unsafe") + assert_equal(atag, a) + # If w_loop warned, make sure w_at warned as well + if len(w_loop) > 0: + # + assert len(w_at) > 0 + assert w_at[0].category == w_loop[0].category + assert str(w_at[0].message)[:10] == str(w_loop[0].message)[:10] 
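The new test above relies on ``ufunc.at`` being equivalent to applying the ufunc once per index, which is also the semantics the indexed inner loops implement. A small self-contained illustration of that behaviour, using only public NumPy API (the values and shapes here are arbitrary):

    import numpy as np

    a = np.zeros(5, dtype=np.float64)
    indx = np.array([0, 1, 1, 3], dtype=np.intp)
    vals = np.array([1.0, 2.0, 3.0, 4.0])

    # ufunc.at is unbuffered: the repeated index 1 accumulates both 2.0 and 3.0
    np.add.at(a, indx, vals)
    print(a)                      # [1. 5. 0. 4. 0.]

    # the explicit per-element loop used as the reference in the test above
    b = np.zeros(5, dtype=np.float64)
    for i, v in zip(indx, vals):
        np.add(b[i], v, out=b[i:i+1])
    print(np.array_equal(a, b))   # True

    # fancy-index assignment works on a gathered copy, so the duplicate
    # index only keeps the last contribution
    c = np.zeros(5, dtype=np.float64)
    c[indx] += vals
    print(c)                      # [1. 3. 0. 4. 0.]

Because duplicate indices must accumulate, ``ufunc.at`` cannot be replaced by a single vectorized call, which is why dedicated indexed loops matter for its performance.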
def test_ufunc_at_multiD(self): a = np.arange(9).reshape(3, 3) From 1db1ffe140b3d5f9dd54ac2dc9e426234e9c7df0 Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 1 Feb 2023 16:22:26 +0200 Subject: [PATCH 09/20] MAINT: linting --- numpy/core/tests/test_ufunc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index 22370bbd82f7..c8d13918f003 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -1988,12 +1988,14 @@ def test_ufunc_at_basic(self, a): np.add.at(a, [2, 5, 3], [[1, 2], 1]) # ufuncs with indexed loops for perfomance in ufunc.at - indexed_ufuncs = [np.add, np.subtract, np.multiply, np.floor_divide, np.divide] + indexed_ufuncs = [np.add, np.subtract, np.multiply, + np.floor_divide, np.divide] + @pytest.mark.parametrize( "typecode", np.typecodes['AllInteger'] + np.typecodes['Float']) @pytest.mark.parametrize("ufunc", indexed_ufuncs) def test_ufunc_at_inner_loops(self, typecode, ufunc): - if ufunc is np.divide and typecode in np.typecodes['AllInteger']: + if ufunc is np.divide and typecode in np.typecodes['AllInteger']: # Avoid divide-by-zero and inf for integer divide a = np.ones(100, dtype=typecode) indx = np.random.randint(100, size=30, dtype=np.intp) @@ -2009,7 +2011,7 @@ def test_ufunc_at_inner_loops(self, typecode, ufunc): ufunc.at(a, indx, vals) with warnings.catch_warnings(record=True) as w_loop: warnings.simplefilter('always') - for i,v in zip(indx, vals): + for i, v in zip(indx, vals): # Make sure all the work happens inside the ufunc # in order to duplicate error/warning handling ufunc(atag[i], v, out=atag[i:i+1], casting="unsafe") From ec1a4345d66992a56de6207e692ae7586fca2042 Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 1 Feb 2023 16:24:02 +0200 Subject: [PATCH 10/20] MAINT: update EXPERIMENTAL_DTYPE_API_VERSION too --- numpy/core/src/multiarray/experimental_public_dtype_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c index 734955ac4f8d..1870a726bc29 100644 --- a/numpy/core/src/multiarray/experimental_public_dtype_api.c +++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c @@ -16,7 +16,7 @@ #include "common_dtype.h" -#define EXPERIMENTAL_DTYPE_API_VERSION 6 +#define EXPERIMENTAL_DTYPE_API_VERSION 7 typedef struct{ From 6541cf5c607a3b9500d04551dc28bfbecd1fc7a6 Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 1 Feb 2023 16:33:06 +0200 Subject: [PATCH 11/20] DOC: add a performance release note --- .../upcoming_changes/23136.performance.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 doc/release/upcoming_changes/23136.performance.rst diff --git a/doc/release/upcoming_changes/23136.performance.rst b/doc/release/upcoming_changes/23136.performance.rst new file mode 100644 index 000000000000..560f35885ed2 --- /dev/null +++ b/doc/release/upcoming_changes/23136.performance.rst @@ -0,0 +1,17 @@ +``ufunc.at`` can be much faster +------------------------------- +If called on ufuncs with appropriate indexed loops, ``ufunc.at`` can be up to +60x faster. Generic ``ufunc.at`` can be up to 9x faster. 
The conditions for +any speedup:: + +- contiguous arguments +- no casting + +The conditions for the extra speedup:: + +- calling the ufuncs ``add``, ``subtract``, ``multiply``, ``divide`` (and + ``floor_divide``) +- 1d arguments + +The internal logic is similar to the logic used for regular ufuncs, which also +have a fast path for contiguous, non-casting, 1d arrays. From 3802b1664706fc71dac6d8932084e8b246e1770d Mon Sep 17 00:00:00 2001 From: mattip Date: Sun, 5 Feb 2023 13:54:49 +0200 Subject: [PATCH 12/20] MAINT: rework release note, changes from review --- .../upcoming_changes/23136.performance.rst | 16 ++++++++-------- .../core/src/multiarray/argfunc.dispatch.c.src | 4 ++-- numpy/core/src/umath/loops.c.src | 12 ++++++------ .../src/umath/loops_arithm_fp.dispatch.c.src | 8 ++++---- .../src/umath/loops_arithmetic.dispatch.c.src | 18 +++++++++--------- numpy/core/src/umath/ufunc_object.c | 10 ++++++---- 6 files changed, 35 insertions(+), 33 deletions(-) diff --git a/doc/release/upcoming_changes/23136.performance.rst b/doc/release/upcoming_changes/23136.performance.rst index 560f35885ed2..97d0e4a5db13 100644 --- a/doc/release/upcoming_changes/23136.performance.rst +++ b/doc/release/upcoming_changes/23136.performance.rst @@ -1,17 +1,17 @@ ``ufunc.at`` can be much faster ------------------------------- -If called on ufuncs with appropriate indexed loops, ``ufunc.at`` can be up to -60x faster. Generic ``ufunc.at`` can be up to 9x faster. The conditions for -any speedup:: +Generic ``ufunc.at`` can be up to 9x faster. The conditions for this speedup: - contiguous arguments - no casting -The conditions for the extra speedup:: - -- calling the ufuncs ``add``, ``subtract``, ``multiply``, ``divide`` (and - ``floor_divide``) -- 1d arguments +If ufuncs with appropriate indexed loops on 1d arguments with the above +conditions, ``ufunc.at`` can be up to 60x faster (an additional 7x speedup). +Appropriate indexed loops have been added to ``add``, ``subtract``, +``multiply``, ``divide`` (and ``floor_divide``) The internal logic is similar to the logic used for regular ufuncs, which also have a fast path for contiguous, non-casting, 1d arrays. + +Thanks to the `D. E. Shaw group `_ for sponsoring this +work. 
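As a usage sketch of the conditions listed in the release note (the array sizes are arbitrary, and which internal loop is selected is not observable from Python; the result is the same either way)::

    import numpy as np

    rng = np.random.default_rng(0)
    a = np.zeros(1000, dtype=np.float64)                     # 1d, contiguous
    indx = rng.integers(0, 1000, size=5000).astype(np.intp)
    vals = rng.random(5000)                                  # float64: no casting needed

    # add/subtract/multiply/divide/floor_divide have indexed loops,
    # so this call can take the fast path described above
    np.add.at(a, indx, vals)

    # here the integer values need casting to float64, so ufunc.at
    # falls back to the generic path, with the same numerical result
    b = np.zeros(1000, dtype=np.float64)
    np.add.at(b, indx, np.arange(5000))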
diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src index 88c1028dc2f6..d79be1df5034 100644 --- a/numpy/core/src/multiarray/argfunc.dispatch.c.src +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b); npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d); npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd)); - if ((long long int)nnan != ((1LL << vstep) - 1)) { + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { npy_uint64 nnan_4[4]; nnan_4[0] = npyv_tobits_@bsfx@(nnan_a); nnan_4[1] = npyv_tobits_@bsfx@(nnan_b); @@ -219,7 +219,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) #if @is_fp@ npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a); - if ((long long int)nnan != ((1LL << vstep) - 1)) { + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { for (int vi = 0; vi < vstep; ++vi) { if (!((nnan >> vi) & 1)) { return i + vi; diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 87ef2fca961a..038dc0a2b5ca 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -557,7 +557,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int @type@ *indexed; for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); - indexed[0] = indexed[0] @OP@ *(@type@ *)value; + *indexed = *indexed @OP@ *(@type@ *)value; } return 0; } @@ -1436,7 +1436,7 @@ NPY_NO_EXPORT int @type@ *indexed; for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); - indexed[0] = npy_floor_divide@c@(indexed[0], *(@type@ *)value); + *indexed = npy_floor_divide@c@(*indexed, *(@type@ *)value); } return 0; } @@ -1590,7 +1590,7 @@ LONGDOUBLE_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), npy_longdouble *indexed; for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (npy_longdouble *)(ip1 + is1 * *(npy_intp *)indx); - indexed[0] = indexed[0] @OP@ *(npy_longdouble *)value; + *indexed = *indexed @OP@ *(npy_longdouble *)value; } return 0; } @@ -1714,7 +1714,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context), char **args, npy_intp const *dime for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); const float v = npy_half_to_float(*(npy_half *)value); - indexed[0] = npy_float_to_half(npy_half_to_float(indexed[0]) @OP@ v); + *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v); } return 0; } @@ -1869,8 +1869,8 @@ HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), for(i = 0; i < n; i++, indx += isindex, value += isb) { indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); float v = npy_half_to_float(*(npy_half *)value); - float div = npy_floor_dividef(npy_half_to_float(indexed[0]), v); - indexed[0] = npy_float_to_half(div); + float div = npy_floor_dividef(npy_half_to_float(*indexed), v); + *indexed = npy_float_to_half(div); } return 0; } diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index a00ef1e4ba63..9bce0029fbf6 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -551,15 +551,15 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) (PyArrayMethod_Context 
*NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { char *ip1 = args[0]; - npy_intp *indx = (npy_intp *)args[1]; + char *indx = args[1]; char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1] / sizeof(npy_intp), isb = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; for(i = 0; i < n; i++, indx += isindex, value += isb) { - indexed = (@type@ *)(ip1 + is1 * indx[0]); - indexed[0] = indexed[0] @OP@ *(@type@ *)value; + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = *indexed @OP@ *(@type@ *)value; } return 0; } diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 9f3043f48199..b6f12629807b 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -400,15 +400,15 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) (PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { char *ip1 = args[0]; - npy_intp *indx = (npy_intp *)args[1]; + char *indx = args[1]; char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1] / sizeof(npy_intp), isb = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; for(i = 0; i < n; i++, indx += isindex, value += isb) { - indexed = (@type@ *)(ip1 + is1 * indx[0]); - indexed[0] = floor_div_@TYPE@(indexed[0], *(@type@ *)value); + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value); } return 0; } @@ -486,20 +486,20 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) (PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { char *ip1 = args[0]; - npy_intp *indx = (npy_intp *)args[1]; + char *indx = args[1]; char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1] / sizeof(npy_intp), isb = steps[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; npy_intp n = dimensions[0]; npy_intp i; @type@ *indexed; for(i = 0; i < n; i++, indx += isindex, value += isb) { - indexed = (@type@ *)(ip1 + is1 * indx[0]); + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); @type@ in2 = *(@type@ *)value; if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); - indexed[0] = 0; + *indexed = 0; } else { - indexed[0] = indexed[0] / in2; + *indexed = *indexed / in2; } } return 0; diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 5f199129f0c3..fa10e154fe53 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -5906,10 +5906,6 @@ trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, int buffersize=0, errormask = 0; int res; char *args[3]; - const char * ufunc_name = ufunc_get_name_cstr((PyUFuncObject *)context->caller); - if (_get_bufsize_errmask(NULL, ufunc_name, &buffersize, &errormask) < 0) { - return -1; - } npy_intp dimensions = iter->size; npy_intp steps[3]; args[0] = (char *) iter->baseoffset; @@ -5929,6 +5925,12 @@ trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, } if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + const char * ufunc_name = + 
ufunc_get_name_cstr((PyUFuncObject *)context->caller); + if (_get_bufsize_errmask(NULL, ufunc_name, + &buffersize, &errormask) < 0) { + return -1; + } res = _check_ufunc_fperr(errormask, NULL, ufunc_name); } return res; From f064552773a10f6f2f9600c6c5e01c4424cbe26c Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 6 Feb 2023 15:02:25 +0100 Subject: [PATCH 13/20] BUG: Clean up reference handling in `ufunc.at` a little. --- numpy/core/src/umath/ufunc_object.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index fa10e154fe53..551315fa4ca4 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -5887,6 +5887,7 @@ static inline PyArrayObject * new_array_op(PyArrayObject *op_array, char *data) { npy_intp dims[1] = {1}; + Py_INCREF(PyArray_DESCR(op_array)); /* NewFromDescr steals a reference */ PyObject *r = PyArray_NewFromDescr(&PyArray_Type, PyArray_DESCR(op_array), 1, dims, NULL, data, NPY_ARRAY_WRITEABLE, NULL); @@ -6033,14 +6034,11 @@ ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, } array_operands[0] = new_array_op(op1_array, iter->dataptr); if (iter2 != NULL) { - Py_INCREF(PyArray_DESCR(op2_array)); array_operands[1] = new_array_op(op2_array, PyArray_ITER_DATA(iter2)); - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[2] = new_array_op(op1_array, iter->dataptr); nop = 3; } else { - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[1] = new_array_op(op1_array, iter->dataptr); array_operands[2] = NULL; nop = 2; @@ -6284,8 +6282,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) PyArrayMethodObject *ufuncimpl = NULL; { /* Do all the dtype handling and find the correct ufuncimpl */ - Py_INCREF(PyArray_DESCR(op1_array)); - PyArrayObject *tmp_operands[3] = {NULL, NULL, NULL}; PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; From 6cf9cc91233e63c2f2aa26166a50cac84574d67c Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 6 Feb 2023 15:23:18 +0100 Subject: [PATCH 14/20] BUG: Deal with casting and non-contig/1-D indices --- numpy/core/src/umath/ufunc_object.c | 14 ++++++++++---- numpy/core/tests/test_ufunc.py | 8 ++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 551315fa4ca4..efcfa73fe3de 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -5907,15 +5907,21 @@ trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, int buffersize=0, errormask = 0; int res; char *args[3]; - npy_intp dimensions = iter->size; npy_intp steps[3]; args[0] = (char *) iter->baseoffset; - args[1] = (char *) iter->outer_ptrs[0]; args[2] = (char *)PyArray_DATA(op2_array); steps[0] = iter->fancy_strides[0]; - steps[1] = iter->outer_strides[0]; steps[2] = PyArray_STRIDE(op2_array, 0); - res = ufuncimpl->contiguous_indexed_loop(context, args, &dimensions, steps, NULL); + + npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer); + + do { + args[1] = (char *) iter->outer_ptrs[0]; + steps[1] = iter->outer_strides[0]; + + res = ufuncimpl->contiguous_indexed_loop( + context, args, inner_size, steps, NULL); + } while (res == 0 && iter->outer_next(iter->outer)); /* * An error should only be possible if `res != 0` is already set. * But this is not strictly correct since old-style ufuncs (e.g. 
`power` diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index c8d13918f003..f31482666c9b 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -2023,6 +2023,14 @@ def test_ufunc_at_inner_loops(self, typecode, ufunc): assert w_at[0].category == w_loop[0].category assert str(w_at[0].message)[:10] == str(w_loop[0].message)[:10] + def test_cast_index_fastpath(self): + arr = np.zeros(10) + values = np.ones(100000) + # index must be cast, which may be buffered in chunks: + index = np.zeros(len(values), dtype=np.uint8) + np.add.at(arr, index, values) + assert arr[0] == len(values) + def test_ufunc_at_multiD(self): a = np.arange(9).reshape(3, 3) b = np.array([[100, 100, 100], [200, 200, 200], [300, 300, 300]]) From ff97723de0ff365f5d7f2b30be5ac7987511a584 Mon Sep 17 00:00:00 2001 From: mattip Date: Mon, 6 Feb 2023 23:03:13 +0200 Subject: [PATCH 15/20] BUG: fixes from review: use iter->nd and adapt to unary ufuncs --- numpy/core/src/umath/ufunc_object.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index efcfa73fe3de..63994c2589e8 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -5039,7 +5039,7 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg)) { PyObject *thedict; PyObject *res; - + thedict = PyThreadState_GetDict(); if (thedict == NULL) { thedict = PyEval_GetBuiltins(); @@ -5909,9 +5909,14 @@ trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, char *args[3]; npy_intp steps[3]; args[0] = (char *) iter->baseoffset; - args[2] = (char *)PyArray_DATA(op2_array); steps[0] = iter->fancy_strides[0]; - steps[2] = PyArray_STRIDE(op2_array, 0); + if (ufuncimpl->nin == 1) { + args[2] = NULL; + steps[2] = 0; + } else { + args[2] = (char *)PyArray_DATA(op2_array); + steps[2] = PyArray_STRIDE(op2_array, 0); + } npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer); @@ -5932,7 +5937,7 @@ trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, } if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { - const char * ufunc_name = + const char * ufunc_name = ufunc_get_name_cstr((PyUFuncObject *)context->caller); if (_get_bufsize_errmask(NULL, ufunc_name, &buffersize, &errormask) < 0) { @@ -6435,8 +6440,8 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) */ if ((ufuncimpl->contiguous_indexed_loop != NULL) && (PyArray_NDIM(op1_array) == 1) && - (ufuncimpl->nin == 1 || PyArray_NDIM(op2_array) == 1) && - (iter->numiter == 1)) { + (op2_array != NULL && PyArray_NDIM(op2_array) == 1) && + (iter->nd == 1)) { res = trivial_at_loop(ufuncimpl, flags, iter, op1_array, op2_array, &context); From 163d3c533d37fa76727ecdee429446ad41aac7f2 Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 7 Feb 2023 10:51:58 +0200 Subject: [PATCH 16/20] BUG: improve check for 1d operands (from review) --- numpy/core/src/umath/ufunc_object.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 63994c2589e8..60efdce96d5b 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -6436,12 +6436,15 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) /* * Try to use trivial loop (1d, no casting, aligned) if * - the matching info has a indexed loop + * - idx must be exactly one integer index array * - all operands are 1d + * A future 
enhancement could loosen the restriction on 1d operands + * by adding an iteration loop inside trivial_at_loop */ if ((ufuncimpl->contiguous_indexed_loop != NULL) && (PyArray_NDIM(op1_array) == 1) && (op2_array != NULL && PyArray_NDIM(op2_array) == 1) && - (iter->nd == 1)) { + (iter->subspace_iter == NULL) && (iter->numiter == 1)) { res = trivial_at_loop(ufuncimpl, flags, iter, op1_array, op2_array, &context); From f79325e70e1a79e1fdda9edfcdebd78048e67d2c Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 7 Feb 2023 11:55:34 +0200 Subject: [PATCH 17/20] TST: add test (from review) --- numpy/core/tests/test_ufunc.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index f31482666c9b..88668c81b7fd 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -2023,6 +2023,13 @@ def test_ufunc_at_inner_loops(self, typecode, ufunc): assert w_at[0].category == w_loop[0].category assert str(w_at[0].message)[:10] == str(w_loop[0].message)[:10] + def test_ufunc_at_ellipsis(self): + # Make sure the indexed loop check does not choke on iters + # with subspaces + arr = np.zeros(5) + np.add.at(arr, slice(None), np.ones(5)) + assert_array_equal(arr, np.ones(5)) + def test_cast_index_fastpath(self): arr = np.zeros(10) values = np.ones(100000) From b80bc2d324ddbbb1fae7e690c641b62c3923ddcf Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 7 Feb 2023 12:04:45 +0200 Subject: [PATCH 18/20] ENH: fixes from review --- numpy/core/src/umath/ufunc_object.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 60efdce96d5b..35fb2aea5dad 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -5920,6 +5920,10 @@ trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer); + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)context); + } + do { args[1] = (char *) iter->outer_ptrs[0]; steps[1] = iter->outer_strides[0]; @@ -5927,14 +5931,6 @@ trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, res = ufuncimpl->contiguous_indexed_loop( context, args, inner_size, steps, NULL); } while (res == 0 && iter->outer_next(iter->outer)); - /* - * An error should only be possible if `res != 0` is already set. - * But this is not strictly correct since old-style ufuncs (e.g. `power` - * released the GIL but manually set an Exception). 
- */ - if (PyErr_Occurred()) { - res = -1; - } if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { const char * ufunc_name = @@ -6355,10 +6351,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) } } - /* - * Create a map iterator (there is no multi-mapiter, so we need at least 2 - * iterators: one for the mapping, and another for the second operand) - */ iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap( op1_array, idx, 1, op2_array); if (iter == NULL) { From 26978d72e795dc6cd2f31150e1f9d016b41dcf5c Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 7 Feb 2023 15:13:23 +0200 Subject: [PATCH 19/20] DOC: improve release snippet (from review) --- doc/release/upcoming_changes/23136.performance.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/release/upcoming_changes/23136.performance.rst b/doc/release/upcoming_changes/23136.performance.rst index 97d0e4a5db13..107241dda0f2 100644 --- a/doc/release/upcoming_changes/23136.performance.rst +++ b/doc/release/upcoming_changes/23136.performance.rst @@ -2,7 +2,7 @@ ------------------------------- Generic ``ufunc.at`` can be up to 9x faster. The conditions for this speedup: -- contiguous arguments +- operands are aligned - no casting If ufuncs with appropriate indexed loops on 1d arguments with the above @@ -11,7 +11,7 @@ Appropriate indexed loops have been added to ``add``, ``subtract``, ``multiply``, ``divide`` (and ``floor_divide``) The internal logic is similar to the logic used for regular ufuncs, which also -have a fast path for contiguous, non-casting, 1d arrays. +have fast paths. Thanks to the `D. E. Shaw group `_ for sponsoring this work. From 790426e16466af4ef05ed295a27cbf5e7ff76a14 Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 7 Feb 2023 18:00:46 +0200 Subject: [PATCH 20/20] TST, BUG: add a test for unary ufuncs, fix condition for indexed loops --- numpy/conftest.py | 2 + numpy/core/src/umath/_umath_tests.c.src | 98 ++++++++++++++++++++++++- numpy/core/src/umath/ufunc_object.c | 2 +- numpy/core/tests/test_ufunc.py | 7 ++ 4 files changed, 107 insertions(+), 2 deletions(-) diff --git a/numpy/conftest.py b/numpy/conftest.py index 8aa6587ee634..d450cc99b58c 100644 --- a/numpy/conftest.py +++ b/numpy/conftest.py @@ -40,6 +40,8 @@ "numpy-profile" if os.path.isfile(_pytest_ini) else "np.test() profile" ) +# The experimentalAPI is used in _umath_tests +os.environ["NUMPY_EXPERIMENTAL_DTYPE_API"] = "1" def pytest_configure(config): config.addinivalue_line("markers", diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src index 1bf459ce6664..624dcefda891 100644 --- a/numpy/core/src/umath/_umath_tests.c.src +++ b/numpy/core/src/umath/_umath_tests.c.src @@ -9,6 +9,9 @@ #include #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#if defined(NPY_INTERNAL_BUILD) +#undef NPY_INTERNAL_BUILD +#endif #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" #include "numpy/npy_math.h" @@ -19,6 +22,9 @@ #include "npy_cpu_features.h" #include "npy_cpu_dispatch.h" #include "numpy/npy_cpu.h" +#include "npy_import.h" +#include "numpy/experimental_dtype_api.h" +#include "dtypemeta.h" /* ***************************************************************************** @@ -300,7 +306,7 @@ static void ptr_this += stride_d; ptr_that += stride_d; } - *(@typ@ *)data_out = npy_@sqrt_func@(out); + *(@typ@ *)data_out = @sqrt_func@(out); data_that += stride_n; data_out += stride_p; } @@ -343,6 +349,50 @@ static void /**end repeat**/ +static int +INT32_negative(PyArrayMethod_Context 
*NPY_UNUSED(context), + char **args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + npy_intp di = dimensions[0]; + npy_intp i; + npy_intp is=steps[0], os=steps[1]; + char *ip=args[0], *op=args[1]; + for (i = 0; i < di; i++, ip += is, op += os) { + if (i == 3) { + *(int32_t *)op = - 100; + } else { + *(int32_t *)op = - *(int32_t *)ip; + } + } + return 0; +} + + +static int +INT32_negative_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + npy_intp is1 = steps[0], isindex = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + int32_t *indexed; + for(i = 0; i < n; i++, indx += isindex) { + indexed = (int32_t *)(ip1 + is1 * *(npy_intp *)indx); + if (i == 3) { + *indexed = -200; + } else { + *indexed = - *indexed; + } + } + return 0; +} + + + /* The following lines were generated using a slightly modified version of code_generators/generate_umath.py and adding these lines to defdict: @@ -671,6 +721,44 @@ err: return NULL; } +static int +add_INT32_negative_indexed(PyObject *module, PyObject *dict) { + if (import_experimental_dtype_api(__EXPERIMENTAL_DTYPE_VERSION) < 0) { + return -1; + } + + PyObject * negative = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 1, 1, + PyUFunc_Zero, "indexed_negative", NULL, 0); + if (negative == NULL) { + return -1; + } + PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT32); + PyArray_DTypeMeta *dtypes[] = {dtype, dtype}; + + PyType_Slot slots[] = { + {NPY_METH_contiguous_indexed_loop, INT32_negative_indexed}, + {NPY_METH_strided_loop, INT32_negative}, + {0, NULL} + }; + + PyArrayMethod_Spec spec = { + .name = "negative_indexed_loop", + .nin = 1, + .nout = 1, + .dtypes = dtypes, + .slots = slots, + .flags = NPY_METH_NO_FLOATINGPOINT_ERRORS + }; + + if (PyUFunc_AddLoopFromSpec(negative, &spec) < 0) { + Py_DECREF(negative); + return -1; + } + PyDict_SetItemString(dict, "indexed_negative", negative); + Py_DECREF(negative); + return 0; +} + static PyMethodDef UMath_TestsMethods[] = { {"test_signature", UMath_Tests_test_signature, METH_VARARGS, "Test signature parsing of ufunc. 
\n" @@ -733,5 +821,13 @@ PyMODINIT_FUNC PyInit__umath_tests(void) { "cannot load _umath_tests module."); return NULL; } + + if (add_INT32_negative_indexed(m, d) < 0) { + Py_DECREF(m); + PyErr_Print(); + PyErr_SetString(PyExc_RuntimeError, + "cannot load _umath_tests module."); + return NULL; + } return m; } diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 35fb2aea5dad..bfb4a8143e8d 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -6435,7 +6435,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) */ if ((ufuncimpl->contiguous_indexed_loop != NULL) && (PyArray_NDIM(op1_array) == 1) && - (op2_array != NULL && PyArray_NDIM(op2_array) == 1) && + (op2_array == NULL || PyArray_NDIM(op2_array) == 1) && (iter->subspace_iter == NULL) && (iter->numiter == 1)) { res = trivial_at_loop(ufuncimpl, flags, iter, op1_array, op2_array, &context); diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index 88668c81b7fd..58f9ef0b54d3 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -2030,6 +2030,13 @@ def test_ufunc_at_ellipsis(self): np.add.at(arr, slice(None), np.ones(5)) assert_array_equal(arr, np.ones(5)) + def test_ufunc_at_negative(self): + arr = np.ones(5, dtype=np.int32) + indx = np.arange(5) + umt.indexed_negative.at(arr, indx) + # If it is [-1, -1, -1, -100, 0] then the regular strided loop was used + assert np.all(arr == [-1, -1, -1, -200, -1]) + def test_cast_index_fastpath(self): arr = np.zeros(10) values = np.ones(100000)
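
For reference, the read-modify-write semantics that the contiguous indexed loops in this series implement can be sketched in pure Python; ``indexed_loop_reference`` is a hypothetical helper used only for illustration, while the real loops operate directly on raw pointers and strides:

    import numpy as np

    def indexed_loop_reference(op, a, idx, vals):
        # One read-modify-write per index entry, so duplicate indices
        # accumulate (unlike the fancy-indexed a[idx] = op(a[idx], vals)).
        for j, v in zip(idx, vals):
            a[j] = op(a[j], v)

    a = np.zeros(5)
    idx = np.array([0, 0, 3])
    vals = np.array([1.0, 2.0, 5.0])

    indexed_loop_reference(np.add, a, idx, vals)
    expected = np.zeros(5)
    np.add.at(expected, idx, vals)
    assert np.array_equal(a, expected)   # both give [3., 0., 0., 5., 0.]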