numpy
diff --git a/‎.github/workflows/linux.yml
Lines changed: 3 additions & 6 deletions b/‎.github/workflows/linux.yml
Lines changed: 3 additions & 6 deletions
diff --git a/‎benchmarks/benchmarks/bench_core.py
Lines changed: 1 addition & 1 deletion b/‎benchmarks/benchmarks/bench_core.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎numpy/_core/code_generators/generate_umath.py
Lines changed: 20 additions & 20 deletions b/‎numpy/_core/code_generators/generate_umath.py
Lines changed: 20 additions & 20 deletions
diff --git a/‎numpy/_core/meson.build
Lines changed: 24 additions & 7 deletions b/‎numpy/_core/meson.build
Lines changed: 24 additions & 7 deletions
diff --git a/‎numpy/_core/src/multiarray/arraytypes.c.src
Lines changed: 36 additions & 0 deletions b/‎numpy/_core/src/multiarray/arraytypes.c.src
Lines changed: 36 additions & 0 deletions
diff --git a/‎numpy/_core/src/multiarray/arraytypes.h.src
Lines changed: 3 additions & 0 deletions b/‎numpy/_core/src/multiarray/arraytypes.h.src
Lines changed: 3 additions & 0 deletions
diff --git a/‎numpy/_core/src/multiarray/item_selection.c
Lines changed: 11 additions & 2 deletions b/‎numpy/_core/src/multiarray/item_selection.c
Lines changed: 11 additions & 2 deletions
diff --git a/‎numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
Lines changed: 1 addition & 1 deletion b/‎numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
Lines changed: 1 addition & 1 deletion
@@ -58,22 +58,19 @@ jobs:
       MESON_ARGS: "-Dallow-noblas=true -Dcpu-baseline=none -Dcpu-dispatch=none"
     strategy:
       matrix:
-        version: ["3.11", "3.12", "3.13", "3.13t"]
+        version: ["3.11", "3.12", "3.13", "3.14-dev", "3.14t-dev"]
     steps:
     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       with:
         submodules: recursive
         fetch-tags: true
         persist-credentials: false
-    - uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86
+    - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
       with:
         python-version: ${{ matrix.version }}
-        enable-cache: false
-    - run:
-        uv pip install --python=${{ matrix.version }} pip
     # TODO: remove cython nightly install when cython does a release
     - name: Install nightly Cython
-      if: matrix.version == '3.13t'
+      if: matrix.version == '3.14t-dev'
       run: |
         pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
     - uses: ./.github/meson_actions
 
@@ -151,7 +151,7 @@ class CountNonzero(Benchmark):
     params = [
         [1, 2, 3],
         [100, 10000, 1000000],
-        [bool, np.int8, np.int16, np.int32, np.int64, str, object]
+        [bool, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, str, object]
     ]
 
     def setup(self, numaxes, size, dtype):
 
@@ -776,55 +776,55 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.arccos'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='acos', astype={'e': 'f'}),
           TD(P, f='arccos'),
           ),
 'arccosh':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.arccosh'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='acosh', astype={'e': 'f'}),
           TD(P, f='arccosh'),
           ),
 'arcsin':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.arcsin'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='asin', astype={'e': 'f'}),
           TD(P, f='arcsin'),
           ),
 'arcsinh':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.arcsinh'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='asinh', astype={'e': 'f'}),
           TD(P, f='arcsinh'),
           ),
 'arctan':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.arctan'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='atan', astype={'e': 'f'}),
           TD(P, f='arctan'),
           ),
 'arctanh':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.arctanh'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='atanh', astype={'e': 'f'}),
           TD(P, f='arctanh'),
           ),
 'cos':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.cos'),
           None,
-          TD('e', dispatch=[('loops_umath_fp', 'e')]),
+          TD('e', dispatch=[('loops_half', 'e')]),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
           TD('d', dispatch=[('loops_trigonometric', 'd')]),
           TD('g' + cmplx, f='cos'),
@@ -834,7 +834,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.sin'),
           None,
-          TD('e', dispatch=[('loops_umath_fp', 'e')]),
+          TD('e', dispatch=[('loops_half', 'e')]),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
           TD('d', dispatch=[('loops_trigonometric', 'd')]),
           TD('g' + cmplx, f='sin'),
@@ -844,31 +844,31 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.tan'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='tan', astype={'e': 'f'}),
           TD(P, f='tan'),
           ),
 'cosh':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.cosh'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='cosh', astype={'e': 'f'}),
           TD(P, f='cosh'),
           ),
 'sinh':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.sinh'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='sinh', astype={'e': 'f'}),
           TD(P, f='sinh'),
           ),
 'tanh':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.tanh'),
           None,
-          TD('e', dispatch=[('loops_umath_fp', 'e')]),
+          TD('e', dispatch=[('loops_half', 'e')]),
           TD('fd', dispatch=[('loops_hyperbolic', 'fd')]),
           TD(inexact, f='tanh', astype={'e': 'f'}),
           TD(P, f='tanh'),
@@ -877,7 +877,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.exp'),
           None,
-          TD('e', dispatch=[('loops_umath_fp', 'e')]),
+          TD('e', dispatch=[('loops_half', 'e')]),
           TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
           TD('fdg' + cmplx, f='exp'),
           TD(P, f='exp'),
@@ -886,23 +886,23 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.exp2'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='exp2', astype={'e': 'f'}),
           TD(P, f='exp2'),
           ),
 'expm1':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.expm1'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='expm1', astype={'e': 'f'}),
           TD(P, f='expm1'),
           ),
 'log':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.log'),
           None,
-          TD('e', dispatch=[('loops_umath_fp', 'e')]),
+          TD('e', dispatch=[('loops_half', 'e')]),
           TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
           TD('fdg' + cmplx, f='log'),
           TD(P, f='log'),
@@ -911,23 +911,23 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.log2'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='log2', astype={'e': 'f'}),
           TD(P, f='log2'),
           ),
 'log10':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.log10'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='log10', astype={'e': 'f'}),
           TD(P, f='log10'),
           ),
 'log1p':
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.log1p'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(inexact, f='log1p', astype={'e': 'f'}),
           TD(P, f='log1p'),
           ),
@@ -944,7 +944,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy._core.umath.cbrt'),
           None,
-          TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
+          TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
           TD(flts, f='cbrt', astype={'e': 'f'}),
           TD(P, f='cbrt'),
           ),
 
@@ -343,14 +343,26 @@ endif
 optional_function_attributes = [
   ['optimize("unroll-loops")', 'OPTIMIZE_UNROLL_LOOPS'],
   ['optimize("O3")', 'OPTIMIZE_OPT_3'],
-  ['optimize("O2")', 'OPTIMIZE_OPT_2'],
-  ['optimize("nonnull (1)")', 'NONNULL'],
+  ['nonnull(1)', 'NONNULL'],
 ]
-#foreach attr: optional_function_attributes
-#  if cc.has_function_attribute(attr[0])
-#    cdata.set10('HAVE_ATTRIBUTE_' + attr[1], true)
-#  endif
-#endforeach
+if get_option('disable-optimization') == false
+  # -Werror makes an ignored (unsupported) attribute fail the probe.
+  foreach attr: optional_function_attributes
+    test_code = '''
+      __attribute__((@0@)) void test_function(void *ptr) {
+        (void)ptr;  /* mark used; a (void*) cast would itself be an unused value */
+      }
+      int main(void) {
+        int dummy = 0;
+        test_function(&dummy);
+        return 0;
+      }
+    '''.format(attr[0])
+    if cc.compiles(test_code, name: '__attribute__((' + attr[0] + '))', args: ['-Werror', '-Wattributes'])
+      cdata.set10('HAVE_ATTRIBUTE_' + attr[1], true)
+    endif
+  endforeach
+endif
 
 # Max possible optimization flags. We pass this flags to all our dispatch-able
 # (multi_targets) sources.
@@ -1039,6 +1051,11 @@ foreach gen_mtargets : [
       LSX,
     ]
   ],
+  [
+    'loops_half.dispatch.h',
+    src_file.process('src/umath/loops_half.dispatch.c.src'),
+    [AVX512_SPR, AVX512_SKX]
+  ],
 ]
   mtargets = mod_features.multi_targets(
     gen_mtargets[0], umath_gen_headers + gen_mtargets[1],
 
@@ -2534,6 +2534,42 @@ static npy_bool
 }
 /**end repeat**/
 
+/**begin repeat
+ * Strided nonzero counters; data is dereferenced directly as the item type.
+ * #name = BOOL, BYTE, UBYTE, USHORT, SHORT, UINT, INT, ULONG, LONG, FLOAT, DOUBLE#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_uint16, npy_int16, npy_uint32, npy_int32, npy_uint64, npy_int64, npy_float, npy_double#
+ * #nonzero = _NONZERO*11#
+ */
+/* Returns how many of `count` items, `stride` bytes apart, are nonzero. */
+static npy_intp
+count_nonzero_trivial_@name@(npy_intp count, const char *data, npy_intp stride)
+{
+    npy_intp nonzero_count = 0;
+    while (count--) {
+        nonzero_count += (npy_bool) @nonzero@(*(const @type@ *)data);
+        data += stride;  /* npy_intp stride: no truncation on 64-bit builds */
+    }
+    return nonzero_count;
+}
+/**end repeat**/
+
+/* Run the typed counter for dtype_num over `count` items spaced `stride`
+ * bytes apart; returns -1 when dtype_num has no specialized counter. */
+NPY_NO_EXPORT npy_intp
+count_nonzero_trivial_dispatcher(npy_intp count, const char* data, npy_intp stride, int dtype_num) {
+    switch(dtype_num) {
+        /**begin repeat
+         *
+         * #dtypeID = NPY_BOOL, NPY_INT8, NPY_UINT8, NPY_UINT16, NPY_INT16, NPY_UINT32, NPY_INT32, NPY_UINT64, NPY_INT64, NPY_FLOAT32, NPY_FLOAT64#
+         * #name = BOOL, BYTE, UBYTE, USHORT, SHORT, UINT, INT, ULONG, LONG, FLOAT, DOUBLE#
+         */
+        case @dtypeID@:
+            return count_nonzero_trivial_@name@(count, data, stride);
+        /**end repeat**/
+    }
+    return -1;
+}
+
 /**begin repeat
  *
  * #fname = CFLOAT, CDOUBLE, CLONGDOUBLE#
 
@@ -164,4 +164,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax,
 #undef INT_not_size_named
 #undef LONGLONG_not_size_named
 
+NPY_NO_EXPORT npy_intp
+count_nonzero_trivial_dispatcher(npy_intp count, const char* data, npy_intp stride, int dtype_num);
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */
@@ -398,7 +398,7 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
     }
     ni = PyArray_SIZE(indices);
     if ((ni > 0) && (PyArray_Size((PyObject *)self) == 0)) {
-        PyErr_SetString(PyExc_IndexError, 
+        PyErr_SetString(PyExc_IndexError,
                         "cannot replace elements of an empty array");
         goto fail;
     }
@@ -2629,7 +2629,7 @@ count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
     return count;
 }
 /*
- * Counts the number of True values in a raw boolean array. This
+ * Counts the number of non-zero values in a raw int array. This
  * is a low-overhead function which does no heap allocations.
  *
  * Returns -1 on error.
@@ -2739,6 +2739,15 @@ PyArray_CountNonzero(PyArrayObject *self)
             }
         }
         else {
+            /* Special low-overhead version specific to the float types (and some others) */
+            if (PyArray_ISNOTSWAPPED(self) && PyArray_ISALIGNED(self)) {
+                npy_intp dispatched_nonzero_count = count_nonzero_trivial_dispatcher(count,
+                                                        data, stride, dtype->type_num);
+                if (dispatched_nonzero_count >= 0) {
+                    return dispatched_nonzero_count;
+                }
+            }
+
             NPY_BEGIN_THREADS_THRESHOLDED(count);
             while (count--) {
                 if (nonzero(data, self)) {
 
@@ -708,7 +708,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
 
 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/
 
-#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || defined(NPY_HAVE_F16C)
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
     #define NATIVE_FP16 1
     typedef _Float16 _npy_half;
Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,7 @@ class CountNonzero(Benchmark):`
`151`	`151`	`params = [`
`152`	`152`	`[1, 2, 3],`
`153`	`153`	`[100, 10000, 1000000],`
`154`		`- [bool, np.int8, np.int16, np.int32, np.int64, str, object]`
	`154`	`+ [bool, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, str, object]`
`155`	`155`	`]`
`156`	`156`
`157`	`157`	`def setup(self, numaxes, size, dtype):`
Original file line number	Diff line number	Diff line change
`@@ -398,7 +398,7 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,`
`398`	`398`	`}`
`399`	`399`	`ni = PyArray_SIZE(indices);`
`400`	`400`	`if ((ni > 0) && (PyArray_Size((PyObject *)self) == 0)) {`
`401`		`- PyErr_SetString(PyExc_IndexError,`
	`401`	`+ PyErr_SetString(PyExc_IndexError,`
`402`	`402`	`"cannot replace elements of an empty array");`
`403`	`403`	`goto fail;`
`404`	`404`	`}`
`@@ -2629,7 +2629,7 @@ count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)`
`2629`	`2629`	`return count;`
`2630`	`2630`	`}`
`2631`	`2631`	`/*`
`2632`		`- * Counts the number of True values in a raw boolean array. This`
	`2632`	`+ * Counts the number of non-zero values in a raw int array. This`
`2633`	`2633`	`* is a low-overhead function which does no heap allocations.`
`2634`	`2634`	`*`
`2635`	`2635`	`* Returns -1 on error.`
`@@ -2739,6 +2739,15 @@ PyArray_CountNonzero(PyArrayObject *self)`
`2739`	`2739`	`}`
`2740`	`2740`	`}`
`2741`	`2741`	`else {`
	`2742`	`+ /* Special low-overhead version specific to the float types (and some others) */`
	`2743`	`+ if (PyArray_ISNOTSWAPPED(self) && PyArray_ISALIGNED(self)) {`
	`2744`	`+ npy_intp dispatched_nonzero_count = count_nonzero_trivial_dispatcher(count,`
	`2745`	`+ data, stride, dtype->type_num);`
	`2746`	`+ if (dispatched_nonzero_count >= 0) {`
	`2747`	`+ return dispatched_nonzero_count;`
	`2748`	`+ }`
	`2749`	`+ }`
	`2750`	`+`
`2742`	`2751`	`NPY_BEGIN_THREADS_THRESHOLDED(count);`
`2743`	`2752`	`while (count--) {`
`2744`	`2753`	`if (nonzero(data, self)) {`