From 2f4ace73ee323932f2e5b9e32da696b47e7bda51 Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Sun, 15 Jun 2025 15:21:03 +0300 Subject: [PATCH 01/26] BUG: fix linting (#29210) --- numpy/_core/tests/test_multiarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py index f66109a3d8b5..b164f1dada3b 100644 --- a/numpy/_core/tests/test_multiarray.py +++ b/numpy/_core/tests/test_multiarray.py @@ -7333,7 +7333,7 @@ def apply_mode(m, mode): (m.shape[0] * 2, m.shape[1] * 2), dtype=m.dtype, order=order )[::2, ::2] retval[...] = m - return retval + return retval is_complex = np.issubdtype(dtype, np.complexfloating) m1 = self.m1.astype(dtype) + (1j if is_complex else 0) @@ -7341,7 +7341,7 @@ def apply_mode(m, mode): dot_res = np.dot(m1, m2) mo = np.zeros_like(dot_res) - for mode in itertools.product(*[modes]*3): + for mode in itertools.product(*[modes] * 3): m1_, m2_, mo_ = [apply_mode(*x) for x in zip([m1, m2, mo], mode)] assert_equal(np.matmul(m1_, m2_, out=mo_), dot_res) From aea8869549c110c46afbe75ed2e162b702d3d106 Mon Sep 17 00:00:00 2001 From: Mugundanmcw Date: Tue, 17 Jun 2025 16:09:16 +0530 Subject: [PATCH 02/26] CI: Add WoA validation setup to windows.yml --- .github/workflows/windows.yml | 74 +++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e760e37780a7..18d02081fd67 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -85,6 +85,80 @@ jobs: run: | spin test -- --timeout=600 --durations=10 + python64bit_openblas_winarm64: + name: arm64, LPARM64 OpenBLAS + runs-on: windows-11-arm + # To enable this job on a fork, comment out: + if: github.repository == 'numpy/numpy' + strategy: + fail-fast: false + matrix: + compiler-pyversion: + - ["MSVC", "3.11"] + - ["Clang-cl", "3.14t-dev"] + + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: recursive + fetch-tags: true + persist-credentials: false + + - name: Setup Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.compiler-pyversion[1] }} + architecture: arm64 + + - name: Setup MSVC + if: matrix.compiler-pyversion[0] == 'MSVC' + uses: bus1/cabuild/action/msdevshell@e22aba57d6e74891d059d66501b6b5aed8123c4d # v1 + with: + architecture: arm64 + + - name: Install build dependencies from PyPI + run: | + pip install -r requirements/build_requirements.txt + + - name: Install pkg-config + run: | + choco install -y --stoponfirstfailure --checksum 6004DF17818F5A6DBF19CB335CC92702 pkgconfiglite + echo "PKG_CONFIG_PATH=${{ github.workspace }}/.openblas" >> $env:GITHUB_ENV + + - name: Install Clang-cl + if: matrix.compiler-pyversion[0] == 'Clang-cl' + run: | + uses: ./.github/windows_arm64_steps + + - name: Install NumPy (MSVC) + if: matrix.compiler-pyversion[0] == 'MSVC' + run: | + pip install -r requirements/ci_requirements.txt + spin build --with-scipy-openblas=32 -j2 -- --vsenv + + - name: Install NumPy (Clang-cl) + if: matrix.compiler-pyversion[0] == 'Clang-cl' + run: | + "[binaries]","c = 'clang-cl'","cpp = 'clang-cl'","ar = 'llvm-lib'","c_ld = 'lld-link'","cpp_ld = 'lld-link'" | Out-File $PWD/clang-cl-arm64.ini -Encoding ascii + pip install -r requirements/ci_requirements.txt + spin build --with-scipy-openblas=32 -j2 -- --vsenv --native-file=$PWD/clang-cl-arm64.ini + + - name: Meson Log + 
shell: bash + if: ${{ failure() }} + run: | + cat build/meson-logs/meson-log.txt + + - name: Install test dependencies + run: | + python -m pip install -r requirements/test_requirements.txt + python -m pip install threadpoolctl + + - name: Run test suite + run: | + spin test -- --timeout=600 --durations=10 + msvc_python_no_openblas: name: MSVC, ${{ matrix.architecture }} Python , no BLAS runs-on: ${{ matrix.os }} From d8c4f2ba8604fbb0e7f22a5df8ea3a355a53385e Mon Sep 17 00:00:00 2001 From: Mugundanmcw Date: Tue, 17 Jun 2025 16:12:41 +0530 Subject: [PATCH 03/26] CI: Create action.yml for LLVM Win-ARM64 as reusable blocks --- .github/windows_arm64_steps /action.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/windows_arm64_steps /action.yml diff --git a/.github/windows_arm64_steps /action.yml b/.github/windows_arm64_steps /action.yml new file mode 100644 index 000000000000..86517b6246e7 --- /dev/null +++ b/.github/windows_arm64_steps /action.yml @@ -0,0 +1,16 @@ +name: Build Dependencies(Win-ARM64) +description: "Setup LLVM for Win-ARM64 builds" + +runs: + using: "composite" + steps: + - name: Install LLVM + shell: pwsh + run: | + Invoke-WebRequest https://github.com/llvm/llvm-project/releases/download/llvmorg-20.1.6/LLVM-20.1.6-woa64.exe -UseBasicParsing -OutFile LLVM-woa64.exe + Start-Process -FilePath ".\LLVM-woa64.exe" -ArgumentList "/S" -Wait + echo "C:\Program Files\LLVM\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CC=clang-cl" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + echo "CXX=clang-cl" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + echo "FC=flang-new" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + From f02bb58cb634286739a1c5eb9dc35bc0e086efa7 Mon Sep 17 00:00:00 2001 From: Mugundanmcw Date: Tue, 17 Jun 2025 16:16:56 +0530 Subject: [PATCH 04/26] CI: Modify wheel.yml to use clang-cl for Win-ARM64 --- .github/workflows/wheels.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 68352eb1fc7c..ab6bbb618899 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -127,11 +127,9 @@ jobs: with: architecture: 'x86' - - name: Setup MSVC arm64 + - name: Setup LLVM for Windows ARM64 if: ${{ matrix.buildplat[1] == 'win_arm64' }} - uses: bus1/cabuild/action/msdevshell@e22aba57d6e74891d059d66501b6b5aed8123c4d # v1 - with: - architecture: 'arm64' + uses: ./.github/windows_arm64_steps - name: pkg-config-for-win run: | From fb0dff6061addfa3b7d2cba9ecc4dfedf5dad954 Mon Sep 17 00:00:00 2001 From: Mugundanmcw Date: Tue, 17 Jun 2025 16:32:03 +0530 Subject: [PATCH 05/26] CI: fix action.yml naming --- .github/{windows_arm64_steps => windows_arm64_steps}/action.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{windows_arm64_steps => windows_arm64_steps}/action.yml (100%) diff --git a/.github/windows_arm64_steps /action.yml b/.github/windows_arm64_steps/action.yml similarity index 100% rename from .github/windows_arm64_steps /action.yml rename to .github/windows_arm64_steps/action.yml From f36c0daa423244ca6887ab426e3c6149a13a2667 Mon Sep 17 00:00:00 2001 From: Mugundanmcw Date: Tue, 17 Jun 2025 16:40:39 +0530 Subject: [PATCH 06/26] CI: Fix reusable LLVM block --- .github/workflows/windows.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 18d02081fd67..e723b787a5de 100644 --- 
a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -128,8 +128,7 @@ jobs:
 
     - name: Install Clang-cl
       if: matrix.compiler-pyversion[0] == 'Clang-cl'
-      run: |
-        uses: ./.github/windows_arm64_steps
+      uses: ./.github/windows_arm64_steps
 
     - name: Install NumPy (MSVC)
       if: matrix.compiler-pyversion[0] == 'MSVC'

From 03d57303675386998ad005d28c7c3ca7177694d1 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Tue, 17 Jun 2025 17:49:15 +0200
Subject: [PATCH 07/26] MAINT: Fix some undef warnings (#29216)

As noted by Chuck in gh-29138, there are some undef warnings that seem
not nice; this should fix them.

The fact that `NPY_LONG`, etc. are not defined at macro expansion time
is a bit of a trap; maybe it would be nice to have CI fail for this...
---
 numpy/_core/src/_simd/_simd.c                             | 2 +-
 numpy/_core/src/multiarray/descriptor.c                   | 2 +-
 numpy/_core/src/umath/_operand_flag_tests.c               | 2 +-
 numpy/_core/src/umath/_rational_tests.c                   | 2 +-
 numpy/_core/src/umath/_struct_ufunc_tests.c               | 2 +-
 numpy/_core/src/umath/_umath_tests.c.src                  | 2 +-
 .../_core/tests/examples/limited_api/limited_api_latest.c | 8 ++++----
 numpy/f2py/rules.py                                       | 2 +-
 numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c        | 2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/numpy/_core/src/_simd/_simd.c b/numpy/_core/src/_simd/_simd.c
index 2f0a5df6375c..d25d7bbf1c38 100644
--- a/numpy/_core/src/_simd/_simd.c
+++ b/numpy/_core/src/_simd/_simd.c
@@ -88,7 +88,7 @@ PyMODINIT_FUNC PyInit__simd(void)
     NPY_MTARGETS_CONF_DISPATCH(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
     NPY_MTARGETS_CONF_BASELINE(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
 
-#if Py_GIL_DISABLED
+#ifdef Py_GIL_DISABLED
     // signal this module supports running with the GIL disabled
     PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED);
 #endif
diff --git a/numpy/_core/src/multiarray/descriptor.c b/numpy/_core/src/multiarray/descriptor.c
index 5708e5c6ecb7..f520e3c4bceb 100644
--- a/numpy/_core/src/multiarray/descriptor.c
+++ b/numpy/_core/src/multiarray/descriptor.c
@@ -2095,7 +2095,7 @@ static PyMemberDef arraydescr_members[] = {
     {"alignment",
         T_PYSSIZET, offsetof(PyArray_Descr, alignment), READONLY, NULL},
     {"flags",
-#if NPY_ULONGLONG == NPY_UINT64
+#if NPY_SIZEOF_LONGLONG == 8
         T_ULONGLONG, offsetof(PyArray_Descr, flags), READONLY, NULL},
 #else
     #error Assuming long long is 64bit, if not replace with getter function.
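
For reference, a minimal demonstration of the warning class silenced by this
patch (the macro name is illustrative; any compiler with -Wundef enabled
behaves this way):

    /* cc -Wundef -c demo.c */
    #if SOME_FEATURE      /* warns: "SOME_FEATURE" is not defined,
                             evaluates to 0; easy to mistake for a real check */
    int feature_code;
    #endif

    #ifdef SOME_FEATURE   /* no warning: #ifdef only tests definedness */
    int other_feature_code;
    #endif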
diff --git a/numpy/_core/src/umath/_operand_flag_tests.c b/numpy/_core/src/umath/_operand_flag_tests.c index 9747b7946512..5cdff6220280 100644 --- a/numpy/_core/src/umath/_operand_flag_tests.c +++ b/numpy/_core/src/umath/_operand_flag_tests.c @@ -77,7 +77,7 @@ PyMODINIT_FUNC PyInit__operand_flag_tests(void) ((PyUFuncObject*)ufunc)->iter_flags = NPY_ITER_REDUCE_OK; PyModule_AddObject(m, "inplace_add", (PyObject*)ufunc); -#if Py_GIL_DISABLED +#ifdef Py_GIL_DISABLED // signal this module supports running with the GIL disabled PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif diff --git a/numpy/_core/src/umath/_rational_tests.c b/numpy/_core/src/umath/_rational_tests.c index a95c89b373df..d257bc22d051 100644 --- a/numpy/_core/src/umath/_rational_tests.c +++ b/numpy/_core/src/umath/_rational_tests.c @@ -1355,7 +1355,7 @@ PyMODINIT_FUNC PyInit__rational_tests(void) { GCD_LCM_UFUNC(gcd,NPY_INT64,"greatest common denominator of two integers"); GCD_LCM_UFUNC(lcm,NPY_INT64,"least common multiple of two integers"); -#if Py_GIL_DISABLED +#ifdef Py_GIL_DISABLED // signal this module supports running with the GIL disabled PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif diff --git a/numpy/_core/src/umath/_struct_ufunc_tests.c b/numpy/_core/src/umath/_struct_ufunc_tests.c index 8edbdc00b6f3..56c4be117e44 100644 --- a/numpy/_core/src/umath/_struct_ufunc_tests.c +++ b/numpy/_core/src/umath/_struct_ufunc_tests.c @@ -157,7 +157,7 @@ PyMODINIT_FUNC PyInit__struct_ufunc_tests(void) PyDict_SetItemString(d, "add_triplet", add_triplet); Py_DECREF(add_triplet); -#if Py_GIL_DISABLED +#ifdef Py_GIL_DISABLED // signal this module supports running with the GIL disabled PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif diff --git a/numpy/_core/src/umath/_umath_tests.c.src b/numpy/_core/src/umath/_umath_tests.c.src index 9f2818d14526..845f51ebc94f 100644 --- a/numpy/_core/src/umath/_umath_tests.c.src +++ b/numpy/_core/src/umath/_umath_tests.c.src @@ -944,7 +944,7 @@ PyMODINIT_FUNC PyInit__umath_tests(void) { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#if Py_GIL_DISABLED +#ifdef Py_GIL_DISABLED // signal this module supports running with the GIL disabled PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif diff --git a/numpy/_core/tests/examples/limited_api/limited_api_latest.c b/numpy/_core/tests/examples/limited_api/limited_api_latest.c index 13668f2f0ebf..92d83ea977a1 100644 --- a/numpy/_core/tests/examples/limited_api/limited_api_latest.c +++ b/numpy/_core/tests/examples/limited_api/limited_api_latest.c @@ -1,11 +1,11 @@ -#if Py_LIMITED_API != PY_VERSION_HEX & 0xffff0000 - # error "Py_LIMITED_API not defined to Python major+minor version" -#endif - #include #include #include +#if Py_LIMITED_API != PY_VERSION_HEX & 0xffff0000 + # error "Py_LIMITED_API not defined to Python major+minor version" +#endif + static PyModuleDef moduledef = { .m_base = PyModuleDef_HEAD_INIT, .m_name = "limited_api_latest" diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py index 667ef287f92b..68c49e60028e 100644 --- a/numpy/f2py/rules.py +++ b/numpy/f2py/rules.py @@ -286,7 +286,7 @@ #initcommonhooks# #interface_usercode# -#if Py_GIL_DISABLED +#ifdef Py_GIL_DISABLED // signal whether this module supports running with the GIL disabled PyUnstable_Module_SetGIL(m , #gil_used#); #endif diff --git a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c index b66672a43e21..25866f1a40ec 100644 --- 
a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c +++ b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c @@ -223,7 +223,7 @@ PyMODINIT_FUNC PyInit_test_array_from_pyobj_ext(void) { on_exit(f2py_report_on_exit,(void*)"array_from_pyobj.wrap.call"); #endif -#if Py_GIL_DISABLED +#ifdef Py_GIL_DISABLED // signal whether this module supports running with the GIL disabled PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif From 3f99204758206967cce4af0f460de4946443f62d Mon Sep 17 00:00:00 2001 From: Joren Hammudoglu Date: Tue, 17 Jun 2025 19:30:32 +0200 Subject: [PATCH 08/26] MAINT: bump `mypy` to `1.16.1` (#29219) --- environment.yml | 2 +- requirements/test_requirements.txt | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index 770a83218133..75c6626abaf8 100644 --- a/environment.yml +++ b/environment.yml @@ -25,7 +25,7 @@ dependencies: - hypothesis # For type annotations - typing_extensions>=4.5.0 - - mypy=1.16.0 + - mypy=1.16.1 - orjson # makes mypy faster # For building docs - sphinx>=4.5.0 diff --git a/requirements/test_requirements.txt b/requirements/test_requirements.txt index 17260753db4a..e50919ef4f7b 100644 --- a/requirements/test_requirements.txt +++ b/requirements/test_requirements.txt @@ -12,9 +12,8 @@ pytest-timeout # For testing types. Notes on the restrictions: # - Mypy relies on C API features not present in PyPy # NOTE: Keep mypy in sync with environment.yml -mypy==1.16.0; platform_python_implementation != "PyPy" +mypy==1.16.1; platform_python_implementation != "PyPy" typing_extensions>=4.5.0 # for optional f2py encoding detection charset-normalizer tzdata - From f6a17f099ba3ae2acabc288bf2297fd821725ffc Mon Sep 17 00:00:00 2001 From: Joren Hammudoglu Date: Tue, 17 Jun 2025 19:31:38 +0200 Subject: [PATCH 09/26] TYP: Workaround for a mypy issue in ``ndarray.__iter__`` (#29218) --- numpy/__init__.pyi | 12 ++++++++---- numpy/typing/tests/data/reveal/ndarray_misc.pyi | 10 ++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi index 272e52f88e83..ce879e208e49 100644 --- a/numpy/__init__.pyi +++ b/numpy/__init__.pyi @@ -807,6 +807,7 @@ _RealNumberT = TypeVar("_RealNumberT", bound=floating | integer) _FloatingT_co = TypeVar("_FloatingT_co", bound=floating, default=floating, covariant=True) _IntegerT = TypeVar("_IntegerT", bound=integer) _IntegerT_co = TypeVar("_IntegerT_co", bound=integer, default=integer, covariant=True) +_NonObjectScalarT = TypeVar("_NonObjectScalarT", bound=np.bool | number | flexible | datetime64 | timedelta64) _NBit = TypeVar("_NBit", bound=NBitBase, default=Any) # pyright: ignore[reportDeprecated] _NBit1 = TypeVar("_NBit1", bound=NBitBase, default=Any) # pyright: ignore[reportDeprecated] @@ -2572,10 +2573,13 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeT_co, _DTypeT_co]): def __len__(self) -> int: ... def __contains__(self, value: object, /) -> builtins.bool: ... - @overload # == 1-d & object_ - def __iter__(self: ndarray[tuple[int], dtype[object_]], /) -> Iterator[Any]: ... - @overload # == 1-d - def __iter__(self: ndarray[tuple[int], dtype[_ScalarT]], /) -> Iterator[_ScalarT]: ... + # NOTE: This weird `Never` tuple works around a strange mypy issue where it assigns + # `tuple[int]` to `tuple[Never]` or `tuple[int, int]` to `tuple[Never, Never]`. + # This way the bug only occurs for 9-D arrays, which are probably not very common. 
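+    # A conforming type checker can never actually select the `Never` overload
+    # below, since no concrete shape type is assignable to `Never`; only the
+    # mypy bug described above makes it match.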
+ @overload + def __iter__(self: ndarray[tuple[Never, Never, Never, Never, Never, Never, Never, Never, Never]], /) -> Iterator[Any]: ... + @overload # == 1-d & dtype[T \ object_] + def __iter__(self: ndarray[tuple[int], dtype[_NonObjectScalarT]], /) -> Iterator[_NonObjectScalarT]: ... @overload # >= 2-d def __iter__(self: ndarray[tuple[int, int, *tuple[int, ...]], dtype[_ScalarT]], /) -> Iterator[NDArray[_ScalarT]]: ... @overload # ?-d diff --git a/numpy/typing/tests/data/reveal/ndarray_misc.pyi b/numpy/typing/tests/data/reveal/ndarray_misc.pyi index 465ce7679b49..4cbb90621ca9 100644 --- a/numpy/typing/tests/data/reveal/ndarray_misc.pyi +++ b/numpy/typing/tests/data/reveal/ndarray_misc.pyi @@ -6,6 +6,7 @@ function-based counterpart in `../from_numeric.py`. """ +from collections.abc import Iterator import ctypes as ct import operator from types import ModuleType @@ -29,6 +30,10 @@ AR_m: npt.NDArray[np.timedelta64] AR_U: npt.NDArray[np.str_] AR_V: npt.NDArray[np.void] +AR_f8_1d: np.ndarray[tuple[int], np.dtype[np.float64]] +AR_f8_2d: np.ndarray[tuple[int, int], np.dtype[np.float64]] +AR_f8_3d: np.ndarray[tuple[int, int, int], np.dtype[np.float64]] + ctypes_obj = AR_f8.ctypes assert_type(AR_f8.__dlpack__(), CapsuleType) @@ -235,3 +240,8 @@ assert_type(AR_m.to_device("cpu"), npt.NDArray[np.timedelta64]) assert_type(f8.__array_namespace__(), ModuleType) assert_type(AR_f8.__array_namespace__(), ModuleType) + +assert_type(iter(AR_f8), Iterator[Any]) # any-D +assert_type(iter(AR_f8_1d), Iterator[np.float64]) # 1-D +assert_type(iter(AR_f8_2d), Iterator[npt.NDArray[np.float64]]) # 2-D +assert_type(iter(AR_f8_3d), Iterator[npt.NDArray[np.float64]]) # 3-D From 32f4afaad7cb035e1b46987285d34c27c57dbe57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Tue, 17 Jun 2025 19:34:26 +0200 Subject: [PATCH 10/26] ENH: improve Timsort with powersort merge-policy (#29208) Implement the improved merge policy for Timsort, as developed by Munro and Wild. Benchmarks show a significant improvement in performance. --- numpy/_core/src/npysort/timsort.cpp | 110 +++++++++++++--------------- 1 file changed, 51 insertions(+), 59 deletions(-) diff --git a/numpy/_core/src/npysort/timsort.cpp b/numpy/_core/src/npysort/timsort.cpp index 0f0f5721e7cf..9ecf88c0aeb9 100644 --- a/numpy/_core/src/npysort/timsort.cpp +++ b/numpy/_core/src/npysort/timsort.cpp @@ -39,8 +39,9 @@ #include #include -/* enough for 32 * 1.618 ** 128 elements */ -#define TIMSORT_STACK_SIZE 128 +/* enough for 32 * 1.618 ** 128 elements. + If powersort was used in all cases, 90 would suffice, as 32 * 2 ** 90 >= 32 * 1.618 ** 128 */ +#define RUN_STACK_SIZE 128 static npy_intp compute_min_run(npy_intp num) @@ -58,6 +59,7 @@ compute_min_run(npy_intp num) typedef struct { npy_intp s; /* start pointer */ npy_intp l; /* length */ + int power; /* node "level" for powersort merge strategy */ } run; /* buffer for argsort. Declared here to avoid multiple declarations. */ @@ -383,60 +385,51 @@ merge_at_(type *arr, const run *stack, const npy_intp at, buffer_ *buffer) return 0; } -template +/* See https://github.com/python/cpython/blob/ea23c897cd25702e72a04e06664f6864f07a7c5d/Objects/listsort.txt +* for a detailed explanation. +* In CPython, *num* is called *n*, but we changed it for consistency with the NumPy implementation. 
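+* In short: each boundary between two adjacent runs is assigned a "power",
+* the depth of that boundary in a nearly-optimal binary merge tree, computed
+* from the two runs' midpoints interpreted as binary fractions of num. Runs
+* are merged while the previous boundary's power exceeds the new one.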
+*/ static int -try_collapse_(type *arr, run *stack, npy_intp *stack_ptr, buffer_ *buffer) +powerloop(npy_intp s1, npy_intp n1, npy_intp n2, npy_intp num) { - int ret; - npy_intp A, B, C, top; - top = *stack_ptr; - - while (1 < top) { - B = stack[top - 2].l; - C = stack[top - 1].l; - - if ((2 < top && stack[top - 3].l <= B + C) || - (3 < top && stack[top - 4].l <= stack[top - 3].l + B)) { - A = stack[top - 3].l; - - if (A <= C) { - ret = merge_at_(arr, stack, top - 3, buffer); - - if (NPY_UNLIKELY(ret < 0)) { - return ret; - } - - stack[top - 3].l += B; - stack[top - 2] = stack[top - 1]; - --top; - } - else { - ret = merge_at_(arr, stack, top - 2, buffer); - - if (NPY_UNLIKELY(ret < 0)) { - return ret; - } - - stack[top - 2].l += C; - --top; - } + int result = 0; + npy_intp a = 2 * s1 + n1; /* 2*a */ + npy_intp b = a + n1 + n2; /* 2*b */ + for (;;) { + ++result; + if (a >= num) { /* both quotient bits are 1 */ + a -= num; + b -= num; } - else if (1 < top && B <= C) { - ret = merge_at_(arr, stack, top - 2, buffer); + else if (b >= num) { /* a/num bit is 0, b/num bit is 1 */ + break; + } + a <<= 1; + b <<= 1; + } + return result; +} +template +static int +found_new_run_(type *arr, run *stack, npy_intp *stack_ptr, npy_intp n2, + npy_intp num, buffer_ *buffer) +{ + int ret; + if (*stack_ptr > 0) { + npy_intp s1 = stack[*stack_ptr - 1].s; + npy_intp n1 = stack[*stack_ptr - 1].l; + int power = powerloop(s1, n1, n2, num); + while (*stack_ptr > 1 && stack[*stack_ptr - 2].power > power) { + ret = merge_at_(arr, stack, *stack_ptr - 2, buffer); if (NPY_UNLIKELY(ret < 0)) { return ret; } - - stack[top - 2].l += C; - --top; - } - else { - break; + stack[*stack_ptr - 2].l += stack[*stack_ptr - 1].l; + --(*stack_ptr); } + stack[*stack_ptr - 1].power = power; } - - *stack_ptr = top; return 0; } @@ -491,7 +484,7 @@ timsort_(void *start, npy_intp num) int ret; npy_intp l, n, stack_ptr, minrun; buffer_ buffer; - run stack[TIMSORT_STACK_SIZE]; + run stack[RUN_STACK_SIZE]; buffer.pw = NULL; buffer.size = 0; stack_ptr = 0; @@ -499,15 +492,14 @@ timsort_(void *start, npy_intp num) for (l = 0; l < num;) { n = count_run_((type *)start, l, num, minrun); + ret = found_new_run_((type *)start, stack, &stack_ptr, n, num, &buffer); + if (NPY_UNLIKELY(ret < 0)) + goto cleanup; + + // Push the new run onto the stack. 
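+        // (Any merges required by the powersort policy were already
+        // performed by found_new_run_ above.)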
stack[stack_ptr].s = l; stack[stack_ptr].l = n; ++stack_ptr; - ret = try_collapse_((type *)start, stack, &stack_ptr, &buffer); - - if (NPY_UNLIKELY(ret < 0)) { - goto cleanup; - } - l += n; } @@ -897,7 +889,7 @@ atimsort_(void *v, npy_intp *tosort, npy_intp num) int ret; npy_intp l, n, stack_ptr, minrun; buffer_intp buffer; - run stack[TIMSORT_STACK_SIZE]; + run stack[RUN_STACK_SIZE]; buffer.pw = NULL; buffer.size = 0; stack_ptr = 0; @@ -1371,7 +1363,7 @@ string_timsort_(void *start, npy_intp num, void *varr) size_t len = elsize / sizeof(type); int ret; npy_intp l, n, stack_ptr, minrun; - run stack[TIMSORT_STACK_SIZE]; + run stack[RUN_STACK_SIZE]; string_buffer_ buffer; /* Items that have zero size don't make sense to sort */ @@ -1800,7 +1792,7 @@ string_atimsort_(void *start, npy_intp *tosort, npy_intp num, void *varr) size_t len = elsize / sizeof(type); int ret; npy_intp l, n, stack_ptr, minrun; - run stack[TIMSORT_STACK_SIZE]; + run stack[RUN_STACK_SIZE]; buffer_intp buffer; /* Items that have zero size don't make sense to sort */ @@ -2253,7 +2245,7 @@ npy_timsort(void *start, npy_intp num, void *varr) PyArray_CompareFunc *cmp = PyDataType_GetArrFuncs(PyArray_DESCR(arr))->compare; int ret; npy_intp l, n, stack_ptr, minrun; - run stack[TIMSORT_STACK_SIZE]; + run stack[RUN_STACK_SIZE]; buffer_char buffer; /* Items that have zero size don't make sense to sort */ @@ -2689,7 +2681,7 @@ npy_atimsort(void *start, npy_intp *tosort, npy_intp num, void *varr) PyArray_CompareFunc *cmp = PyDataType_GetArrFuncs(PyArray_DESCR(arr))->compare; int ret; npy_intp l, n, stack_ptr, minrun; - run stack[TIMSORT_STACK_SIZE]; + run stack[RUN_STACK_SIZE]; buffer_intp buffer; /* Items that have zero size don't make sense to sort */ From d1c67573569885087a253120c1a9f2caf3ccf084 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Tue, 17 Jun 2025 14:56:21 -0400 Subject: [PATCH 11/26] ENH: Detect CPU features on OpenBSD ARM and PowerPC64 Also while looking at this I noticed due to a compiler warning that npy__cpu_init_features_linux() was not enabled on FreeBSD with the original commit c47e9621ebf76f8085ff5ec8b01c07921d14f6a7 thus the code was doing nothing on FreeBSD ARM. 
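
For context, a minimal sketch of the BSD side of this (helper name
illustrative): FreeBSD and OpenBSD expose the ELF auxiliary vector through
elf_aux_info() rather than glibc's getauxval(); elf_aux_info() fills a
caller-provided buffer and returns 0 on success:

    #include <sys/auxv.h>   /* AT_HWCAP; elf_aux_info() on FreeBSD/OpenBSD */

    static unsigned long
    hwcap_from_elf_aux_info(void)
    {
        unsigned long hwcap = 0;
        /* elf_aux_info() returns 0 on success, an errno value otherwise */
        if (elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) != 0) {
            return 0;   /* entry not available: report no optional features */
        }
        return hwcap;
    }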
--- numpy/_core/src/common/npy_cpu_features.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/numpy/_core/src/common/npy_cpu_features.c b/numpy/_core/src/common/npy_cpu_features.c index f15f636cdb1e..617225ec9eff 100644 --- a/numpy/_core/src/common/npy_cpu_features.c +++ b/numpy/_core/src/common/npy_cpu_features.c @@ -562,7 +562,7 @@ npy__cpu_init_features(void) #elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE) -#if defined(__linux__) || defined(__FreeBSD__) +#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) #ifdef __FreeBSD__ #include // defines PPC_FEATURE_HAS_VSX #endif @@ -585,7 +585,7 @@ static void npy__cpu_init_features(void) { memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); -#if defined(__linux__) || defined(__FreeBSD__) +#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) #ifdef __linux__ unsigned int hwcap = getauxval(AT_HWCAP); if ((hwcap & PPC_FEATURE_HAS_VSX) == 0) @@ -612,7 +612,7 @@ npy__cpu_init_features(void) npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0; -// TODO: AIX, OpenBSD +// TODO: AIX #else npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2) @@ -698,7 +698,7 @@ npy__cpu_init_features_arm8(void) npy__cpu_have[NPY_CPU_FEATURE_ASIMD] = 1; } -#if defined(__linux__) || defined(__FreeBSD__) +#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) /* * we aren't sure of what kind kernel or clib we deal with * so we play it safe @@ -709,7 +709,7 @@ npy__cpu_init_features_arm8(void) #if defined(__linux__) __attribute__((weak)) unsigned long getauxval(unsigned long); // linker should handle it #endif -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) || defined(__OpenBSD__) __attribute__((weak)) int elf_aux_info(int, void *, int); // linker should handle it static unsigned long getauxval(unsigned long k) @@ -807,7 +807,7 @@ static void npy__cpu_init_features(void) { memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); -#ifdef __linux__ +#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) if (npy__cpu_init_features_linux()) return; #endif From b6a6740e106277f4b1081210bbd158fed3e0d20c Mon Sep 17 00:00:00 2001 From: Mugundanmcw Date: Wed, 18 Jun 2025 18:16:30 +0530 Subject: [PATCH 12/26] CI: Add conditions to check hash of LLVM package --- .github/windows_arm64_steps/action.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/windows_arm64_steps/action.yml b/.github/windows_arm64_steps/action.yml index 86517b6246e7..8ecb3b8a0cdd 100644 --- a/.github/windows_arm64_steps/action.yml +++ b/.github/windows_arm64_steps/action.yml @@ -4,10 +4,16 @@ description: "Setup LLVM for Win-ARM64 builds" runs: using: "composite" steps: - - name: Install LLVM + - name: Install LLVM with checksum verification shell: pwsh run: | Invoke-WebRequest https://github.com/llvm/llvm-project/releases/download/llvmorg-20.1.6/LLVM-20.1.6-woa64.exe -UseBasicParsing -OutFile LLVM-woa64.exe + $expectedHash = "92f69a1134e32e54b07d51c6e24d9594852f6476f32c3d70471ae00fffc2d462" + $fileHash = (Get-FileHash -Path "LLVM-woa64.exe" -Algorithm SHA256).Hash + if ($fileHash -ne $expectedHash) { + Write-Error "Checksum verification failed. The downloaded file may be corrupted or tampered with." 
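+          # Fail the step here so the workflow never runs an installer that
+          # did not pass checksum verification.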
+ exit 1 + } Start-Process -FilePath ".\LLVM-woa64.exe" -ArgumentList "/S" -Wait echo "C:\Program Files\LLVM\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append echo "CC=clang-cl" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append From d52b36ee0b8bf3e7ca9eec19d8c4b41ff2c04f59 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 18 Jun 2025 19:32:32 +0200 Subject: [PATCH 13/26] strides comparison performance fix, compare discussion #29179 (#29188) --- numpy/_core/src/umath/matmul.c.src | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/matmul.c.src b/numpy/_core/src/umath/matmul.c.src index 02c4fde56bf2..6f54aeb4d968 100644 --- a/numpy/_core/src/umath/matmul.c.src +++ b/numpy/_core/src/umath/matmul.c.src @@ -27,6 +27,8 @@ ***************************************************************************** */ +#define ABS(x) ((x) < 0 ? -(x) : (x)) + #if defined(HAVE_CBLAS) /* * -1 to be conservative, in case blas internally uses a for loop with an @@ -554,9 +556,9 @@ NPY_NO_EXPORT void } else { /* matrix @ matrix * copy if not blasable, see gh-12365 & gh-23588 */ - npy_bool i1_transpose = is1_m < is1_n, - i2_transpose = is2_n < is2_p, - o_transpose = os_m < os_p; + npy_bool i1_transpose = ABS(is1_m) < ABS(is1_n), + i2_transpose = ABS(is2_n) < ABS(is2_p), + o_transpose = ABS(os_m) < ABS(os_p); npy_intp tmp_is1_m = i1_transpose ? sz : sz*dn, tmp_is1_n = i1_transpose ? sz*dm : sz, From 2b0eda38f099203bb72572a8b640b37165fc7dc5 Mon Sep 17 00:00:00 2001 From: jorenham Date: Tue, 17 Jun 2025 17:49:17 +0200 Subject: [PATCH 14/26] TYP: Support iteration of ``StringDType`` arrays --- numpy/__init__.pyi | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi index ce879e208e49..bcbe95accee6 100644 --- a/numpy/__init__.pyi +++ b/numpy/__init__.pyi @@ -2577,11 +2577,15 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeT_co, _DTypeT_co]): # `tuple[int]` to `tuple[Never]` or `tuple[int, int]` to `tuple[Never, Never]`. # This way the bug only occurs for 9-D arrays, which are probably not very common. @overload - def __iter__(self: ndarray[tuple[Never, Never, Never, Never, Never, Never, Never, Never, Never]], /) -> Iterator[Any]: ... + def __iter__( + self: ndarray[tuple[Never, Never, Never, Never, Never, Never, Never, Never, Never], Any], / + ) -> Iterator[Any]: ... @overload # == 1-d & dtype[T \ object_] def __iter__(self: ndarray[tuple[int], dtype[_NonObjectScalarT]], /) -> Iterator[_NonObjectScalarT]: ... + @overload # == 1-d & StringDType + def __iter__(self: ndarray[tuple[int], dtypes.StringDType], /) -> Iterator[str]: ... @overload # >= 2-d - def __iter__(self: ndarray[tuple[int, int, *tuple[int, ...]], dtype[_ScalarT]], /) -> Iterator[NDArray[_ScalarT]]: ... + def __iter__(self: ndarray[tuple[int, int, *tuple[int, ...]], _DTypeT], /) -> Iterator[ndarray[_AnyShape, _DTypeT]]: ... @overload # ?-d def __iter__(self, /) -> Iterator[Any]: ... From f15a116ee12ec539c7ed1fafb3a0abfb82005e0f Mon Sep 17 00:00:00 2001 From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:00:27 -0700 Subject: [PATCH 15/26] BUG: Address interaction between SME and FPSR (#29223) * BUG: Address interaction between SME and FPSR This is intended to resolve https://github.com/numpy/numpy/issues/28687 The root cause is an interaction between Arm Scalable Matrix Extension (SME) and the floating point status register (FPSR). 
As noted in Arm docs for FPSR, "On entry to or exit from Streaming SVE mode, FPSR.{IOC, DZC, OFC, UFC, IXC, IDC, QC} are set to 1 and the remaining bits are set to 0". This means that floating point status flags are all raised when SME is used, regardless of values or operations performed. These are manifesting now because Apple Silicon M4 supports SME and macOS 15.4 enables SME codepaths for Accelerate BLAS / LAPACK. However, SME / FPSR behavior is not specific to Apple Silicon M4 and will occur on non-Apple chips using SME as well. Changes add compile and runtime checks to determine whether BLAS / LAPACK might use SME (macOS / Accelerate only at the moment). If so, special handling of floating-point error (FPE) is added, which includes: - clearing FPE after some BLAS calls - short-circuiting FPE read after some BLAS calls All tests pass Performance is similar Another approach would have been to wrap all BLAS / LAPACK calls with save / restore FPE. However, it added a lot of overhead for the inner loops that utilize BLAS / LAPACK. Some benchmarks were 8x slower. * add blas_supports_fpe and ifdef check Address the linker & linter failures --- numpy/_core/meson.build | 1 + numpy/_core/src/common/blas_utils.c | 134 ++++++++++++++++++ numpy/_core/src/common/blas_utils.h | 30 ++++ numpy/_core/src/common/cblasfuncs.c | 3 +- numpy/_core/src/multiarray/multiarraymodule.c | 5 + numpy/_core/src/umath/matmul.c.src | 36 ++++- numpy/_core/tests/test_multiarray.py | 6 + numpy/testing/_private/utils.py | 10 ++ 8 files changed, 217 insertions(+), 8 deletions(-) create mode 100644 numpy/_core/src/common/blas_utils.c create mode 100644 numpy/_core/src/common/blas_utils.h diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index a4d2050122c6..cd46a20b0246 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -1117,6 +1117,7 @@ src_multiarray_umath_common = [ ] if have_blas src_multiarray_umath_common += [ + 'src/common/blas_utils.c', 'src/common/cblasfuncs.c', 'src/common/python_xerbla.c', ] diff --git a/numpy/_core/src/common/blas_utils.c b/numpy/_core/src/common/blas_utils.c new file mode 100644 index 000000000000..aaf976ed70e4 --- /dev/null +++ b/numpy/_core/src/common/blas_utils.c @@ -0,0 +1,134 @@ +#include +#include +#include + +#ifdef __APPLE__ +#include +#endif + +#include "numpy/numpyconfig.h" // NPY_VISIBILITY_HIDDEN +#include "numpy/npy_math.h" // npy_get_floatstatus_barrier +#include "blas_utils.h" + +#if NPY_BLAS_CHECK_FPE_SUPPORT + +/* Return whether we're running on macOS 15.4 or later + */ +static inline bool +is_macOS_version_15_4_or_later(void){ +#if !defined(__APPLE__) + return false; +#else + char *osProductVersion = NULL; + size_t size = 0; + bool ret = false; + + // Query how large OS version string should be + if(-1 == sysctlbyname("kern.osproductversion", NULL, &size, NULL, 0)){ + goto cleanup; + } + + osProductVersion = malloc(size + 1); + + // Get the OS version string + if(-1 == sysctlbyname("kern.osproductversion", osProductVersion, &size, NULL, 0)){ + goto cleanup; + } + + osProductVersion[size] = '\0'; + + // Parse the version string + int major = 0, minor = 0; + if(2 > sscanf(osProductVersion, "%d.%d", &major, &minor)) { + goto cleanup; + } + + if(major >= 15 && minor >= 4){ + ret = true; + } + +cleanup: + if(osProductVersion){ + free(osProductVersion); + } + + return ret; +#endif +} + +/* ARM Scalable Matrix Extension (SME) raises all floating-point error flags + * when it's used regardless of values or operations. 
As a consequence, + * when SME is used, all FPE state is lost and special handling is needed. + * + * For NumPy, SME is not currently used directly, but can be used via + * BLAS / LAPACK libraries. This function does a runtime check for whether + * BLAS / LAPACK can use SME and special handling around FPE is required. + */ +static inline bool +BLAS_can_use_ARM_SME(void) +{ +#if defined(__APPLE__) && defined(__aarch64__) && defined(ACCELERATE_NEW_LAPACK) + // ARM SME can be used by Apple's Accelerate framework for BLAS / LAPACK + // - macOS 15.4+ + // - Apple silicon M4+ + + // Does OS / Accelerate support ARM SME? + if(!is_macOS_version_15_4_or_later()){ + return false; + } + + // Does hardware support SME? + int has_SME = 0; + size_t size = sizeof(has_SME); + if(-1 == sysctlbyname("hw.optional.arm.FEAT_SME", &has_SME, &size, NULL, 0)){ + return false; + } + + if(has_SME){ + return true; + } +#endif + + // default assume SME is not used + return false; +} + +/* Static variable to cache runtime check of BLAS FPE support. + */ +static bool blas_supports_fpe = true; + +#endif // NPY_BLAS_CHECK_FPE_SUPPORT + + +NPY_VISIBILITY_HIDDEN bool +npy_blas_supports_fpe(void) +{ +#if NPY_BLAS_CHECK_FPE_SUPPORT + return blas_supports_fpe; +#else + return true; +#endif +} + +NPY_VISIBILITY_HIDDEN void +npy_blas_init(void) +{ +#if NPY_BLAS_CHECK_FPE_SUPPORT + blas_supports_fpe = !BLAS_can_use_ARM_SME(); +#endif +} + +NPY_VISIBILITY_HIDDEN int +npy_get_floatstatus_after_blas(void) +{ +#if NPY_BLAS_CHECK_FPE_SUPPORT + if(!blas_supports_fpe){ + // BLAS does not support FPE and we need to return FPE state. + // Instead of clearing and then grabbing state, just return + // that no flags are set. + return 0; + } +#endif + char *param = NULL; + return npy_get_floatstatus_barrier(param); +} diff --git a/numpy/_core/src/common/blas_utils.h b/numpy/_core/src/common/blas_utils.h new file mode 100644 index 000000000000..8c1437f88899 --- /dev/null +++ b/numpy/_core/src/common/blas_utils.h @@ -0,0 +1,30 @@ +#include + +#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN + +/* NPY_BLAS_CHECK_FPE_SUPPORT controls whether we need a runtime check + * for floating-point error (FPE) support in BLAS. + */ +#if defined(__APPLE__) && defined(__aarch64__) && defined(ACCELERATE_NEW_LAPACK) +#define NPY_BLAS_CHECK_FPE_SUPPORT 1 +#else +#define NPY_BLAS_CHECK_FPE_SUPPORT 0 +#endif + +/* Initialize BLAS environment, if needed + */ +NPY_VISIBILITY_HIDDEN void +npy_blas_init(void); + +/* Runtime check if BLAS supports floating-point errors. + * true - BLAS supports FPE and one can rely on them to indicate errors + * false - BLAS does not support FPE. Special handling needed for FPE state + */ +NPY_VISIBILITY_HIDDEN bool +npy_blas_supports_fpe(void); + +/* If BLAS supports FPE, exactly the same as npy_get_floatstatus_barrier(). + * Otherwise, we can't rely on FPE state and need special handling. 
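+ * In that case the flags are indeterminate (SME, for instance, raises all
+ * of them unconditionally), so this helper reports "no flags set" rather
+ * than surfacing garbage to NumPy's floating-point error handling.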
+ */ +NPY_VISIBILITY_HIDDEN int +npy_get_floatstatus_after_blas(void); diff --git a/numpy/_core/src/common/cblasfuncs.c b/numpy/_core/src/common/cblasfuncs.c index f9d683d812d4..66a215dfeb64 100644 --- a/numpy/_core/src/common/cblasfuncs.c +++ b/numpy/_core/src/common/cblasfuncs.c @@ -12,6 +12,7 @@ #include "numpy/arrayobject.h" #include "numpy/npy_math.h" #include "numpy/ufuncobject.h" +#include "blas_utils.h" #include "npy_cblas.h" #include "arraytypes.h" #include "common.h" @@ -693,7 +694,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2, NPY_END_ALLOW_THREADS; } - int fpes = npy_get_floatstatus_barrier((char *) result); + int fpes = npy_get_floatstatus_after_blas(); if (fpes && PyUFunc_GiveFloatingpointErrors("dot", fpes) < 0) { goto fail; } diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c index 7724756ba351..dcfb1226a0ab 100644 --- a/numpy/_core/src/multiarray/multiarraymodule.c +++ b/numpy/_core/src/multiarray/multiarraymodule.c @@ -43,6 +43,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "arraytypes.h" #include "arrayobject.h" #include "array_converter.h" +#include "blas_utils.h" #include "hashdescr.h" #include "descriptor.h" #include "dragon4.h" @@ -4781,6 +4782,10 @@ _multiarray_umath_exec(PyObject *m) { return -1; } +#if NPY_BLAS_CHECK_FPE_SUPPORT + npy_blas_init(); +#endif + #if defined(MS_WIN64) && defined(__GNUC__) PyErr_WarnEx(PyExc_Warning, "Numpy built with MINGW-W64 on Windows 64 bits is experimental, " \ diff --git a/numpy/_core/src/umath/matmul.c.src b/numpy/_core/src/umath/matmul.c.src index 6f54aeb4d968..11e014acec7f 100644 --- a/numpy/_core/src/umath/matmul.c.src +++ b/numpy/_core/src/umath/matmul.c.src @@ -16,6 +16,7 @@ +#include "blas_utils.h" #include "npy_cblas.h" #include "arraytypes.h" /* For TYPE_dot functions */ @@ -122,7 +123,7 @@ static inline void } } -NPY_NO_EXPORT void +static void @name@_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n, void *ip2, npy_intp is2_n, void *op, npy_intp op_m, @@ -158,7 +159,7 @@ NPY_NO_EXPORT void is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@)); } -NPY_NO_EXPORT void +static void @name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n, void *ip2, npy_intp is2_n, npy_intp is2_p, void *op, npy_intp os_m, npy_intp os_p, @@ -262,7 +263,7 @@ NPY_NO_EXPORT void * #IS_HALF = 0, 0, 0, 1, 0*13# */ -NPY_NO_EXPORT void +static void @TYPE@_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, void *_ip2, npy_intp is2_n, npy_intp is2_p, void *_op, npy_intp os_m, npy_intp os_p, @@ -320,7 +321,7 @@ NPY_NO_EXPORT void } /**end repeat**/ -NPY_NO_EXPORT void +static void BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, void *_ip2, npy_intp is2_n, npy_intp is2_p, void *_op, npy_intp os_m, npy_intp os_p, @@ -359,7 +360,7 @@ BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, } } -NPY_NO_EXPORT void +static void OBJECT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, void *_ip2, npy_intp is2_n, npy_intp is2_p, void *_op, npy_intp os_m, npy_intp os_p, @@ -631,6 +632,11 @@ NPY_NO_EXPORT void #endif } #if @USEBLAS@ && defined(HAVE_CBLAS) +#if NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif if (allocate_buffer) free(tmp_ip12op); #endif } @@ -655,7 +661,7 @@ NPY_NO_EXPORT void * #prefix = c, z, 0# * #USE_BLAS = 1, 1, 0# */ -NPY_NO_EXPORT void +static void @name@_dotc(char *ip1, npy_intp is1, char *ip2, npy_intp 
is2, char *op, npy_intp n, void *NPY_UNUSED(ignore)) { @@ -751,6 +757,7 @@ OBJECT_dotc(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp * CFLOAT, CDOUBLE, CLONGDOUBLE, OBJECT# * #DOT = dot*15, dotc*4# * #CHECK_PYERR = 0*18, 1# + * #CHECK_BLAS = 1*2, 0*13, 1*2, 0*2# */ NPY_NO_EXPORT void @TYPE@_vecdot(char **args, npy_intp const *dimensions, npy_intp const *steps, @@ -774,6 +781,11 @@ NPY_NO_EXPORT void } #endif } +#if @CHECK_BLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif } /**end repeat**/ @@ -789,7 +801,7 @@ NPY_NO_EXPORT void * #step1 = &oneF, &oneD# * #step0 = &zeroF, &zeroD# */ -NPY_NO_EXPORT void +static void @name@_vecmat_via_gemm(void *ip1, npy_intp is1_n, void *ip2, npy_intp is2_n, npy_intp is2_m, void *op, npy_intp os_m, @@ -880,6 +892,11 @@ NPY_NO_EXPORT void #endif } } +#if @USEBLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif } /**end repeat**/ @@ -945,5 +962,10 @@ NPY_NO_EXPORT void #endif } } +#if @USEBLAS@ && NPY_BLAS_CHECK_FPE_SUPPORT + if (!npy_blas_supports_fpe()) { + npy_clear_floatstatus_barrier((char*)args); + } +#endif } /**end repeat**/ diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py index b164f1dada3b..81e55deb3daf 100644 --- a/numpy/_core/tests/test_multiarray.py +++ b/numpy/_core/tests/test_multiarray.py @@ -31,6 +31,7 @@ from numpy.exceptions import AxisError, ComplexWarning from numpy.lib.recfunctions import repack_fields from numpy.testing import ( + BLAS_SUPPORTS_FPE, HAS_REFCOUNT, IS_64BIT, IS_PYPY, @@ -3363,6 +3364,11 @@ def test_dot(self): @pytest.mark.parametrize("dtype", [np.half, np.double, np.longdouble]) @pytest.mark.skipif(IS_WASM, reason="no wasm fp exception support") def test_dot_errstate(self, dtype): + # Some dtypes use BLAS for 'dot' operation and + # not all BLAS support floating-point errors. + if not BLAS_SUPPORTS_FPE and dtype == np.double: + pytest.skip("BLAS does not support FPE") + a = np.array([1, 1], dtype=dtype) b = np.array([-np.inf, np.inf], dtype=dtype) diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py index d7ceaeab72cc..65f4059f98fd 100644 --- a/numpy/testing/_private/utils.py +++ b/numpy/testing/_private/utils.py @@ -42,6 +42,7 @@ 'assert_no_gc_cycles', 'break_cycles', 'HAS_LAPACK64', 'IS_PYSTON', 'IS_MUSL', 'check_support_sve', 'NOGIL_BUILD', 'IS_EDITABLE', 'IS_INSTALLED', 'NUMPY_ROOT', 'run_threaded', 'IS_64BIT', + 'BLAS_SUPPORTS_FPE', ] @@ -89,6 +90,15 @@ class KnownFailureException(Exception): IS_PYPY = sys.implementation.name == 'pypy' IS_PYSTON = hasattr(sys, "pyston_version_info") HAS_REFCOUNT = getattr(sys, 'getrefcount', None) is not None and not IS_PYSTON +BLAS_SUPPORTS_FPE = True +if platform.system() == 'Darwin' or platform.machine() == 'arm64': + try: + blas = np.__config__.CONFIG['Build Dependencies']['blas'] + if blas['name'] == 'accelerate': + BLAS_SUPPORTS_FPE = False + except KeyError: + pass + HAS_LAPACK64 = numpy.linalg._umath_linalg._ilp64 IS_MUSL = False From 5cbc0bcebbe031140b63d2ffba06b73e471b43b2 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Tue, 17 Jun 2025 19:57:18 -0400 Subject: [PATCH 16/26] ENH: Detect CPU features on FreeBSD / OpenBSD RISC-V64. 
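
For context, the vector bit is derived from the extension's ISA letter; a
minimal sketch (helper name illustrative) of the check this enables on all
three systems:

    /* The 'V' (vector) extension is advertised as bit ('V' - 'A') of
     * AT_HWCAP on riscv64. */
    #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))

    static int
    riscv64_has_rvv(unsigned long hwcap)
    {
        return (hwcap & COMPAT_HWCAP_ISA_V) != 0;
    }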
---
 numpy/_core/src/common/npy_cpu_features.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/numpy/_core/src/common/npy_cpu_features.c b/numpy/_core/src/common/npy_cpu_features.c
index f15f636cdb1e..06c82fe41e27 100644
--- a/numpy/_core/src/common/npy_cpu_features.c
+++ b/numpy/_core/src/common/npy_cpu_features.c
@@ -846,22 +846,30 @@ npy__cpu_init_features(void)
 
 #elif defined(__riscv) && __riscv_xlen == 64
 
-#include <sys/auxv.h>
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+    #include <sys/auxv.h>
 
-#ifndef HWCAP_RVV
-    // https://github.com/torvalds/linux/blob/v6.8/arch/riscv/include/uapi/asm/hwcap.h#L24
-    #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
+    #ifndef HWCAP_RVV
+        // https://github.com/torvalds/linux/blob/v6.8/arch/riscv/include/uapi/asm/hwcap.h#L24
+        #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
+    #endif
 #endif
 
 static void
 npy__cpu_init_features(void)
 {
     memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
-
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+#ifdef __linux__
     unsigned int hwcap = getauxval(AT_HWCAP);
+#else
+    unsigned long hwcap;
+    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#endif
     if (hwcap & COMPAT_HWCAP_ISA_V) {
         npy__cpu_have[NPY_CPU_FEATURE_RVV] = 1;
     }
+#endif
 }
 
 /*********** Unsupported ARCH ***********/

From 7a74e12558789ecf0252dad56a77560db7411d13 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Thu, 19 Jun 2025 01:18:27 -0600
Subject: [PATCH 17/26] BUG: avoid negating unsigned integers in resize implementation (#29230)

The negation of an unsigned int underflows and creates a large positive
repeats, which leads to allocation failures and/or swapping.
---
 numpy/_core/fromnumeric.py        | 3 ++-
 numpy/_core/tests/test_numeric.py | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/numpy/_core/fromnumeric.py b/numpy/_core/fromnumeric.py
index 73dcd1ddc11d..e20d774d014d 100644
--- a/numpy/_core/fromnumeric.py
+++ b/numpy/_core/fromnumeric.py
@@ -1607,7 +1607,8 @@ def resize(a, new_shape):
         # First case must zero fill. The second would have repeats == 0.
         return np.zeros_like(a, shape=new_shape)
 
-    repeats = -(-new_size // a.size)  # ceil division
+    # ceiling division without negating new_size
+    repeats = (new_size + a.size - 1) // a.size
     a = concatenate((a,) * repeats)[:new_size]
 
     return reshape(a, new_shape)
diff --git a/numpy/_core/tests/test_numeric.py b/numpy/_core/tests/test_numeric.py
index 8a72e4bfa65d..65da65ddc9f9 100644
--- a/numpy/_core/tests/test_numeric.py
+++ b/numpy/_core/tests/test_numeric.py
@@ -79,6 +79,13 @@ def test_negative_resize(self):
         with pytest.raises(ValueError, match=r"negative"):
             np.resize(A, new_shape=new_shape)
 
+    def test_unsigned_resize(self):
+        # ensure unsigned integer sizes don't lead to underflows
+        for dt_pair in [(np.int32, np.uint32), (np.int64, np.uint64)]:
+            arr = np.array([[23, 95], [66, 37]])
+            assert_array_equal(np.resize(arr, dt_pair[0](1)),
+                               np.resize(arr, dt_pair[1](1)))
+
     def test_subclass(self):
         class MyArray(np.ndarray):
             __array_priority__ = 1.

From 35079afa808ea0fc49bb13c878150ef20c4f64fa Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Thu, 19 Jun 2025 14:21:05 +0200
Subject: [PATCH 18/26] TST: Fix test that uses uninitialized memory (#29232)

Tests should avoid this generally; this one is worse, since it can even
fail due to warnings.
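
For context, a minimal sketch of the difference (class name illustrative):

    import numpy as np

    class Sub(np.ndarray):
        __array_priority__ = -1000

    # Sub(2) goes through ndarray.__new__, which, like np.empty, returns
    # *uninitialized* memory; the values, and any floating-point warnings
    # they happen to trigger, are unpredictable.
    x = Sub(2)

    # Viewing initialized data as the subclass keeps the test deterministic.
    y = np.ones(2).view(Sub)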
--- numpy/_core/tests/test_ufunc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/_core/tests/test_ufunc.py b/numpy/_core/tests/test_ufunc.py index 21ebc02c2625..a1cd63aec523 100644 --- a/numpy/_core/tests/test_ufunc.py +++ b/numpy/_core/tests/test_ufunc.py @@ -2123,9 +2123,9 @@ class ArrayPriorityMinus1000b(ArrayPriorityBase): class ArrayPriorityMinus2000(ArrayPriorityBase): __array_priority__ = -2000 - x = ArrayPriorityMinus1000(2) - xb = ArrayPriorityMinus1000b(2) - y = ArrayPriorityMinus2000(2) + x = np.ones(2).view(ArrayPriorityMinus1000) + xb = np.ones(2).view(ArrayPriorityMinus1000b) + y = np.ones(2).view(ArrayPriorityMinus2000) assert np.add(x, y) is ArrayPriorityMinus1000 assert np.add(y, x) is ArrayPriorityMinus1000 From 4575abf725d9f14af73ee0442b05c01d0d46a7c1 Mon Sep 17 00:00:00 2001 From: Joren Hammudoglu Date: Thu, 19 Jun 2025 16:43:25 +0200 Subject: [PATCH 19/26] MAINT: bump ``ruff`` to ``0.12.0`` (#29220) --- environment.yml | 2 +- requirements/linter_requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 75c6626abaf8..17de8d3eeb5e 100644 --- a/environment.yml +++ b/environment.yml @@ -45,7 +45,7 @@ dependencies: # NOTE: breathe 4.33.0 collides with sphinx.ext.graphviz - breathe>4.33.0 # For linting - - ruff=0.11.13 + - ruff=0.12.0 - gitpython # Used in some tests - cffi diff --git a/requirements/linter_requirements.txt b/requirements/linter_requirements.txt index 45319571b561..05319a9bdb8a 100644 --- a/requirements/linter_requirements.txt +++ b/requirements/linter_requirements.txt @@ -1,3 +1,3 @@ # keep in sync with `environment.yml` -ruff==0.11.13 +ruff==0.12.0 GitPython>=3.1.30 From 7b30ce7432d0b42bb805fbf5575b0d0ba5be98ab Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 19 Jun 2025 16:47:49 +0200 Subject: [PATCH 20/26] BUG: Enforce integer limitation in concatenate (#29231) * BUG: Enforce integer limitation in concatenate Concatenate internals only deal with integer many arrays, that should be fine in practice, but a SystemError (or in principle maybe also a harder crash?) is not really. 
* skip 32bit systems --- numpy/_core/src/multiarray/multiarraymodule.c | 11 +++++++++-- numpy/_core/tests/test_shape_base.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c index dcfb1226a0ab..d4766b5af7b4 100644 --- a/numpy/_core/src/multiarray/multiarraymodule.c +++ b/numpy/_core/src/multiarray/multiarraymodule.c @@ -669,10 +669,17 @@ PyArray_ConcatenateInto(PyObject *op, } /* Convert the input list into arrays */ - narrays = PySequence_Size(op); - if (narrays < 0) { + Py_ssize_t narrays_true = PySequence_Size(op); + if (narrays_true < 0) { return NULL; } + else if (narrays_true > NPY_MAX_INT) { + PyErr_Format(PyExc_ValueError, + "concatenate() only supports up to %d arrays but got %zd.", + NPY_MAX_INT, narrays_true); + return NULL; + } + narrays = (int)narrays_true; arrays = PyArray_malloc(narrays * sizeof(arrays[0])); if (arrays == NULL) { PyErr_NoMemory(); diff --git a/numpy/_core/tests/test_shape_base.py b/numpy/_core/tests/test_shape_base.py index f7b944be08b7..8de24278fc5d 100644 --- a/numpy/_core/tests/test_shape_base.py +++ b/numpy/_core/tests/test_shape_base.py @@ -1,3 +1,5 @@ +import sys + import pytest import numpy as np @@ -29,6 +31,7 @@ assert_raises, assert_raises_regex, ) +from numpy.testing._private.utils import requires_memory class TestAtleast1d: @@ -290,6 +293,17 @@ def test_exceptions(self): # No arrays to concatenate raises ValueError assert_raises(ValueError, concatenate, ()) + @pytest.mark.slow + @pytest.mark.skipif(sys.maxsize < 2**32, reason="only problematic on 64bit platforms") + @requires_memory(2 * np.iinfo(np.intc).max) + def test_huge_list_error(self): + a = np.array([1]) + max_int = np.iinfo(np.intc).max + arrs = (a,) * (max_int + 1) + msg = fr"concatenate\(\) only supports up to {max_int} arrays but got {max_int + 1}." + with pytest.raises(ValueError, match=msg): + np.concatenate(arrs) + def test_concatenate_axis_None(self): a = np.arange(4, dtype=np.float64).reshape((2, 2)) b = list(range(3)) From 1da34ea0daaedb94d2b673c70a47f4a57883a713 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Fri, 20 Jun 2025 07:43:13 +0200 Subject: [PATCH 21/26] DEP: Deprecate setting the strides attribute of a numpy array (#28925) Deprecate setting strides (mutating) on an array. --------- Co-authored-by: Charles Harris Co-authored-by: Sebastian Berg Co-authored-by: Joren Hammudoglu --- .../upcoming_changes/28925.deprecation.rst | 9 +++++ numpy/__init__.pyi | 3 +- numpy/_core/src/multiarray/getset.c | 22 ++++++++---- numpy/_core/tests/test_api.py | 3 +- numpy/_core/tests/test_deprecations.py | 7 ++++ numpy/_core/tests/test_half.py | 6 ++-- numpy/_core/tests/test_multiarray.py | 36 ++++++++++++------- numpy/_core/tests/test_nditer.py | 4 +-- numpy/_core/tests/test_regression.py | 23 ++++++------ numpy/lib/_npyio_impl.py | 2 +- 10 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 doc/release/upcoming_changes/28925.deprecation.rst diff --git a/doc/release/upcoming_changes/28925.deprecation.rst b/doc/release/upcoming_changes/28925.deprecation.rst new file mode 100644 index 000000000000..a421839394fa --- /dev/null +++ b/doc/release/upcoming_changes/28925.deprecation.rst @@ -0,0 +1,9 @@ +Setting the ``strides`` attribute is deprecated +----------------------------------------------- +Setting the strides attribute is now deprecated since mutating +an array is unsafe if an array is shared, especially by multiple +threads. 
As an alternative, you can create a new view (no copy) via:
+* `np.lib.stride_tricks.sliding_window_view` if applicable,
+* `np.lib.stride_tricks.as_strided` for the general case,
+* or the `np.ndarray` constructor (``buffer`` is the original array) for a light-weight version.
+
diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index bcbe95accee6..2e9dea06ce6a 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -217,7 +217,7 @@ from typing import (
 # library include `typing_extensions` stubs:
 #     https://github.com/python/typeshed/blob/main/stdlib/typing_extensions.pyi
 from _typeshed import Incomplete, StrOrBytesPath, SupportsFlush, SupportsLenAndGetItem, SupportsWrite
-from typing_extensions import CapsuleType, TypeVar
+from typing_extensions import CapsuleType, TypeVar, deprecated
 
 from numpy import (
     char,
@@ -2169,7 +2169,8 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeT_co, _DTypeT_co]):
     def shape(self, value: _ShapeLike) -> None: ...
     @property
     def strides(self) -> _Shape: ...
+    @deprecated("Setting the strides on a NumPy array has been deprecated in NumPy 2.4")
     @strides.setter
     def strides(self, value: _ShapeLike) -> None: ...
     def byteswap(self, inplace: builtins.bool = ...) -> Self: ...
diff --git a/numpy/_core/src/multiarray/getset.c b/numpy/_core/src/multiarray/getset.c
index 8482b6006e3e..48da52dd3178 100644
--- a/numpy/_core/src/multiarray/getset.c
+++ b/numpy/_core/src/multiarray/getset.c
@@ -85,7 +85,7 @@ array_shape_set(PyArrayObject *self, PyObject *val, void* NPY_UNUSED(ignored))
         /* Free old dimensions and strides */
         npy_free_cache_dim_array(self);
         ((PyArrayObject_fields *)self)->nd = nd;
-        ((PyArrayObject_fields *)self)->dimensions = _dimensions;
+        ((PyArrayObject_fields *)self)->dimensions = _dimensions;
         ((PyArrayObject_fields *)self)->strides = _dimensions + nd;
 
         if (nd) {
@@ -95,7 +95,7 @@ array_shape_set(PyArrayObject *self, PyObject *val, void* NPY_UNUSED(ignored))
     }
     else {
         /* Free old dimensions and strides */
-        npy_free_cache_dim_array(self);
+        npy_free_cache_dim_array(self);
         ((PyArrayObject_fields *)self)->nd = 0;
         ((PyArrayObject_fields *)self)->dimensions = NULL;
         ((PyArrayObject_fields *)self)->strides = NULL;
@@ -116,6 +116,19 @@ array_strides_get(PyArrayObject *self, void *NPY_UNUSED(ignored))
 static int
 array_strides_set(PyArrayObject *self, PyObject *obj, void *NPY_UNUSED(ignored))
 {
+    if (obj == NULL) {
+        PyErr_SetString(PyExc_AttributeError,
+                "Cannot delete array strides");
+        return -1;
+    }
+
+    /* Deprecated NumPy 2.4, 2025-05-11 */
+    if (DEPRECATE("Setting the strides on a NumPy array has been deprecated in NumPy 2.4.\n"
+                  "As an alternative, you can create a new view using np.lib.stride_tricks.as_strided."
+ ) < 0 ) { + return -1; + } + PyArray_Dims newstrides = {NULL, -1}; PyArrayObject *new; npy_intp numbytes = 0; @@ -124,11 +137,6 @@ array_strides_set(PyArrayObject *self, PyObject *obj, void *NPY_UNUSED(ignored)) npy_intp upper_offset = 0; Py_buffer view; - if (obj == NULL) { - PyErr_SetString(PyExc_AttributeError, - "Cannot delete array strides"); - return -1; - } if (!PyArray_OptionalIntpConverter(obj, &newstrides) || newstrides.len == -1) { PyErr_SetString(PyExc_TypeError, "invalid strides"); diff --git a/numpy/_core/tests/test_api.py b/numpy/_core/tests/test_api.py index d427ac0399a2..bb21d79c472d 100644 --- a/numpy/_core/tests/test_api.py +++ b/numpy/_core/tests/test_api.py @@ -5,6 +5,7 @@ import numpy as np import numpy._core.umath as ncu from numpy._core._rational_tests import rational +from numpy.lib import stride_tricks from numpy.testing import ( HAS_REFCOUNT, assert_, @@ -558,7 +559,7 @@ def check_copy_result(x, y, ccontig, fcontig, strides=False): def test_contiguous_flags(): a = np.ones((4, 4, 1))[::2, :, :] - a.strides = a.strides[:2] + (-123,) + a = stride_tricks.as_strided(a, strides=a.strides[:2] + (-123,)) b = np.ones((2, 2, 1, 2, 2)).swapaxes(3, 4) def check_contig(a, ccontig, fcontig): diff --git a/numpy/_core/tests/test_deprecations.py b/numpy/_core/tests/test_deprecations.py index c4acbf9d2d69..7d4875d6d149 100644 --- a/numpy/_core/tests/test_deprecations.py +++ b/numpy/_core/tests/test_deprecations.py @@ -406,6 +406,13 @@ def __array_wrap__(self, arr): self.assert_deprecated(lambda: np.negative(test2)) assert test2.called +class TestDeprecatedArrayAttributeSetting(_DeprecationTestCase): + message = "Setting the .*on a NumPy array has been deprecated.*" + + def test_deprecated_strides_set(self): + x = np.eye(2) + self.assert_deprecated(setattr, args=(x, 'strides', x.strides)) + class TestDeprecatedDTypeParenthesizedRepeatCount(_DeprecationTestCase): message = "Passing in a parenthesized single number" diff --git a/numpy/_core/tests/test_half.py b/numpy/_core/tests/test_half.py index e2d6e6796db4..711c13655b7a 100644 --- a/numpy/_core/tests/test_half.py +++ b/numpy/_core/tests/test_half.py @@ -21,7 +21,7 @@ class TestHalf: def setup_method(self): # An array of all possible float16 values self.all_f16 = np.arange(0x10000, dtype=uint16) - self.all_f16.dtype = float16 + self.all_f16 = self.all_f16.view(float16) # NaN value can cause an invalid FP exception if HW is being used with np.errstate(invalid='ignore'): @@ -32,7 +32,7 @@ def setup_method(self): self.nonan_f16 = np.concatenate( (np.arange(0xfc00, 0x7fff, -1, dtype=uint16), np.arange(0x0000, 0x7c01, 1, dtype=uint16))) - self.nonan_f16.dtype = float16 + self.nonan_f16 = self.nonan_f16.view(float16) self.nonan_f32 = np.array(self.nonan_f16, dtype=float32) self.nonan_f64 = np.array(self.nonan_f16, dtype=float64) @@ -218,7 +218,7 @@ def test_half_values(self): 0x0001, 0x8001, 0x0000, 0x8000, 0x7c00, 0xfc00], dtype=uint16) - b.dtype = float16 + b = b.view(dtype=float16) assert_equal(a, b) def test_half_rounding(self): diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py index 81e55deb3daf..0faf35c64b98 100644 --- a/numpy/_core/tests/test_multiarray.py +++ b/numpy/_core/tests/test_multiarray.py @@ -29,6 +29,7 @@ from numpy._core.multiarray import _get_ndarray_c_version, dot from numpy._core.tests._locales import CommaDecimalPointLocale from numpy.exceptions import AxisError, ComplexWarning +from numpy.lib import stride_tricks from numpy.lib.recfunctions import repack_fields from 
numpy.testing import ( BLAS_SUPPORTS_FPE, @@ -382,7 +383,8 @@ def make_array(size, offset, strides): offset=offset * x.itemsize) except Exception as e: raise RuntimeError(e) - r.strides = strides = strides * x.itemsize + with pytest.warns(DeprecationWarning): + r.strides = strides * x.itemsize return r assert_equal(make_array(4, 4, -1), np.array([4, 3, 2, 1])) @@ -392,24 +394,28 @@ def make_array(size, offset, strides): assert_raises(RuntimeError, make_array, 8, 3, 1) # Check that the true extent of the array is used. # Test relies on as_strided base not exposing a buffer. - x = np.lib.stride_tricks.as_strided(np.arange(1), (10, 10), (0, 0)) + x = stride_tricks.as_strided(np.arange(1), (10, 10), (0, 0)) def set_strides(arr, strides): - arr.strides = strides + with pytest.warns(DeprecationWarning): + arr.strides = strides assert_raises(ValueError, set_strides, x, (10 * x.itemsize, x.itemsize)) # Test for offset calculations: - x = np.lib.stride_tricks.as_strided(np.arange(10, dtype=np.int8)[-1], + x = stride_tricks.as_strided(np.arange(10, dtype=np.int8)[-1], shape=(10,), strides=(-1,)) assert_raises(ValueError, set_strides, x[::-1], -1) a = x[::-1] - a.strides = 1 - a[::2].strides = 2 + with pytest.warns(DeprecationWarning): + a.strides = 1 + with pytest.warns(DeprecationWarning): + a[::2].strides = 2 # test 0d arr_0d = np.array(0) - arr_0d.strides = () + with pytest.warns(DeprecationWarning): + arr_0d.strides = () assert_raises(TypeError, set_strides, arr_0d, None) def test_fill(self): @@ -3635,7 +3641,7 @@ def test_ravel(self): a = a.reshape(2, 1, 2, 2).swapaxes(-1, -2) strides = list(a.strides) strides[1] = 123 - a.strides = strides + a = stride_tricks.as_strided(a, strides=strides) assert_(a.ravel(order='K').flags.owndata) assert_equal(a.ravel('K'), np.arange(0, 15, 2)) @@ -3644,7 +3650,7 @@ def test_ravel(self): a = a.reshape(2, 1, 2, 2).swapaxes(-1, -2) strides = list(a.strides) strides[1] = 123 - a.strides = strides + a = stride_tricks.as_strided(a, strides=strides) assert_(np.may_share_memory(a.ravel(order='K'), a)) assert_equal(a.ravel(order='K'), np.arange(2**3)) @@ -3657,7 +3663,7 @@ def test_ravel(self): # 1-element tidy strides test: a = np.array([[1]]) - a.strides = (123, 432) + a = stride_tricks.as_strided(a, strides=(123, 432)) if np.ones(1).strides == (8,): assert_(np.may_share_memory(a.ravel('K'), a)) assert_equal(a.ravel('K').strides, (a.dtype.itemsize,)) @@ -4546,7 +4552,8 @@ def test_datetime64_byteorder(self): original = np.array([['2015-02-24T00:00:00.000000000']], dtype='datetime64[ns]') original_byte_reversed = original.copy(order='K') - original_byte_reversed.dtype = original_byte_reversed.dtype.newbyteorder('S') + new_dtype = original_byte_reversed.dtype.newbyteorder('S') + original_byte_reversed = original_byte_reversed.view(dtype=new_dtype) original_byte_reversed.byteswap(inplace=True) new = pickle.loads(pickle.dumps(original_byte_reversed)) @@ -8366,10 +8373,13 @@ def test_padded_struct_array(self): self._check_roundtrip(x3) @pytest.mark.valgrind_error(reason="leaks buffer info cache temporarily.") - def test_relaxed_strides(self, c=np.ones((1, 10, 10), dtype='i8')): # noqa: B008 + def test_relaxed_strides(self, c=stride_tricks.as_strided( # noqa: B008 + np.ones((1, 10, 10), dtype='i8'), # noqa: B008 + strides=(-1, 80, 8) + ) + ): # Note: c defined as parameter so that it is persistent and leak # checks will notice gh-16934 (buffer info cache leak). 
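+        # The fixed strides are baked into the as_strided call in the default
+        # argument above, since assigning to .strides is deprecated.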
- c.strides = (-1, 80, 8) # strides need to be fixed at export assert_(memoryview(c).strides == (800, 80, 8)) diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index a29a49bfb71a..f71130f16331 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -858,7 +858,7 @@ def test_iter_nbo_align_contig(): # Unaligned input a = np.zeros((6 * 4 + 1,), dtype='i1')[1:] - a.dtype = 'f4' + a = a.view('f4') a[:] = np.arange(6, dtype='f4') assert_(not a.flags.aligned) # Without 'aligned', shouldn't copy @@ -1803,7 +1803,7 @@ def test_iter_buffering(): arrays.append(np.arange(10, dtype='f4')) # Unaligned array a = np.zeros((4 * 16 + 1,), dtype='i1')[1:] - a.dtype = 'i4' + a = a.view('i4') a[:] = np.arange(16, dtype='i4') arrays.append(a) # 4-D F-order array diff --git a/numpy/_core/tests/test_regression.py b/numpy/_core/tests/test_regression.py index fbfa9311a1dc..3d44728aafaa 100644 --- a/numpy/_core/tests/test_regression.py +++ b/numpy/_core/tests/test_regression.py @@ -12,6 +12,7 @@ import numpy as np from numpy._utils import asbytes, asunicode from numpy.exceptions import AxisError, ComplexWarning +from numpy.lib.stride_tricks import as_strided from numpy.testing import ( HAS_REFCOUNT, IS_64BIT, @@ -208,7 +209,7 @@ def test_mem_dot(self): # Dummy array to detect bad memory access: _z = np.ones(10) _dummy = np.empty((0, 10)) - z = np.lib.stride_tricks.as_strided(_z, _dummy.shape, _dummy.strides) + z = as_strided(_z, _dummy.shape, _dummy.strides) np.dot(x, np.transpose(y), out=z) assert_equal(_z, np.ones(10)) # Do the same for the built-in dot: @@ -438,19 +439,16 @@ def test_lexsort_zerolen_custom_strides(self): xs = np.array([], dtype='i8') assert np.lexsort((xs,)).shape[0] == 0 # Works - xs.strides = (16,) + xs = as_strided(xs, strides=(16,)) assert np.lexsort((xs,)).shape[0] == 0 # Was: MemoryError def test_lexsort_zerolen_custom_strides_2d(self): xs = np.array([], dtype='i8') + xt = as_strided(xs, shape=(0, 2), strides=(16, 16)) + assert np.lexsort((xt,), axis=0).shape[0] == 0 - xs.shape = (0, 2) - xs.strides = (16, 16) - assert np.lexsort((xs,), axis=0).shape[0] == 0 - - xs.shape = (2, 0) - xs.strides = (16, 16) - assert np.lexsort((xs,), axis=0).shape[0] == 2 + xt = as_strided(xs, shape=(2, 0), strides=(16, 16)) + assert np.lexsort((xt,), axis=0).shape[0] == 2 def test_lexsort_invalid_axis(self): assert_raises(AxisError, np.lexsort, (np.arange(1),), axis=2) @@ -644,7 +642,7 @@ def test_reshape_order(self): def test_reshape_zero_strides(self): # Issue #380, test reshaping of zero strided arrays a = np.ones(1) - a = np.lib.stride_tricks.as_strided(a, shape=(5,), strides=(0,)) + a = as_strided(a, shape=(5,), strides=(0,)) assert_(a.reshape(5, 1).strides[0] == 0) def test_reshape_zero_size(self): @@ -1654,7 +1652,7 @@ def test_eq_string_and_object_array(self): def test_nonzero_byteswap(self): a = np.array([0x80000000, 0x00000080, 0], dtype=np.uint32) - a.dtype = np.float32 + a = a.view(np.float32) assert_equal(a.nonzero()[0], [1]) a = a.byteswap() a = a.view(a.dtype.newbyteorder()) @@ -1878,7 +1876,8 @@ def test_alignment_update(self): # Check that alignment flag is updated on stride setting a = np.arange(10) assert_(a.flags.aligned) - a.strides = 3 + with pytest.warns(DeprecationWarning): + a.strides = 3 assert_(not a.flags.aligned) def test_ticket_1770(self): diff --git a/numpy/lib/_npyio_impl.py b/numpy/lib/_npyio_impl.py index f284eeb74834..36ead97a1aae 100644 --- a/numpy/lib/_npyio_impl.py +++ b/numpy/lib/_npyio_impl.py @@ -1730,7 
+1730,7 @@ def fromregex(file, regexp, dtype, encoding=None): # re-interpret as a single-field structured array. newdtype = np.dtype(dtype[dtype.names[0]]) output = np.array(seq, dtype=newdtype) - output.dtype = dtype + output = output.view(dtype) else: output = np.array(seq, dtype=dtype) From 20d034fff6931b97780e9f22172309d81a5e8322 Mon Sep 17 00:00:00 2001 From: Koki Watanabe <56009584+math-hiyoko@users.noreply.github.com> Date: Fri, 20 Jun 2025 22:56:31 +0900 Subject: [PATCH 22/26] ENH: np.unique: support hash based unique for string dtype (#28767) * Support NPY_STRING, NPY_UNICODE * unique for NPY_STRING and NPY_UNICODE * fix construct array * remove unnecessary include * refactor * refactoring * comment * feature: unique for NPY_VSTRING * refactoring * remove unnecessary include * add test * add error message * linter * linter * reserve bucket * remove emoji from testcase * fix testcase * remove error * fix testcase * fix testcase name * use basic_string * fix testcase * add ValueError * fix testcase * fix memory error * remove multibyte char * refactoring * add multibyte char * refactoring * fix memory error * fix GIL * fix strlen * remove PyArray_GETPTR1 * refactoring * refactoring * use optional * refactoring * refactoring * refactoring * refactoring * fix comment * linter * add doc * DOC: fix * DOC: fix format * MNT: refactoring * MNT: refactoring * ENH: Store pointers to strings in the set instead of the strings themselves. * FIX: length in memcmp * ENH: refactoring * DOC: 49sec -> 34sec * Update numpy/lib/_arraysetops_impl.py Co-authored-by: Nathan Goldbaum * DOC: Mention that hash-based np.unique returns unsorted strings * ENH: support medium and long vstrings * FIX: comment * ENH: use RAII wrapper * FIX: error handling of string packing * FIX: error handling of string packing * FIX: change default bucket size * FIX: include * FIX: cast * ENH: support equal_nan=False * FIX: function equal * FIX: check the case if pack_status doesn't return NULL * FIX: check the case if pack_status doesn't return NULL * FIX: stderr * ENH: METH_VARARGS -> METH_FASTCALL * FIX: log * FIX: release allocator * FIX: comment * FIX: delete log * ENH: implemented FNV-1a as hash function * bool -> npy_bool * FIX: cast * 34sec -> 35.1sec * fix: lint * fix: cast using const void * * fix: fix fnv1a hash * fix: lint * 35.1sec -> 33.5sec * enh: define macro HASH_TABLE_INITIAL_BUCKETS * enh: error handling of NpyString_load * enh: delete comments on GIL * fix: PyErr_SetString when NpyString_load failed * fix: PyErr_SetString -> npy_gil_error --------- Co-authored-by: Nathan Goldbaum --- doc/release/upcoming_changes/28767.change.rst | 10 + .../upcoming_changes/28767.performance.rst | 10 + numpy/_core/meson.build | 1 + numpy/_core/src/multiarray/fnv.c | 85 ++++ numpy/_core/src/multiarray/fnv.h | 26 ++ numpy/_core/src/multiarray/multiarraymodule.c | 2 +- numpy/_core/src/multiarray/unique.cpp | 368 ++++++++++++++---- numpy/_core/src/multiarray/unique.h | 3 +- numpy/lib/_arraysetops_impl.py | 3 +- numpy/lib/tests/test_arraysetops.py | 186 ++++++++- 10 files changed, 605 insertions(+), 89 deletions(-) create mode 100644 doc/release/upcoming_changes/28767.change.rst create mode 100644 doc/release/upcoming_changes/28767.performance.rst create mode 100644 numpy/_core/src/multiarray/fnv.c create mode 100644 numpy/_core/src/multiarray/fnv.h diff --git a/doc/release/upcoming_changes/28767.change.rst b/doc/release/upcoming_changes/28767.change.rst new file mode 100644 index 000000000000..ec173c3672b0 --- /dev/null +++ 
b/doc/release/upcoming_changes/28767.change.rst @@ -0,0 +1,10 @@ +``unique_values`` for string dtypes may return unsorted data +------------------------------------------------------------ +np.unique now supports hash-based duplicate removal for string dtypes. +This enhancement extends the hash-table algorithm to byte strings ('S'), +Unicode strings ('U'), and the experimental string dtype ('T', StringDType). +As a result, calling np.unique() on an array of strings will use +the faster hash-based method to obtain unique values. +Note that this hash-based method does not guarantee that the returned unique values will be sorted. +This also works for StringDType arrays containing None (missing values) +when using equal_nan=True (treating missing values as equal). diff --git a/doc/release/upcoming_changes/28767.performance.rst b/doc/release/upcoming_changes/28767.performance.rst new file mode 100644 index 000000000000..ef8ac1c3a45d --- /dev/null +++ b/doc/release/upcoming_changes/28767.performance.rst @@ -0,0 +1,10 @@ +Performance improvements to ``np.unique`` for string dtypes +----------------------------------------------------------- +The hash-based algorithm for unique extraction provides +an order-of-magnitude speedup on large string arrays. +In an internal benchmark with about 1 billion string elements, +the hash-based np.unique completed in roughly 33.5 seconds, +compared to 498 seconds with the sort-based method +– about 15× faster for unsorted unique operations on strings. +This improvement greatly reduces the time to find unique values +in very large string datasets. diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index cd46a20b0246..6098986618e4 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -1207,6 +1207,7 @@ src_multiarray = multiarray_gen_headers + [ # Remove this `arm64_exports.c` file once scipy macos arm64 build correctly # links to the arm64 npymath library, see gh-22673 'src/npymath/arm64_exports.c', + 'src/multiarray/fnv.c', ] src_umath = umath_gen_headers + [ diff --git a/numpy/_core/src/multiarray/fnv.c b/numpy/_core/src/multiarray/fnv.c new file mode 100644 index 000000000000..2b7848519e61 --- /dev/null +++ b/numpy/_core/src/multiarray/fnv.c @@ -0,0 +1,85 @@ +/* + FNV-1a hash algorithm implementation + Based on the implementation from: + https://github.com/lcn2/fnv +*/ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + +#include <Python.h> +#include "numpy/npy_common.h" +#include "fnv.h" + + +#define FNV1A_32_INIT ((npy_uint32)0x811c9dc5) +#define FNV1A_64_INIT ((npy_uint64)0xcbf29ce484222325ULL) + +/* + Compute a 32-bit FNV-1a hash of buffer + original implementation from: + https://github.com/lcn2/fnv/blob/b7fcbee95538ee6a15744e756e7e7f1c02862cb0/hash_32a.c +*/ +npy_uint32 +npy_fnv1a_32(const void *buf, size_t len, npy_uint32 hval) +{ + const unsigned char *bp = (const unsigned char *)buf; /* start of buffer */ + const unsigned char *be = bp + len; /* beyond end of buffer */ + + /* + FNV-1a hash each octet in the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (npy_uint32)*bp++; + + /* multiply by the 32 bit FNV magic prime */ + /* hval *= 0x01000193; */ + hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24); + } + + return hval; +} + +/* + Compute a 64-bit FNV-1a hash of the given data + original implementation from: + https://github.com/lcn2/fnv/blob/b7fcbee95538ee6a15744e756e7e7f1c02862cb0/hash_64a.c +*/ +npy_uint64 +npy_fnv1a_64(const void *buf, size_t
len, npy_uint64 hval) +{ + const unsigned char *bp = (const unsigned char *)buf; /* start of buffer */ + const unsigned char *be = bp + len; /* beyond end of buffer */ + + /* + FNV-1a hash each octet in the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (npy_uint64)*bp++; + + /* multiply by the 64 bit FNV magic prime */ + /* hval *= 0x100000001b3ULL; */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + + return hval; +} + +/* + * Compute a size_t FNV-1a hash of the given data + * This will use 32-bit or 64-bit hash depending on the size of size_t + */ +size_t +npy_fnv1a(const void *buf, size_t len) +{ +#if NPY_SIZEOF_SIZE_T == 8 + return (size_t)npy_fnv1a_64(buf, len, FNV1A_64_INIT); +#else /* NPY_SIZEOF_SIZE_T == 4 */ + return (size_t)npy_fnv1a_32(buf, len, FNV1A_32_INIT); +#endif +} diff --git a/numpy/_core/src/multiarray/fnv.h b/numpy/_core/src/multiarray/fnv.h new file mode 100644 index 000000000000..c76f54a645b9 --- /dev/null +++ b/numpy/_core/src/multiarray/fnv.h @@ -0,0 +1,26 @@ +/* + FNV-1a hash algorithm implementation + Based on the implementation from: + https://github.com/lcn2/fnv +*/ + +#ifndef NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_ + + +/* + Compute a size_t FNV-1a hash of the given data + This will use 32-bit or 64-bit hash depending on the size of size_t + + Parameters: + ----------- + buf - pointer to the data to be hashed + len - length of the data in bytes + + Returns: + ----------- + size_t hash value +*/ +size_t npy_fnv1a(const void *buf, size_t len); + +#endif // NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_FNV_H_ diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c index d4766b5af7b4..e80c6c0cd45c 100644 --- a/numpy/_core/src/multiarray/multiarraymodule.c +++ b/numpy/_core/src/multiarray/multiarraymodule.c @@ -4579,7 +4579,7 @@ static struct PyMethodDef array_module_methods[] = { {"from_dlpack", (PyCFunction)from_dlpack, METH_FASTCALL | METH_KEYWORDS, NULL}, {"_unique_hash", (PyCFunction)array__unique_hash, - METH_O, "Collect unique values via a hash map."}, + METH_FASTCALL | METH_KEYWORDS, "Collect unique values via a hash map."}, {NULL, NULL, 0, NULL} /* sentinel */ }; diff --git a/numpy/_core/src/multiarray/unique.cpp b/numpy/_core/src/multiarray/unique.cpp index f36acfdef49a..636f1ef0137c 100644 --- a/numpy/_core/src/multiarray/unique.cpp +++ b/numpy/_core/src/multiarray/unique.cpp @@ -1,13 +1,21 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define HASH_TABLE_INITIAL_BUCKETS 1024 #include -#include +#include +#include #include +#include #include #include "numpy/arrayobject.h" +#include "gil_utils.h" extern "C" { + #include "fnv.h" + #include "npy_argparse.h" } // This is to use RAII pattern to handle cpp exceptions while avoiding memory leaks. // Adapted from https://stackoverflow.com/a/25510879/2536294 @@ -18,77 +26,128 @@ struct FinalAction { private: F clean_; }; - template <typename F> FinalAction<F> finally(F f) { return FinalAction<F>(f); } +template <typename F> FinalAction<F> finally(F f) { return FinalAction<F>(f); } -template <typename T> +template <typename T> static PyObject* -unique(PyArrayObject *self) +unique_integer(PyArrayObject *self, npy_bool equal_nan) { - /* This function takes a numpy array and returns a numpy array containing - the unique values. - - It assumes the numpy array includes data that can be viewed as unsigned integers - of a certain size (sizeof(T)).
- - It doesn't need to know the actual type, since it needs to find unique values - among binary representations of the input data. This means it won't apply to - custom or complicated dtypes or string values. + */ NPY_ALLOW_C_API_DEF; - std::unordered_set<T> hashset; - - NpyIter *iter = NpyIter_New(self, NPY_ITER_READONLY | - NPY_ITER_EXTERNAL_LOOP | - NPY_ITER_REFS_OK | - NPY_ITER_ZEROSIZE_OK | - NPY_ITER_GROWINNER, - NPY_KEEPORDER, NPY_NO_CASTING, - NULL); - // Making sure the iterator is deallocated when the function returns, with - // or w/o an exception - auto iter_dealloc = finally([&]() { NpyIter_Deallocate(iter); }); - if (iter == NULL) { - return NULL; + NPY_ALLOW_C_API; + PyArray_Descr *descr = PyArray_DESCR(self); + Py_INCREF(descr); + NPY_DISABLE_C_API; + + PyThreadState *_save1 = PyEval_SaveThread(); + + // number of elements in the input array + npy_intp isize = PyArray_SIZE(self); + + // Reserve hashset capacity in advance to minimize reallocations and collisions. + // We use min(isize, HASH_TABLE_INITIAL_BUCKETS) as the initial bucket count: + // - Reserving for all elements (isize) may over-allocate when there are few unique values. + // - Using a moderate upper bound HASH_TABLE_INITIAL_BUCKETS(1024) keeps memory usage reasonable (4 KiB for pointers). + // See discussion: https://github.com/numpy/numpy/pull/28767#discussion_r2064267631 + std::unordered_set<T> hashset(std::min(isize, (npy_intp)HASH_TABLE_INITIAL_BUCKETS)); + + // Input array is one-dimensional, enabling efficient iteration using strides. + char *idata = PyArray_BYTES(self); + npy_intp istride = PyArray_STRIDES(self)[0]; + for (npy_intp i = 0; i < isize; i++, idata += istride) { + hashset.insert(*(T *)idata); } - NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - if (iternext == NULL) { + npy_intp length = hashset.size(); + + PyEval_RestoreThread(_save1); + NPY_ALLOW_C_API; + PyObject *res_obj = PyArray_NewFromDescr( + &PyArray_Type, + descr, + 1, // ndim + &length, // shape + NULL, // strides + NULL, // data + // This flag is needed to be able to call .sort on it. + NPY_ARRAY_WRITEABLE, // flags + NULL // obj + ); + + if (res_obj == NULL) { return NULL; } - char **dataptr = NpyIter_GetDataPtrArray(iter); - npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); - npy_intp *innersizeptr = NpyIter_GetInnerLoopSizePtr(iter); - - // release the GIL - PyThreadState *_save; - _save = PyEval_SaveThread(); - // Making sure the GIL is re-acquired when the function returns, with - // or w/o an exception - auto grab_gil = finally([&]() { PyEval_RestoreThread(_save); }); - // first we put the data in a hash map - - if (NpyIter_GetIterSize(iter) > 0) { - do { - char* data = *dataptr; - npy_intp stride = *strideptr; - npy_intp count = *innersizeptr; - - while (count--) { - hashset.insert(*((T *) data)); - data += stride; - } - } while (iternext(iter)); + NPY_DISABLE_C_API; + PyThreadState *_save2 = PyEval_SaveThread(); + auto save2_dealloc = finally([&]() { + PyEval_RestoreThread(_save2); + }); + + char *odata = PyArray_BYTES((PyArrayObject *)res_obj); + npy_intp ostride = PyArray_STRIDES((PyArrayObject *)res_obj)[0]; + // Output array is one-dimensional, enabling efficient iteration using strides.
+ for (auto it = hashset.begin(); it != hashset.end(); it++, odata += ostride) { + *(T *)odata = *it; } - npy_intp length = hashset.size(); + return res_obj; +} +template <typename T> +static PyObject* +unique_string(PyArrayObject *self, npy_bool equal_nan) +{ + /* + * Returns a new NumPy array containing the unique values of the input array of fixed size strings. + * This function uses hashing to identify uniqueness efficiently. + */ + NPY_ALLOW_C_API_DEF; NPY_ALLOW_C_API; PyArray_Descr *descr = PyArray_DESCR(self); Py_INCREF(descr); + NPY_DISABLE_C_API; + + PyThreadState *_save1 = PyEval_SaveThread(); + + // number of elements in the input array + npy_intp isize = PyArray_SIZE(self); + + // variables for the string + npy_intp itemsize = descr->elsize; + npy_intp num_chars = itemsize / sizeof(T); + auto hash = [num_chars](const T *value) -> size_t { + return npy_fnv1a(value, num_chars * sizeof(T)); + }; + auto equal = [itemsize](const T *lhs, const T *rhs) -> bool { + return std::memcmp(lhs, rhs, itemsize) == 0; + }; + + // Reserve hashset capacity in advance to minimize reallocations and collisions. + // We use min(isize, HASH_TABLE_INITIAL_BUCKETS) as the initial bucket count: + // - Reserving for all elements (isize) may over-allocate when there are few unique values. + // - Using a moderate upper bound HASH_TABLE_INITIAL_BUCKETS(1024) keeps memory usage reasonable (4 KiB for pointers). + // See discussion: https://github.com/numpy/numpy/pull/28767#discussion_r2064267631 + std::unordered_set<T *, decltype(hash), decltype(equal)> hashset( + std::min(isize, (npy_intp)HASH_TABLE_INITIAL_BUCKETS), hash, equal + ); + + // Input array is one-dimensional, enabling efficient iteration using strides. + char *idata = PyArray_BYTES(self); + npy_intp istride = PyArray_STRIDES(self)[0]; + for (npy_intp i = 0; i < isize; i++, idata += istride) { + hashset.insert((T *)idata); + } + + npy_intp length = hashset.size(); + + PyEval_RestoreThread(_save1); + NPY_ALLOW_C_API; PyObject *res_obj = PyArray_NewFromDescr( &PyArray_Type, descr, @@ -100,18 +159,147 @@ unique(PyArrayObject *self) NPY_ARRAY_WRITEABLE, // flags NULL // obj ); + + if (res_obj == NULL) { + return NULL; + } NPY_DISABLE_C_API; + PyThreadState *_save2 = PyEval_SaveThread(); + auto save2_dealloc = finally([&]() { + PyEval_RestoreThread(_save2); + }); + + char *odata = PyArray_BYTES((PyArrayObject *)res_obj); + npy_intp ostride = PyArray_STRIDES((PyArrayObject *)res_obj)[0]; + // Output array is one-dimensional, enabling efficient iteration using strides. + for (auto it = hashset.begin(); it != hashset.end(); it++, odata += ostride) { + std::memcpy(odata, *it, itemsize); + } + + return res_obj; +} + +static PyObject* +unique_vstring(PyArrayObject *self, npy_bool equal_nan) +{ + /* + * Returns a new NumPy array containing the unique values of the input array. + * This function uses hashing to identify uniqueness efficiently.
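+ * Missing values (NULL strings) hash and compare as equal when equal_nan is true, so at most one missing value appears in the result; when it is false, every missing value is kept as a distinct entry.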
+ */ + NPY_ALLOW_C_API_DEF; + NPY_ALLOW_C_API; + PyArray_Descr *descr = PyArray_DESCR(self); + Py_INCREF(descr); + NPY_DISABLE_C_API; + + PyThreadState *_save1 = PyEval_SaveThread(); + + // number of elements in the input array + npy_intp isize = PyArray_SIZE(self); + + // variables for the vstring + npy_string_allocator *in_allocator = NpyString_acquire_allocator((PyArray_StringDTypeObject *)descr); + auto hash = [equal_nan](const npy_static_string *value) -> size_t { + if (value->buf == NULL) { + if (equal_nan) { + return 0; + } else { + return std::hash<const npy_static_string *>{}(value); + } + } + return npy_fnv1a(value->buf, value->size * sizeof(char)); + }; + auto equal = [equal_nan](const npy_static_string *lhs, const npy_static_string *rhs) -> bool { + if (lhs->buf == NULL && rhs->buf == NULL) { + if (equal_nan) { + return true; + } else { + return lhs == rhs; + } + } + if (lhs->buf == NULL || rhs->buf == NULL) { + return false; + } + if (lhs->size != rhs->size) { + return false; + } + return std::memcmp(lhs->buf, rhs->buf, lhs->size) == 0; + }; + // Reserve hashset capacity in advance to minimize reallocations and collisions. + // We use min(isize, HASH_TABLE_INITIAL_BUCKETS) as the initial bucket count: + // - Reserving for all elements (isize) may over-allocate when there are few unique values. + // - Using a moderate upper bound HASH_TABLE_INITIAL_BUCKETS(1024) keeps memory usage reasonable (4 KiB for pointers). + // See discussion: https://github.com/numpy/numpy/pull/28767#discussion_r2064267631 + std::unordered_set<npy_static_string *, decltype(hash), decltype(equal)> hashset( + std::min(isize, (npy_intp)HASH_TABLE_INITIAL_BUCKETS), hash, equal + ); + + // Input array is one-dimensional, enabling efficient iteration using strides. + char *idata = PyArray_BYTES(self); + npy_intp istride = PyArray_STRIDES(self)[0]; + // unpacked_strings need to be allocated outside of the loop because of the lifetime problem. + std::vector<npy_static_string> unpacked_strings(isize, {0, NULL}); + for (npy_intp i = 0; i < isize; i++, idata += istride) { + npy_packed_static_string *packed_string = (npy_packed_static_string *)idata; + int is_null = NpyString_load(in_allocator, packed_string, &unpacked_strings[i]); + if (is_null == -1) { + npy_gil_error(PyExc_RuntimeError, + "Failed to load string from packed static string. "); + return NULL; + } + hashset.insert(&unpacked_strings[i]); + } + + NpyString_release_allocator(in_allocator); + + npy_intp length = hashset.size(); + + PyEval_RestoreThread(_save1); + NPY_ALLOW_C_API; + PyObject *res_obj = PyArray_NewFromDescr( + &PyArray_Type, + descr, + 1, // ndim + &length, // shape + NULL, // strides + NULL, // data + // This flag is needed to be able to call .sort on it.
+ NPY_ARRAY_WRITEABLE, // flags + NULL // obj + ); if (res_obj == NULL) { return NULL; } + PyArray_Descr *res_descr = PyArray_DESCR((PyArrayObject *)res_obj); + Py_INCREF(res_descr); + NPY_DISABLE_C_API; + + PyThreadState *_save2 = PyEval_SaveThread(); + auto save2_dealloc = finally([&]() { + PyEval_RestoreThread(_save2); + }); + + npy_string_allocator *out_allocator = NpyString_acquire_allocator((PyArray_StringDTypeObject *)res_descr); + auto out_allocator_dealloc = finally([&]() { + NpyString_release_allocator(out_allocator); + }); - // then we iterate through the map's keys to get the unique values - T* data = (T *)PyArray_DATA((PyArrayObject *)res_obj); - auto it = hashset.begin(); - size_t i = 0; - for (; it != hashset.end(); it++, i++) { - data[i] = *it; + char *odata = PyArray_BYTES((PyArrayObject *)res_obj); + npy_intp ostride = PyArray_STRIDES((PyArrayObject *)res_obj)[0]; + // Output array is one-dimensional, enabling efficient iteration using strides. + for (auto it = hashset.begin(); it != hashset.end(); it++, odata += ostride) { + npy_packed_static_string *packed_string = (npy_packed_static_string *)odata; + int pack_status = 0; + if ((*it)->buf == NULL) { + pack_status = NpyString_pack_null(out_allocator, packed_string); + } else { + pack_status = NpyString_pack(out_allocator, packed_string, (*it)->buf, (*it)->size); + } + if (pack_status == -1) { + // string packing failed + return NULL; + } } return res_obj; @@ -119,27 +307,30 @@ unique(PyArrayObject *self) // this map contains the functions used for each item size. -typedef std::function<PyObject *(PyArrayObject *)> function_type; +typedef std::function<PyObject *(PyArrayObject *, npy_bool)> function_type; std::unordered_map<int, function_type> unique_funcs = { - {NPY_BYTE, unique<npy_byte>}, - {NPY_UBYTE, unique<npy_ubyte>}, - {NPY_SHORT, unique<npy_short>}, - {NPY_USHORT, unique<npy_ushort>}, - {NPY_INT, unique<npy_int>}, - {NPY_UINT, unique<npy_uint>}, - {NPY_LONG, unique<npy_long>}, - {NPY_ULONG, unique<npy_ulong>}, - {NPY_LONGLONG, unique<npy_longlong>}, - {NPY_ULONGLONG, unique<npy_ulonglong>}, - {NPY_INT8, unique<npy_int8>}, - {NPY_INT16, unique<npy_int16>}, - {NPY_INT32, unique<npy_int32>}, - {NPY_INT64, unique<npy_int64>}, - {NPY_UINT8, unique<npy_uint8>}, - {NPY_UINT16, unique<npy_uint16>}, - {NPY_UINT32, unique<npy_uint32>}, - {NPY_UINT64, unique<npy_uint64>}, - {NPY_DATETIME, unique<npy_uint64>}, + {NPY_BYTE, unique_integer<npy_byte>}, + {NPY_UBYTE, unique_integer<npy_ubyte>}, + {NPY_SHORT, unique_integer<npy_short>}, + {NPY_USHORT, unique_integer<npy_ushort>}, + {NPY_INT, unique_integer<npy_int>}, + {NPY_UINT, unique_integer<npy_uint>}, + {NPY_LONG, unique_integer<npy_long>}, + {NPY_ULONG, unique_integer<npy_ulong>}, + {NPY_LONGLONG, unique_integer<npy_longlong>}, + {NPY_ULONGLONG, unique_integer<npy_ulonglong>}, + {NPY_INT8, unique_integer<npy_int8>}, + {NPY_INT16, unique_integer<npy_int16>}, + {NPY_INT32, unique_integer<npy_int32>}, + {NPY_INT64, unique_integer<npy_int64>}, + {NPY_UINT8, unique_integer<npy_uint8>}, + {NPY_UINT16, unique_integer<npy_uint16>}, + {NPY_UINT32, unique_integer<npy_uint32>}, + {NPY_UINT64, unique_integer<npy_uint64>}, + {NPY_DATETIME, unique_integer<npy_uint64>}, + {NPY_STRING, unique_string<npy_byte>}, + {NPY_UNICODE, unique_string<npy_ucs4>}, + {NPY_VSTRING, unique_vstring}, }; @@ -154,14 +345,21 @@ std::unordered_map<int, function_type> unique_funcs = { * type is unsupported or `NULL` with an error set.
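+ * equal_nan: if true (the default), missing values (StringDType NA) compare equal and are collapsed into a single entry; if false, each missing value is kept.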
*/ extern "C" NPY_NO_EXPORT PyObject * -array__unique_hash(PyObject *NPY_UNUSED(module), PyObject *arr_obj) +array__unique_hash(PyObject *NPY_UNUSED(module), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { - if (!PyArray_Check(arr_obj)) { - PyErr_SetString(PyExc_TypeError, - "_unique_hash() requires a NumPy array input."); + PyArrayObject *arr = NULL; + npy_bool equal_nan = NPY_TRUE; // default to True + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("_unique_hash", args, len_args, kwnames, + "arr", &PyArray_Converter, &arr, + "|equal_nan", &PyArray_BoolConverter, &equal_nan, + NULL, NULL, NULL + ) < 0 + ) { return NULL; } - PyArrayObject *arr = (PyArrayObject *)arr_obj; try { auto type = PyArray_TYPE(arr); @@ -170,7 +368,7 @@ array__unique_hash(PyObject *NPY_UNUSED(module), PyObject *arr_obj) Py_RETURN_NOTIMPLEMENTED; } - return unique_funcs[type](arr); + return unique_funcs[type](arr, equal_nan); } catch (const std::bad_alloc &e) { PyErr_NoMemory(); diff --git a/numpy/_core/src/multiarray/unique.h b/numpy/_core/src/multiarray/unique.h index 3e258405e8f4..7b3fb143ada4 100644 --- a/numpy/_core/src/multiarray/unique.h +++ b/numpy/_core/src/multiarray/unique.h @@ -5,7 +5,8 @@ extern "C" { #endif -PyObject* array__unique_hash(PyObject *NPY_UNUSED(dummy), PyObject *args); +PyObject* array__unique_hash(PyObject *NPY_UNUSED(dummy), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames); #ifdef __cplusplus } diff --git a/numpy/lib/_arraysetops_impl.py b/numpy/lib/_arraysetops_impl.py index ef0739ba486f..c4788385b924 100644 --- a/numpy/lib/_arraysetops_impl.py +++ b/numpy/lib/_arraysetops_impl.py @@ -368,7 +368,8 @@ def _unique1d(ar, return_index=False, return_inverse=False, conv = _array_converter(ar) ar_, = conv - if (hash_unique := _unique_hash(ar_)) is not NotImplemented: + if (hash_unique := _unique_hash(ar_, equal_nan=equal_nan)) \ + is not NotImplemented: if sorted: hash_unique.sort() # We wrap the result back in case it was a subclass of numpy.ndarray. 
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py index 7865e1b16ee9..b3e2bfa279b0 100644 --- a/numpy/lib/tests/test_arraysetops.py +++ b/numpy/lib/tests/test_arraysetops.py @@ -5,6 +5,7 @@ import numpy as np from numpy import ediff1d, intersect1d, isin, setdiff1d, setxor1d, union1d, unique +from numpy.dtypes import StringDType from numpy.exceptions import AxisError from numpy.testing import ( assert_array_equal, @@ -813,7 +814,9 @@ def test_unique_1d(self): def test_unique_zero_sized(self): # test for zero-sized arrays - for dt in self.get_types(): + types = self.get_types() + types.extend('SU') + for dt in types: a = np.array([], dt) b = np.array([], dt) i1 = np.array([], np.int64) @@ -838,6 +841,187 @@ class Subclass(np.ndarray): bb = Subclass(b.shape, dtype=dt, buffer=b) self.check_all(aa, bb, i1, i2, c, dt) + def test_unique_byte_string_hash_based(self): + # test for byte string arrays + arr = ['apple', 'banana', 'apple', 'cherry', 'date', 'banana', 'fig', 'grape'] + unq_sorted = ['apple', 'banana', 'cherry', 'date', 'fig', 'grape'] + + a1 = unique(arr, sorted=False) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + assert_array_equal(sorted(a1.tolist()), unq_sorted) + + def test_unique_unicode_string_hash_based(self): + # test for unicode string arrays + arr = [ + 'café', 'cafe', 'café', 'naïve', 'naive', + 'résumé', 'naïve', 'resume', 'résumé', + ] + unq_sorted = ['cafe', 'café', 'naive', 'naïve', 'resume', 'résumé'] + + a1 = unique(arr, sorted=False) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + assert_array_equal(sorted(a1.tolist()), unq_sorted) + + def test_unique_vstring_hash_based_equal_nan(self): + # test for unicode and nullable string arrays (equal_nan=True) + a = np.array([ + # short strings + 'straße', + None, + 'strasse', + 'straße', + None, + 'niño', + 'nino', + 'élève', + 'eleve', + 'niño', + 'élève', + # medium strings + 'b' * 20, + 'ß' * 30, + None, + 'é' * 30, + 'e' * 20, + 'ß' * 30, + 'n' * 30, + 'ñ' * 20, + None, + 'e' * 20, + 'ñ' * 20, + # long strings + 'b' * 300, + 'ß' * 400, + None, + 'é' * 400, + 'e' * 300, + 'ß' * 400, + 'n' * 400, + 'ñ' * 300, + None, + 'e' * 300, + 'ñ' * 300, + ], + dtype=StringDType(na_object=None) + ) + unq_sorted_wo_none = [ + 'b' * 20, + 'b' * 300, + 'e' * 20, + 'e' * 300, + 'eleve', + 'nino', + 'niño', + 'n' * 30, + 'n' * 400, + 'strasse', + 'straße', + 'ß' * 30, + 'ß' * 400, + 'élève', + 'é' * 30, + 'é' * 400, + 'ñ' * 20, + 'ñ' * 300, + ] + + a1 = unique(a, sorted=False, equal_nan=True) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + + # a1 should have exactly one None + count_none = sum(x is None for x in a1) + assert_equal(count_none, 1) + + a1_wo_none = sorted(x for x in a1 if x is not None) + assert_array_equal(a1_wo_none, unq_sorted_wo_none) + + def test_unique_vstring_hash_based_not_equal_nan(self): + # test for unicode and nullable string arrays (equal_nan=False) + a = np.array([ + # short strings + 'straße', + None, + 'strasse', + 'straße', + None, + 'niño', + 'nino', + 'élève', + 'eleve', + 'niño', + 'élève', + # medium strings + 'b' * 20, + 'ß' * 30, + None, + 'é' * 30, + 'e' * 20, + 'ß' * 30, + 'n' * 30, + 'ñ' * 20, + None, + 'e' * 20, + 'ñ' * 20, + # long strings + 'b' * 300, + 'ß' * 400, + None, + 'é' * 400, + 'e' * 300, + 'ß' * 400, + 'n' * 400, + 'ñ' * 300, + None, + 'e' * 300, + 'ñ' * 300, + ], + 
dtype=StringDType(na_object=None) + ) + unq_sorted_wo_none = [ + 'b' * 20, + 'b' * 300, + 'e' * 20, + 'e' * 300, + 'eleve', + 'nino', + 'niño', + 'n' * 30, + 'n' * 400, + 'strasse', + 'straße', + 'ß' * 30, + 'ß' * 400, + 'élève', + 'é' * 30, + 'é' * 400, + 'ñ' * 20, + 'ñ' * 300, + ] + + a1 = unique(a, sorted=False, equal_nan=False) + # the result varies depending on the impl of std::unordered_set, + # so we check them by sorting + + # a1 should have exactly six Nones + count_none = sum(x is None for x in a1) + assert_equal(count_none, 6) + + a1_wo_none = sorted(x for x in a1 if x is not None) + assert_array_equal(a1_wo_none, unq_sorted_wo_none) + + def test_unique_vstring_errors(self): + a = np.array( + [ + 'apple', 'banana', 'apple', None, 'cherry', + 'date', 'banana', 'fig', None, 'grape', + ] * 2, + dtype=StringDType(na_object=None) + ) + assert_raises(ValueError, unique, a, equal_nan=False) + @pytest.mark.parametrize("arg", ["return_index", "return_inverse", "return_counts"]) def test_unsupported_hash_based(self, arg): """These currently never use the hash-based solution. However, From 7c91551b85650c5465d21956acf66a47d9d0d02b Mon Sep 17 00:00:00 2001 From: specsy Date: Fri, 20 Jun 2025 20:29:10 +0530 Subject: [PATCH 23/26] DOC: Update CONTRIBUTING.rst (#28158) * Update CONTRIBUTING.rst fixes #19778 Updating the contribution section so that contributors avoid making mistakes while asking questions, and instead focus on contributing and working on the project right away. * Update CONTRIBUTING.rst Shortened the length of the sentence. * Update CONTRIBUTING.rst --- CONTRIBUTING.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 6e019983a0a2..0919790c65d1 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -7,8 +7,9 @@ Whether you're new to open source or experienced, your contributions help us grow. Pull requests (PRs) are always welcome, but making a PR is just the -start. Please respond to comments and requests for changes to help -move the process forward. Please follow our +start. Please respond to comments and requests for changes to help move the process forward. +Skip asking for an issue to be assigned to you on GitHub; send in your PR, explain what you did, and ask for a review. It makes collaboration and support much easier. +Please follow our `Code of Conduct <https://numpy.org/code-of-conduct/>`__, which applies to all interactions, including issues and PRs. From 1b3a0d9385aa1ded43885d966bb8733da839762f Mon Sep 17 00:00:00 2001 From: Yuki Kobayashi Date: Sat, 21 Jun 2025 00:32:00 +0000 Subject: [PATCH 24/26] DOC: Fix some markup errors --- doc/source/f2py/buildtools/distutils-to-meson.rst | 2 +- doc/source/reference/random/multithreading.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/f2py/buildtools/distutils-to-meson.rst b/doc/source/f2py/buildtools/distutils-to-meson.rst index bf5da973e9fa..b24638e62239 100644 --- a/doc/source/f2py/buildtools/distutils-to-meson.rst +++ b/doc/source/f2py/buildtools/distutils-to-meson.rst @@ -117,7 +117,7 @@ sample is included below.
+------------------------------------+-------------------------------+ | LDFLAGS | Linker options | +------------------------------------+-------------------------------+ - | LD\ :sub:`LIBRARY`\ \ :sub:`PATH`\ | Library file locations (Unix) | + | LD\_LIBRARY\_PATH | Library file locations (Unix) | +------------------------------------+-------------------------------+ | LIBS | Libraries to link against | +------------------------------------+-------------------------------+ diff --git a/doc/source/reference/random/multithreading.rst b/doc/source/reference/random/multithreading.rst index 17c6a515cdbc..73d2fc9ee5ad 100644 --- a/doc/source/reference/random/multithreading.rst +++ b/doc/source/reference/random/multithreading.rst @@ -9,7 +9,7 @@ well-behaved (writable and aligned). Under normal circumstances, arrays created using the common constructors such as :meth:`numpy.empty` will satisfy these requirements. -This example makes use of:mod:`concurrent.futures` to fill an array using +This example makes use of :mod:`concurrent.futures` to fill an array using multiple threads. Threads are long-lived so that repeated calls do not require any additional overheads from thread creation. From 124ac8d41f2d913cbac1a05452cdae3d80d7e789 Mon Sep 17 00:00:00 2001 From: Charles Harris Date: Sat, 21 Jun 2025 07:40:15 -0600 Subject: [PATCH 25/26] MAINT: Update main after 2.3.1 release. - Add 2.3.1-notes.rst - Add 2.3.1-changelog.rst - Update release.rst [skip cirrus] [skip azp] [skip actions] --- doc/changelog/2.3.1-changelog.rst | 34 +++++++++++++++++++ doc/source/release.rst | 1 + doc/source/release/2.3.1-notes.rst | 53 ++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) create mode 100644 doc/changelog/2.3.1-changelog.rst create mode 100644 doc/source/release/2.3.1-notes.rst diff --git a/doc/changelog/2.3.1-changelog.rst b/doc/changelog/2.3.1-changelog.rst new file mode 100644 index 000000000000..a1c840f8beda --- /dev/null +++ b/doc/changelog/2.3.1-changelog.rst @@ -0,0 +1,34 @@ + +Contributors +============ + +A total of 9 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* Brad Smith + +* Charles Harris +* Developer-Ecosystem-Engineering +* François Rozet +* Joren Hammudoglu +* Matti Picus +* Mugundan Selvanayagam +* Nathan Goldbaum +* Sebastian Berg + +Pull requests merged +==================== + +A total of 12 pull requests were merged for this release. + +* `#29140 <https://github.com/numpy/numpy/pull/29140>`__: MAINT: Prepare 2.3.x for further development +* `#29191 <https://github.com/numpy/numpy/pull/29191>`__: BUG: fix matmul with transposed out arg (#29179) +* `#29192 <https://github.com/numpy/numpy/pull/29192>`__: TYP: Backport typing fixes and improvements. +* `#29205 <https://github.com/numpy/numpy/pull/29205>`__: BUG: Revert ``np.vectorize`` casting to legacy behavior (#29196) +* `#29222 <https://github.com/numpy/numpy/pull/29222>`__: TYP: Backport typing fixes +* `#29233 <https://github.com/numpy/numpy/pull/29233>`__: BUG: avoid negating unsigned integers in resize implementation... +* `#29234 <https://github.com/numpy/numpy/pull/29234>`__: TST: Fix test that uses uninitialized memory (#29232) +* `#29235 <https://github.com/numpy/numpy/pull/29235>`__: BUG: Address interaction between SME and FPSR (#29223) +* `#29237 <https://github.com/numpy/numpy/pull/29237>`__: BUG: Enforce integer limitation in concatenate (#29231) +* `#29238 <https://github.com/numpy/numpy/pull/29238>`__: CI: Add support for building NumPy with LLVM for Win-ARM64 +* `#29241 <https://github.com/numpy/numpy/pull/29241>`__: ENH: Detect CPU features on OpenBSD ARM and PowerPC64 +* `#29242 <https://github.com/numpy/numpy/pull/29242>`__: ENH: Detect CPU features on FreeBSD / OpenBSD RISC-V64.
diff --git a/doc/source/release.rst b/doc/source/release.rst index 6c6a853b06f5..59e6dd07b002 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -6,6 +6,7 @@ Release notes :maxdepth: 2 2.4.0 + 2.3.1 2.3.0 2.2.6 2.2.5 diff --git a/doc/source/release/2.3.1-notes.rst b/doc/source/release/2.3.1-notes.rst new file mode 100644 index 000000000000..d8193f07671c --- /dev/null +++ b/doc/source/release/2.3.1-notes.rst @@ -0,0 +1,53 @@ +.. currentmodule:: numpy + +========================= +NumPy 2.3.1 Release Notes +========================= + +The NumPy 2.3.1 release is a patch release with several bug fixes, annotation +improvements, and better support for OpenBSD. Highlights are: + +- Fix bug in ``matmul`` for non-contiguous out kwarg parameter +- Fix for Accelerate runtime warnings on M4 hardware +- Fix new in NumPy 2.3.0 ``np.vectorize`` casting errors +- Improved support of CPU features for FreeBSD and OpenBSD + +This release supports Python versions 3.11-3.13; Python 3.14 will be supported +when it is released. + + +Contributors +============ + +A total of 9 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* Brad Smith + +* Charles Harris +* Developer-Ecosystem-Engineering +* François Rozet +* Joren Hammudoglu +* Matti Picus +* Mugundan Selvanayagam +* Nathan Goldbaum +* Sebastian Berg + + +Pull requests merged +==================== + +A total of 12 pull requests were merged for this release. + +* `#29140 <https://github.com/numpy/numpy/pull/29140>`__: MAINT: Prepare 2.3.x for further development +* `#29191 <https://github.com/numpy/numpy/pull/29191>`__: BUG: fix matmul with transposed out arg (#29179) +* `#29192 <https://github.com/numpy/numpy/pull/29192>`__: TYP: Backport typing fixes and improvements. +* `#29205 <https://github.com/numpy/numpy/pull/29205>`__: BUG: Revert ``np.vectorize`` casting to legacy behavior (#29196) +* `#29222 <https://github.com/numpy/numpy/pull/29222>`__: TYP: Backport typing fixes +* `#29233 <https://github.com/numpy/numpy/pull/29233>`__: BUG: avoid negating unsigned integers in resize implementation... +* `#29234 <https://github.com/numpy/numpy/pull/29234>`__: TST: Fix test that uses uninitialized memory (#29232) +* `#29235 <https://github.com/numpy/numpy/pull/29235>`__: BUG: Address interaction between SME and FPSR (#29223) +* `#29237 <https://github.com/numpy/numpy/pull/29237>`__: BUG: Enforce integer limitation in concatenate (#29231) +* `#29238 <https://github.com/numpy/numpy/pull/29238>`__: CI: Add support for building NumPy with LLVM for Win-ARM64 +* `#29241 <https://github.com/numpy/numpy/pull/29241>`__: ENH: Detect CPU features on OpenBSD ARM and PowerPC64 +* `#29242 <https://github.com/numpy/numpy/pull/29242>`__: ENH: Detect CPU features on FreeBSD / OpenBSD RISC-V64. + From 1fefc5c6b767e86c2642641e7ba7a22ab0cf554b Mon Sep 17 00:00:00 2001 From: Mohammed Abdul Rahman <130785777+that-ar-guy@users.noreply.github.com> Date: Mon, 23 Jun 2025 23:26:14 +0530 Subject: [PATCH 26/26] DOC: Clarify dtype argument for __array__ in custom container guide (#29254) * DOC: Clarify dtype argument for __array__ in custom container guide --- doc/source/user/basics.dispatch.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/source/user/basics.dispatch.rst b/doc/source/user/basics.dispatch.rst index ae53995a3917..117d60f85467 100644 --- a/doc/source/user/basics.dispatch.rst +++ b/doc/source/user/basics.dispatch.rst @@ -46,6 +46,21 @@ array([[1., 0., 0., 0., 0.], [0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]) +The ``__array__`` method can optionally accept a `dtype` argument. If provided, +this argument specifies the desired data type for the resulting NumPy array. +Your implementation should attempt to convert the data to this `dtype` +if possible. If the conversion is not supported, it's generally best +to fall back to a default type or raise a `TypeError` or `ValueError`.
+ +Here's an example demonstrating its use with `dtype` specification: + +>>> np.asarray(arr, dtype=np.float32) +array([[1., 0., 0., 0., 0.], + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.], + [0., 0., 0., 1., 0.], + [0., 0., 0., 0., 1.]], dtype=float32) + If we operate on ``arr`` with a numpy function, numpy will again use the ``__array__`` interface to convert it to an array and then apply the function in the usual way.
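A minimal sketch of an ``__array__`` implementation that honors the ``dtype`` request, using the ``DiagonalArray`` container from earlier in this guide (the optional ``copy`` parameter is assumed per the NumPy 2 protocol):

>>> class DiagonalArray:
...     def __init__(self, N, value):
...         self._N = N
...         self._i = value
...     def __array__(self, dtype=None, copy=None):
...         # np.eye allocates a fresh array, so the requested dtype can be
...         # honored directly; dtype=None keeps the default float64.
...         return self._i * np.eye(self._N, dtype=dtype)
>>> np.asarray(DiagonalArray(2, 3), dtype=np.int8)
array([[3, 0],
       [0, 3]], dtype=int8)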