From 9fe30842537c7b2a316896913abf499ec24b7ef8 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Sat, 19 Apr 2025 00:10:17 +0530
Subject: [PATCH 01/11] WIP, Prototype: Use Neon SIMD to improve half->float
 cast performance [ci skip] [skip ci]

---
 .../multiarray/lowlevel_strided_loops.c.src  | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 1299e55b4258..9060b31e000e 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -11,6 +11,7 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <arm_neon.h>

 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #define _UMATHMODULE
@@ -847,6 +848,26 @@ static NPY_GCC_OPT_3 int
 {
     npy_intp N = dimensions[0];
     char *src = args[0], *dst = args[1];
+    #if @contig@ && @aligned@ && @is_half1@ && @is_float2@
+    float16_t* input = (float16_t*)src;
+    float* output = (float*)dst;
+    npy_intp i = 0;
+    size_t num_chunks_8 = N / 8;
+
+    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
+        float16x8_t vec_f16 = vld1q_f16(input + i);
+        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
+        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
+        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
+        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
+        vst1q_f32(output + i, vec_f32_low);
+        vst1q_f32(output + i + 4, vec_f32_high);
+        i += 8;
+    }
+    for (; i < N; ++i) {
+        output[i] = (float)input[i];
+    }
+    #else
 #if !@contig@
     npy_intp src_stride = strides[0], dst_stride = strides[1];
 #endif
@@ -930,6 +951,7 @@ static NPY_GCC_OPT_3 int
         src += src_stride;
 #endif
     }
+    #endif

     return 0;
 }

From 4de0a6fe609e87db763769ed0f373e5912d73e0b Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Mon, 21 Apr 2025 02:39:49 +0530
Subject: [PATCH 02/11] Support Neon SIMD float32->float16 cast and update
 scalar path to use hardware cast

---
 .../multiarray/lowlevel_strided_loops.c.src  | 111 +++++++++++++-----
 1 file changed, 83 insertions(+), 28 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 9060b31e000e..6aa77248aded 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -11,7 +11,6 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
-#include <arm_neon.h>

 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #define _UMATHMODULE
@@ -26,6 +25,64 @@
 #include "umathmodule.h"

+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+// Optimized Neon SIMD Routines for fp16<->fp32 casts on ARMv8.2-a+
+
+static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
+        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    const float16_t *src = (const float16_t *)args[0];
+    float *dst = (float *)args[1];
+    npy_intp N = dimensions[0];
+    npy_intp i = 0;
+    size_t num_chunks_8 = N / 8;
+
+    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
+        float16x8_t vec_f16 = vld1q_f16(src + i);
+        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
+        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
+        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
+        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
+        vst1q_f32(dst + i, vec_f32_low);
+        vst1q_f32(dst + i + 4, vec_f32_high);
+        i += 8;
+    }
+    for (; i < N; ++i) {
+        dst[i] = (float)src[i];
+    }
+    return 0;
+}
+
+static NPY_GCC_OPT_3 int neon_cast_f32_to_f16_contig_aligned(
+        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    const float *src = (const float *)args[0];
+    float16_t *dst = (float16_t *)args[1];
+    npy_intp N = dimensions[0];
+    npy_intp i = 0;
+    size_t num_chunks_8 = N / 8;
+
+    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
+        float32x4_t vec_f32_low = vld1q_f32(src + i);
+        float32x4_t vec_f32_high = vld1q_f32(src + i + 4);
+        float16x4_t vec_f16_low = vcvt_f16_f32(vec_f32_low);
+        float16x4_t vec_f16_high = vcvt_f16_f32(vec_f32_high);
+        float16x8_t vec_f16_combined = vcombine_f16(vec_f16_low, vec_f16_high);
+        vst1q_f16(dst + i, vec_f16_combined);
+        i += 8;
+    }
+    for (; i < N; ++i) {
+        dst[i] = (float16_t)src[i];
+    }
+    return 0;
+}
+#endif
+
+
 /*
  * x86 platform works with unaligned access but the compiler is allowed to
  * assume all data is aligned to its size by the C standard. This means it can
@@ -709,6 +766,14 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

+#ifdef __ARM_FP16_FORMAT_IEEE
+    #define EMULATED_FP16 0
+    typedef _Float16 _npy_half;
+#else
+    #define EMULATED_FP16 1
+    typedef npy_half _npy_half;
+#endif
+
 /**begin repeat
  *
  * #NAME1 = BOOL,
@@ -724,15 +789,15 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  * #type1 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_cfloat, npy_cdouble, npy_clongdouble#
  * #rtype1 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool1 = 1, 0*17#
- * #is_half1 = 0*11, 1, 0*6#
+ * #is_half1 = 0*11, EMULATED_FP16, 0*6#
  * #is_float1 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double1 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex1 = 0*15, 1*3#
@@ -753,15 +818,15 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  * #type2 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_cfloat, npy_cdouble, npy_clongdouble#
  * #rtype2 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool2 = 1, 0*17#
- * #is_half2 = 0*11, 1, 0*6#
+ * #is_half2 = 0*11, EMULATED_FP16, 0*6#
  * #is_float2 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double2 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex2 = 0*15, 1*3#
@@ -848,26 +913,6 @@ static NPY_GCC_OPT_3 int
 {
     npy_intp N = dimensions[0];
     char *src = args[0], *dst = args[1];
-    #if @contig@ && @aligned@ && @is_half1@ && @is_float2@
-    float16_t* input = (float16_t*)src;
-    float* output = (float*)dst;
-    npy_intp i = 0;
-    size_t num_chunks_8 = N / 8;
-
-    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
-        float16x8_t vec_f16 = vld1q_f16(input + i);
-        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
-        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
-        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
-        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
-        vst1q_f32(output + i, vec_f32_low);
-        vst1q_f32(output + i + 4, vec_f32_high);
-        i += 8;
-    }
-    for (; i < N; ++i) {
-        output[i] = (float)input[i];
-    }
-    #else
 #if !@contig@
     npy_intp src_stride = strides[0], dst_stride = strides[1];
 #endif
@@ -951,7 +996,6 @@ static NPY_GCC_OPT_3 int
         src += src_stride;
 #endif
     }
-    #endif

     return 0;
 }
@@ -1016,6 +1060,17 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
             case NPY_@NAME2@:
                 /*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
+                #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+                // Check for Half <-> Float, aligned, contiguous
+                if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
+                    if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {
+                        return &neon_cast_f16_to_f32_contig_aligned;
+                    }
+                    if (src_type_num == NPY_FLOAT && dst_type_num == NPY_HALF) {
+                        return &neon_cast_f32_to_f16_contig_aligned;
+                    }
+                }
+                #endif
 #            if NPY_USE_UNALIGNED_ACCESS
                 if (src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {

From 3ccb95a9bef2cc6fa239357daf34a5a3b166e20b Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Mon, 21 Apr 2025 03:21:12 +0530
Subject: [PATCH 03/11] Add missing header

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 6aa77248aded..fc71905e6231 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -25,6 +25,10 @@
 #include "umathmodule.h"

+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 // Optimized Neon SIMD Routines for fp16<->fp32 casts on ARMv8.2-a+

From 236223fdcb72c020780fc8b88f075f548ef7dab6 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Mon, 21 Apr 2025 15:54:24 +0530
Subject: [PATCH 04/11] Relax VECTOR_ARITHMETIC check and add comment on need
 for SIMD routines

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index fc71905e6231..bc43b160354a 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -29,8 +29,10 @@
 #include <arm_neon.h>
 #endif

-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-// Optimized Neon SIMD Routines for fp16<->fp32 casts on ARMv8.2-a+
+#if defined(__ARM_FP16_FORMAT_IEEE)
+// Use ARM fp16<->fp32 vector intrinsics to optimize casting.
+// Clang/GCC auto-vectorization does not generate optimal code
+// leading to performance degradation.

 static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
         PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
         const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
         NpyAuxData *NPY_UNUSED(auxdata))
@@ -1064,7 +1066,7 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
             case NPY_@NAME2@:
                 /*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
-                #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+                #if defined(__ARM_FP16_FORMAT_IEEE)
                 // Check for Half <-> Float, aligned, contiguous
                 if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
                     if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {

From a8f472d6a82546cb1ff8818365d2003781fbb28f Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 00:13:05 +0530
Subject: [PATCH 05/11] Enable hardware cast on x86 when F16C is available

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index bc43b160354a..e3b0a141dffa 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -772,7 +772,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#ifdef __ARM_FP16_FORMAT_IEEE
+#if defined(__ARM_FP16_FORMAT_IEEE) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
     typedef _Float16 _npy_half;
 #else

From c4b14866232cd5d3f5c3c16072fd959c590a145d Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 10:52:39 +0530
Subject: [PATCH 06/11] Relax fp exceptions in Clang to enable vectorization
 for cast

---
 .../multiarray/lowlevel_strided_loops.c.src  | 83 ++-----------------
 1 file changed, 9 insertions(+), 74 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index e3b0a141dffa..d2a33bf064a8 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -25,70 +25,14 @@
 #include "umathmodule.h"

-#ifdef __ARM_NEON
-#include <arm_neon.h>
-#endif
-
-#if defined(__ARM_FP16_FORMAT_IEEE)
-// Use ARM fp16<->fp32 vector intrinsics to optimize casting.
-// Clang/GCC auto-vectorization does not generate optimal code
-// leading to performance degradation.
-
-static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
-        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
-        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
-        NpyAuxData *NPY_UNUSED(auxdata))
-{
-    const float16_t *src = (const float16_t *)args[0];
-    float *dst = (float *)args[1];
-    npy_intp N = dimensions[0];
-    npy_intp i = 0;
-    size_t num_chunks_8 = N / 8;
-
-    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
-        float16x8_t vec_f16 = vld1q_f16(src + i);
-        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
-        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
-        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
-        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
-        vst1q_f32(dst + i, vec_f32_low);
-        vst1q_f32(dst + i + 4, vec_f32_high);
-        i += 8;
-    }
-    for (; i < N; ++i) {
-        dst[i] = (float)src[i];
-    }
-    return 0;
-}
-
-static NPY_GCC_OPT_3 int neon_cast_f32_to_f16_contig_aligned(
-        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
-        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
-        NpyAuxData *NPY_UNUSED(auxdata))
-{
-    const float *src = (const float *)args[0];
-    float16_t *dst = (float16_t *)args[1];
-    npy_intp N = dimensions[0];
-    npy_intp i = 0;
-    size_t num_chunks_8 = N / 8;
-
-    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
-        float32x4_t vec_f32_low = vld1q_f32(src + i);
-        float32x4_t vec_f32_high = vld1q_f32(src + i + 4);
-        float16x4_t vec_f16_low = vcvt_f16_f32(vec_f32_low);
-        float16x4_t vec_f16_high = vcvt_f16_f32(vec_f32_high);
-        float16x8_t vec_f16_combined = vcombine_f16(vec_f16_low, vec_f16_high);
-        vst1q_f16(dst + i, vec_f16_combined);
-        i += 8;
-    }
-    for (; i < N; ++i) {
-        dst[i] = (float16_t)src[i];
-    }
-    return 0;
-}
+#if defined(__clang__) && __clang_major__ >= 12
+#define NPY_IGNORE_FP_EXCEPTIONS_ON _Pragma("clang fp exceptions(ignore)")
+#define NPY_IGNORE_FP_EXCEPTIONS_OFF _Pragma("clang fp exceptions(strict)")
+#else
+#define NPY_IGNORE_FP_EXCEPTIONS_ON
+#define NPY_IGNORE_FP_EXCEPTIONS_OFF
 #endif

-
 /*
  * x86 platform works with unaligned access but the compiler is allowed to
 * assume all data is aligned to its size by the C standard. This means it can
@@ -772,7 +716,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#if defined(__ARM_FP16_FORMAT_IEEE) || defined(NPY_HAVE_F16C)
+#if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
     typedef _Float16 _npy_half;
 #else
@@ -911,6 +855,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #endif

+NPY_IGNORE_FP_EXCEPTIONS_ON
 static NPY_GCC_OPT_3 int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
@@ -1004,6 +949,7 @@ static NPY_GCC_OPT_3 int
     }
     return 0;
 }
+NPY_IGNORE_FP_EXCEPTIONS_OFF

 #undef _CONVERT_FN
 #undef _TYPE2
@@ -1066,17 +1012,6 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
             case NPY_@NAME2@:
                 /*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
-                #if defined(__ARM_FP16_FORMAT_IEEE)
-                // Check for Half <-> Float, aligned, contiguous
-                if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
-                    if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {
-                        return &neon_cast_f16_to_f32_contig_aligned;
-                    }
-                    if (src_type_num == NPY_FLOAT && dst_type_num == NPY_HALF) {
-                        return &neon_cast_f32_to_f16_contig_aligned;
-                    }
-                }
-                #endif
 #            if NPY_USE_UNALIGNED_ACCESS
                 if (src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {

From 0fbe5ece95b67a3630ad83ab262cf39358b1a9aa Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 12:11:51 +0530
Subject: [PATCH 07/11] Ignore fp exceptions only for float casts

---
 .../multiarray/lowlevel_strided_loops.c.src  | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index d2a33bf064a8..8ddd3d9f3f6d 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -718,9 +718,11 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
+    #define NATIVE_FP16 1
     typedef _Float16 _npy_half;
 #else
     #define EMULATED_FP16 1
+    #define NATIVE_FP16 0
     typedef npy_half _npy_half;
 #endif
@@ -747,7 +749,8 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool1 = 1, 0*17#
- * #is_half1 = 0*11, EMULATED_FP16, 0*6#
+ * #is_emu_half1 = 0*11, EMULATED_FP16, 0*6#
+ * #is_native_half1 = 0*11, NATIVE_FP16, 0*6#
  * #is_float1 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double1 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex1 = 0*15, 1*3#
@@ -776,7 +779,8 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool2 = 1, 0*17#
- * #is_half2 = 0*11, EMULATED_FP16, 0*6#
+ * #is_emu_half2 = 0*11, EMULATED_FP16, 0*6#
+ * #is_native_half2 = 0*11, NATIVE_FP16, 0*6#
  * #is_float2 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double2 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex2 = 0*15, 1*3#
@@ -790,8 +794,8 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #if !(NPY_USE_UNALIGNED_ACCESS && !@aligned@)

-/* For half types, don't use actual double/float types in conversion */
-#if @is_half1@ || @is_half2@
+/* For emulated half types, don't use actual double/float types in conversion */
+#if @is_emu_half1@ || @is_emu_half2@

 #  if @is_float1@
 #    define _TYPE1 npy_uint32
@@ -817,13 +821,13 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
 #endif

 /* Determine an appropriate casting conversion function */

-#if @is_half1@
+#if @is_emu_half1@

 #  if @is_float2@
 #    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
 #  elif @is_double2@
 #    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
-#  elif @is_half2@
+#  elif @is_emu_half2@
 #    define _CONVERT_FN(x) (x)
 #  elif @is_bool2@
 #    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
 #  else
 #    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
 #  endif

-#elif @is_half2@
+#elif @is_emu_half2@

 #  if @is_float1@
 #    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
 #  elif @is_double1@
 #    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
-#  elif @is_half1@
+#  elif @is_emu_half1@
 #    define _CONVERT_FN(x) (x)
 #  elif @is_bool1@
 #    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
@@ -855,7 +859,11 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #endif

-NPY_IGNORE_FP_EXCEPTIONS_ON
+#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
+    (@is_native_half2@ || @is_float2@ || @is_double2@)
+    // Enable Vectorization on Clang for floating point casts
+    NPY_IGNORE_FP_EXCEPTIONS_ON
+#endif
 static NPY_GCC_OPT_3 int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
@@ -949,7 +957,10 @@ static NPY_GCC_OPT_3 int
     }
     return 0;
 }
-NPY_IGNORE_FP_EXCEPTIONS_OFF
+#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
+    (@is_native_half2@ || @is_float2@ || @is_double2@)
+    NPY_IGNORE_FP_EXCEPTIONS_OFF
+#endif

 #undef _CONVERT_FN
 #undef _TYPE2

From a7ce139709410f80f41450f3c1420978cac17773 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 13:38:34 +0530
Subject: [PATCH 08/11] Fix build

---
 .../multiarray/lowlevel_strided_loops.c.src  | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 8ddd3d9f3f6d..d4b665033556 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -25,14 +25,6 @@
 #include "umathmodule.h"

-#if defined(__clang__) && __clang_major__ >= 12
-#define NPY_IGNORE_FP_EXCEPTIONS_ON _Pragma("clang fp exceptions(ignore)")
-#define NPY_IGNORE_FP_EXCEPTIONS_OFF _Pragma("clang fp exceptions(strict)")
-#else
-#define NPY_IGNORE_FP_EXCEPTIONS_ON
-#define NPY_IGNORE_FP_EXCEPTIONS_OFF
-#endif
-
 /*
  * x86 platform works with unaligned access but the compiler is allowed to
 * assume all data is aligned to its size by the C standard. This means it can
@@ -859,11 +851,17 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #endif

-#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
-    (@is_native_half2@ || @is_float2@ || @is_double2@)
-    // Enable Vectorization on Clang for floating point casts
-    NPY_IGNORE_FP_EXCEPTIONS_ON
+// Enable auto-vectorization for floating point casts with clang
+#if @is_native_half1@ || @is_float1@ || @is_double1@
+    #if @is_native_half2@ || @is_float2@ || @is_double2@
+        #if defined(__clang__) && !defined(__EMSCRIPTEN__)
+            #if __clang_major__ >= 12
+                _Pragma("clang fp exceptions(ignore)")
+            #endif
+        #endif
+    #endif
 #endif
+
 static NPY_GCC_OPT_3 int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
@@ -957,9 +955,15 @@ static NPY_GCC_OPT_3 int
     }
     return 0;
 }
-#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
-    (@is_native_half2@ || @is_float2@ || @is_double2@)
-    NPY_IGNORE_FP_EXCEPTIONS_OFF
+
+#if @is_native_half1@ || @is_float1@ || @is_double1@
+    #if @is_native_half2@ || @is_float2@ || @is_double2@
+        #if defined(__clang__) && !defined(__EMSCRIPTEN__)
+            #if __clang_major__ >= 12
+                _Pragma("clang fp exceptions(strict)")
+            #endif
+        #endif
+    #endif
 #endif

 #undef _CONVERT_FN
 #undef _TYPE2

From 2c17e2aeba1b8b365836b3e9f3a038b6a9335615 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Wed, 23 Apr 2025 09:57:19 +0530
Subject: [PATCH 09/11] Attempt to fix test failure on ARM64 native

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index d4b665033556..9181fad4b3ca 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -708,7 +708,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
+#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
     #define NATIVE_FP16 1
     typedef _Float16 _npy_half;

From de229c76a90cac3d92f49e23f63c2a967238900f Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Thu, 24 Apr 2025 11:16:37 +0530
Subject: [PATCH 10/11] Work around gcc bug for double->half casts

---
 .../multiarray/lowlevel_strided_loops.c.src  | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 1dc07f27ce10..01ffd225274f 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -708,10 +708,10 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#if defined(__ARM_FP16_FORMAT_IEEE)
+#if defined(NPY_HAVE_NEON_FP16)
     #define EMULATED_FP16 0
     #define NATIVE_FP16 1
-    typedef __fp16 _npy_half;
+    typedef _Float16 _npy_half;
 #else
     #define EMULATED_FP16 1
     #define NATIVE_FP16 0
     typedef npy_half _npy_half;
@@ -862,7 +862,18 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
     #endif
 #endif

-static NPY_GCC_OPT_3 int
+// Work around GCC bug for double->half casts. For SVE and
+// OPT_LEVEL > 1, it implements this as double->single->half
+// which is incorrect as it introduces double rounding with
+// narrowing casts.
+#if (@is_double1@ && @is_native_half2@) && \
+    defined(NPY_HAVE_SVE) && defined(__GNUC__)
+    #define GCC_CAST_OPT_LEVEL __attribute__((optimize("O1")))
+#else
+    #define GCC_CAST_OPT_LEVEL NPY_GCC_OPT_3
+#endif
+
+static GCC_CAST_OPT_LEVEL int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
         const npy_intp *dimensions, const npy_intp *strides,
@@ -966,6 +977,7 @@ static NPY_GCC_OPT_3 int
     #endif
 #endif

+#undef GCC_CAST_OPT_LEVEL
 #undef _CONVERT_FN
 #undef _TYPE2
 #undef _TYPE1

From 5c32fee7f9dbfc73d7f0abe0c341df6c04bca53b Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 29 Apr 2025 12:41:35 +0530
Subject: [PATCH 11/11] Add release note

---
 doc/release/upcoming_changes/28769.performance.rst | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 doc/release/upcoming_changes/28769.performance.rst

diff --git a/doc/release/upcoming_changes/28769.performance.rst b/doc/release/upcoming_changes/28769.performance.rst
new file mode 100644
index 000000000000..7fb8f02282f6
--- /dev/null
+++ b/doc/release/upcoming_changes/28769.performance.rst
@@ -0,0 +1,8 @@
+Performance improvements for ``np.float16`` casts
+--------------------------------------------------
+Previously, floating-point casts to and from ``np.float16`` were
+emulated in software on all platforms.
+
+Now, on ARM devices that support Neon float16 intrinsics (such as
+recent Apple Silicon), the native hardware float16 path is used
+instead for significantly better performance.
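
The behavior this series changes can be checked from Python without touching the C code. The script below is a minimal sketch, not part of the patches: the array size, repeat count, and the probe value 2049.0000001 are illustrative choices. The first part times the contiguous casts that hit the strided loops above; the second part shows, at user level, the double-rounding hazard that PATCH 10 prevents the compiler from introducing into the double->half loop.

# Illustrative verification sketch (not part of this patch series).
import timeit
import numpy as np

# Throughput: contiguous, aligned arrays exercise the specialized cast loops.
a16 = np.random.rand(1 << 20).astype(np.float16)
a32 = np.random.rand(1 << 20).astype(np.float32)
t_up = timeit.timeit(lambda: a16.astype(np.float32), number=200) / 200
t_dn = timeit.timeit(lambda: a32.astype(np.float16), number=200) / 200
print(f"f16->f32: {t_up * 1e6:.1f} us/call, f32->f16: {t_dn * 1e6:.1f} us/call")

# Correctness: double->half must round once. 2049.0000001 sits just above
# the midpoint of the adjacent float16 values 2048 and 2050, so a single
# correctly rounded cast gives 2050.0, while rounding through float32
# first collapses it to 2049.0 and then ties-to-even down to 2048.0.
x = np.array([2049.0000001], dtype=np.float64)
print(x.astype(np.float16))                     # single rounding: [2050.]
print(x.astype(np.float32).astype(np.float16))  # double rounding: [2048.]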