From 1331cd3d973de267712d22c956aa2782badf5cdd Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Thu, 12 Jun 2025 22:57:04 +0100
Subject: [PATCH] Fix NEON_FP16 check for MSVC

The cast in `numpy/distutils/checks/cpu_neon_fp16.c` didn't compile with MSVC, which led to `NEON` being reported as unsupported because it encompasses `NEON_FP16` in `meson_cpu/arm/meson.build`.

I'm not an expert, but it appears `fp16`/`fp16fml`/`dotprod` are not supported in VS2022:
https://learn.microsoft.com/en-us/cpp/build/reference/feature-arm64?view=msvc-170

Similarly, I don't think MSVC supports `_Float16` as a type yet, so I've enabled emulation in the strided loop code.
---
 numpy/_core/src/common/simd/neon/math.h                 | 4 ++--
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 2 +-
 numpy/distutils/checks/cpu_neon_fp16.c                  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/numpy/_core/src/common/simd/neon/math.h b/numpy/_core/src/common/simd/neon/math.h
index 76c5b58be788..efa60628f2e8 100644
--- a/numpy/_core/src/common/simd/neon/math.h
+++ b/numpy/_core/src/common/simd/neon/math.h
@@ -261,8 +261,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
 #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX, OP)       \
     NPY_FINLINE STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a)  \
     {                                                             \
-        STYPE al = (STYPE)vget_low_##SFX(a);                      \
-        STYPE ah = (STYPE)vget_high_##SFX(a);                     \
+        STYPE al = (STYPE)vgetq_lane_##SFX(a, 0);                 \
+        STYPE ah = (STYPE)vgetq_lane_##SFX(a, 1);                 \
         return al OP ah ? al : ah;                                \
     }
 NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_uint64, u64, >)
diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 0c4eb3dd9a8d..c7ae1a36fcc9 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -704,7 +704,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
 
 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/
 
-#if defined(NPY_HAVE_NEON_FP16)
+#if defined(NPY_HAVE_NEON_FP16) && !defined(_MSC_VER)
     #define EMULATED_FP16 0
     #define NATIVE_FP16 1
     typedef _Float16 _npy_half;
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
index f3b949770db6..8d0267fe38a8 100644
--- a/numpy/distutils/checks/cpu_neon_fp16.c
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -6,6 +6,6 @@
 int main(int argc, char **argv)
 {
     short *src = (short*)argv[argc-1];
-    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src));
+    float32x4_t v_z4 = vcvt_f32_f16(vreinterpret_f16_s16(vld1_s16(src)));
     return (int)vgetq_lane_f32(v_z4, 0);
 }