From 1331cd3d973de267712d22c956aa2782badf5cdd Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 12 Jun 2025 22:57:04 +0100 Subject: [PATCH] Fix NEON_FP16 check for MSVC The cast in `numpy/distutils/checks/cpu_neon_fp16.c` didn't compile correctly in MSVC, which led to `NEON` reporting as unsupported due to it encompassing `NEON_FP16` in `meson_cpu/arm/meson.build`. I'm not an expert, but it appears `fp16`/`fp16fml`/`dotprod` are not supported in VS2022: https://learn.microsoft.com/en-us/cpp/build/reference/feature-arm64?view=msvc-170 Similarly, I don't think MSVC supports `float16` as a type yet, so I've enabled emulation in the strided loop code. --- numpy/_core/src/common/simd/neon/math.h | 4 ++-- numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 2 +- numpy/distutils/checks/cpu_neon_fp16.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/common/simd/neon/math.h b/numpy/_core/src/common/simd/neon/math.h index 76c5b58be788..efa60628f2e8 100644 --- a/numpy/_core/src/common/simd/neon/math.h +++ b/numpy/_core/src/common/simd/neon/math.h @@ -261,8 +261,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX, OP) \ NPY_FINLINE STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ { \ - STYPE al = (STYPE)vget_low_##SFX(a); \ - STYPE ah = (STYPE)vget_high_##SFX(a); \ + STYPE al = (STYPE)vgetq_lane_##SFX(a, 0); \ + STYPE ah = (STYPE)vgetq_lane_##SFX(a, 1); \ return al OP ah ? 
al : ah; \ } NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_uint64, u64, >) diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src index 0c4eb3dd9a8d..c7ae1a36fcc9 100644 --- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src @@ -704,7 +704,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop * /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/ -#if defined(NPY_HAVE_NEON_FP16) +#if defined(NPY_HAVE_NEON_FP16) && !defined(_MSC_VER) #define EMULATED_FP16 0 #define NATIVE_FP16 1 typedef _Float16 _npy_half; diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c index f3b949770db6..8d0267fe38a8 100644 --- a/numpy/distutils/checks/cpu_neon_fp16.c +++ b/numpy/distutils/checks/cpu_neon_fp16.c @@ -6,6 +6,6 @@ int main(int argc, char **argv) { short *src = (short*)argv[argc-1]; - float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src)); + float32x4_t v_z4 = vcvt_f32_f16(vreinterpret_f16_s16(vld1_s16(src))); return (int)vgetq_lane_f32(v_z4, 0); }