8000 ENH: Improve Floating Point Cast Performance on ARM by f2013519 · Pull Request #28769 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Improve Floating Point Cast Performance on ARM #28769

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Apr 29, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Relax FP exceptions in Clang to enable vectorization for casts
  • Loading branch information
f2013519 committed Apr 22, 2025
commit c4b14866232cd5d3f5c3c16072fd959c590a145d
83 changes: 9 additions & 74 deletions numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -25,70 +25,14 @@

#include "umathmodule.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

#if defined(__ARM_FP16_FORMAT_IEEE)
// Use ARM fp16<->fp32 vector intrinsics to optimize casting.
// Clang/GCC auto-vectorization does not generate optimal code
// leading to performance degradation.

/*
 * Widen a contiguous, aligned buffer of IEEE float16 values (args[0]) to
 * float32 (args[1]) using ARM NEON conversion intrinsics, 8 lanes at a time,
 * with a scalar loop for the 0-7 element tail. Always returns 0 (success).
 */
static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
NpyAuxData *NPY_UNUSED(auxdata))
{
    const float16_t *in = (const float16_t *)args[0];
    float *out = (float *)args[1];
    const npy_intp len = dimensions[0];
    npy_intp idx = 0;

    /* Vector body: load 8 half-precision lanes, convert each 4-lane half. */
    for (; idx + 8 <= len; idx += 8) {
        float16x8_t lanes = vld1q_f16(in + idx);
        vst1q_f32(out + idx, vcvt_f32_f16(vget_low_f16(lanes)));
        vst1q_f32(out + idx + 4, vcvt_f32_f16(vget_high_f16(lanes)));
    }
    /* Scalar tail: convert the remaining elements one at a time. */
    for (; idx < len; ++idx) {
        out[idx] = (float)in[idx];
    }
    return 0;
}

/*
 * Narrow a contiguous, aligned buffer of float32 values (args[0]) to IEEE
 * float16 (args[1]) using ARM NEON conversion intrinsics, 8 lanes at a time,
 * with a scalar loop for the 0-7 element tail. Always returns 0 (success).
 */
static NPY_GCC_OPT_3 int neon_cast_f32_to_f16_contig_aligned(
PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
NpyAuxData *NPY_UNUSED(auxdata))
{
    const float *in = (const float *)args[0];
    float16_t *out = (float16_t *)args[1];
    const npy_intp len = dimensions[0];
    npy_intp idx = 0;

    /* Vector body: narrow two 4-lane float32 loads and store 8 f16 lanes. */
    for (; idx + 8 <= len; idx += 8) {
        float16x4_t lo = vcvt_f16_f32(vld1q_f32(in + idx));
        float16x4_t hi = vcvt_f16_f32(vld1q_f32(in + idx + 4));
        vst1q_f16(out + idx, vcombine_f16(lo, hi));
    }
    /* Scalar tail: convert the remaining elements one at a time. */
    for (; idx < len; ++idx) {
        out[idx] = (float16_t)in[idx];
    }
    return 0;
}
#if defined(__clang__) && __clang_major__ >= 12
#define NPY_IGNORE_FP_EXCEPTIONS_ON _Pragma("clang fp exceptions(ignore)")
#define NPY_IGNORE_FP_EXCEPTIONS_OFF _Pragma("clang fp exceptions(strict)")
#else
#define NPY_IGNORE_FP_EXCEPTIONS_ON
#define NPY_IGNORE_FP_EXCEPTIONS_OFF
#endif


/*
* x86 platform works with unaligned access but the compiler is allowed to
* assume all data is aligned to its size by the C standard. This means it can
Expand Down Expand Up @@ -772,7 +716,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

#if defined(__ARM_FP16_FORMAT_IEEE) || defined(NPY_HAVE_F16C)
#if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
#define EMULATED_FP16 0
typedef _Float16 _npy_half;
#else
Expand Down Expand Up @@ -911,6 +855,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

#endif

NPY_IGNORE_FP_EXCEPTIONS_ON
static NPY_GCC_OPT_3 int
@prefix@_cast_@name1@_to_@name2@(
PyArrayMethod_Context *context, char *const *args,
Expand Down Expand Up @@ -1004,6 +949,7 @@ static NPY_GCC_OPT_3 int
}
return 0;
}
NPY_IGNORE_FP_EXCEPTIONS_OFF

#undef _CONVERT_FN
#undef _TYPE2
Expand Down Expand Up @@ -1066,17 +1012,6 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,

case NPY_@NAME2@:
/*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
#if defined(__ARM_FP16_FORMAT_IEEE)
// Check for Half <-> Float, aligned, contiguous
if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {
return &neon_cast_f16_to_f32_contig_aligned;
}
if (src_type_num == NPY_FLOAT && dst_type_num == NPY_HALF) {
return &neon_cast_f32_to_f16_contig_aligned;
}
}
#endif
# if NPY_USE_UNALIGNED_ACCESS
if (src_stride == sizeof(@type1@) &&
dst_stride == sizeof(@type2@)) {
Expand Down
Loading
0