From 9fe30842537c7b2a316896913abf499ec24b7ef8 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Sat, 19 Apr 2025 00:10:17 +0530
Subject: [PATCH 01/11] WIP, Prototype: Use Neon SIMD to improve half->float
 cast performance [ci skip] [skip ci]

---
 .../multiarray/lowlevel_strided_loops.c.src  | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 1299e55b4258..9060b31e000e 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -11,6 +11,7 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <arm_neon.h>

 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #define _UMATHMODULE
@@ -847,6 +848,26 @@ static NPY_GCC_OPT_3 int
 {
     npy_intp N = dimensions[0];
     char *src = args[0], *dst = args[1];
+    #if @contig@ && @aligned@ && @is_half1@ && @is_float2@
+    float16_t* input = (float16_t*)src;
+    float* output = (float*)dst;
+    npy_intp i = 0;
+    size_t num_chunks_8 = N / 8;
+
+    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
+        float16x8_t vec_f16 = vld1q_f16(input + i);
+        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
+        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
+        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
+        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
+        vst1q_f32(output + i, vec_f32_low);
+        vst1q_f32(output + i + 4, vec_f32_high);
+        i += 8;
+    }
+    for (; i < N; ++i) {
+        output[i] = (float)input[i];
+    }
+    #else
 #if !@contig@
     npy_intp src_stride = strides[0], dst_stride = strides[1];
 #endif
@@ -930,6 +951,7 @@ static NPY_GCC_OPT_3 int
         src += src_stride;
 #endif
     }
+    #endif

     return 0;
 }

From 4de0a6fe609e87db763769ed0f373e5912d73e0b Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Mon, 21 Apr 2025 02:39:49 +0530
Subject: [PATCH 02/11] Support Neon SIMD float32->float16 cast and update
 scalar path to use hardware cast

---
 .../multiarray/lowlevel_strided_loops.c.src  | 111 +++++++++++++-----
 1 file changed, 83 insertions(+), 28 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 9060b31e000e..6aa77248aded 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -11,7 +11,6 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
-#include <arm_neon.h>

 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #define _UMATHMODULE
@@ -26,6 +25,64 @@
 #include "umathmodule.h"

+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+// Optimized Neon SIMD Routines for fp16<->fp32 casts on ARMv8.2-a+
+
+static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
+        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    const float16_t *src = (const float16_t *)args[0];
+    float *dst = (float *)args[1];
+    npy_intp N = dimensions[0];
+    npy_intp i = 0;
+    size_t num_chunks_8 = N / 8;
+
+    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
+        float16x8_t vec_f16 = vld1q_f16(src + i);
+        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
+        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
+        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
+        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
+        vst1q_f32(dst + i, vec_f32_low);
+        vst1q_f32(dst + i + 4, vec_f32_high);
+        i += 8;
+    }
+    for (; i < N; ++i) {
+        dst[i] = (float)src[i];
+    }
+    return 0;
+}
+
+static NPY_GCC_OPT_3 int neon_cast_f32_to_f16_contig_aligned(
+        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    const float *src = (const float *)args[0];
+    float16_t *dst = (float16_t *)args[1];
+    npy_intp N = dimensions[0];
+    npy_intp i = 0;
+    size_t num_chunks_8 = N / 8;
+
+    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
+        float32x4_t vec_f32_low = vld1q_f32(src + i);
+        float32x4_t vec_f32_high = vld1q_f32(src + i + 4);
+        float16x4_t vec_f16_low = vcvt_f16_f32(vec_f32_low);
+        float16x4_t vec_f16_high = vcvt_f16_f32(vec_f32_high);
+        float16x8_t vec_f16_combined = vcombine_f16(vec_f16_low, vec_f16_high);
+        vst1q_f16(dst + i, vec_f16_combined);
+        i += 8;
+    }
+    for (; i < N; ++i) {
+        dst[i] = (float16_t)src[i];
+    }
+    return 0;
+}
+#endif
+
+
 /*
  * x86 platform works with unaligned access but the compiler is allowed to
  * assume all data is aligned to its size by the C standard. This means it can
@@ -709,6 +766,14 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

+#ifdef __ARM_FP16_FORMAT_IEEE
+    #define EMULATED_FP16 0
+    typedef _Float16 _npy_half;
+#else
+    #define EMULATED_FP16 1
+    typedef npy_half _npy_half;
+#endif
+
 /**begin repeat
  *
  * #NAME1 = BOOL,
@@ -724,15 +789,15 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  * #type1 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_cfloat, npy_cdouble, npy_clongdouble#
  * #rtype1 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool1 = 1, 0*17#
- * #is_half1 = 0*11, 1, 0*6#
+ * #is_half1 = 0*11, EMULATED_FP16, 0*6#
  * #is_float1 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double1 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex1 = 0*15, 1*3#
@@ -753,15 +818,15 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  * #type2 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_cfloat, npy_cdouble, npy_clongdouble#
  * #rtype2 = npy_bool,
  *          npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *          npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- *          npy_half, npy_float, npy_double, npy_longdouble,
+ *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool2 = 1, 0*17#
- * #is_half2 = 0*11, 1, 0*6#
+ * #is_half2 = 0*11, EMULATED_FP16, 0*6#
  * #is_float2 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double2 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex2 = 0*15, 1*3#
@@ -848,26 +913,6 @@ static NPY_GCC_OPT_3 int
 {
     npy_intp N = dimensions[0];
     char *src = args[0], *dst = args[1];
-    #if @contig@ && @aligned@ && @is_half1@ && @is_float2@
-    float16_t* input = (float16_t*)src;
-    float* output = (float*)dst;
-    npy_intp i = 0;
-    size_t num_chunks_8 = N / 8;
-
-    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
-        float16x8_t vec_f16 = vld1q_f16(input + i);
-        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
-        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
-        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
-        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
-        vst1q_f32(output + i, vec_f32_low);
-        vst1q_f32(output + i + 4, vec_f32_high);
-        i += 8;
-    }
-    for (; i < N; ++i) {
-        output[i] = (float)input[i];
-    }
-    #else
 #if !@contig@
     npy_intp src_stride = strides[0], dst_stride = strides[1];
 #endif
@@ -951,7 +996,6 @@ static NPY_GCC_OPT_3 int
         src += src_stride;
 #endif
     }
-    #endif

     return 0;
 }
@@ -1016,6 +1060,17 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
             case NPY_@NAME2@:
                 /*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
+                #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+                // Check for Half <-> Float, aligned, contiguous
+                if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
+                    if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {
+                        return &neon_cast_f16_to_f32_contig_aligned;
+                    }
+                    if (src_type_num == NPY_FLOAT && dst_type_num == NPY_HALF) {
+                        return &neon_cast_f32_to_f16_contig_aligned;
+                    }
+                }
+                #endif
 #            if NPY_USE_UNALIGNED_ACCESS
                 if (src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {

From 3ccb95a9bef2cc6fa239357daf34a5a3b166e20b Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Mon, 21 Apr 2025 03:21:12 +0530
Subject: [PATCH 03/11] Add missing header

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 6aa77248aded..fc71905e6231 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -25,6 +25,10 @@
 #include "umathmodule.h"

+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 // Optimized Neon SIMD Routines for fp16<->fp32 casts on ARMv8.2-a+

From 236223fdcb72c020780fc8b88f075f548ef7dab6 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Mon, 21 Apr 2025 15:54:24 +0530
Subject: [PATCH 04/11] Relax VECTOR_ARITHMETIC check and add comment on need
 for SIMD routines

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index fc71905e6231..bc43b160354a 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -29,8 +29,10 @@
 #include <arm_neon.h>
 #endif

-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-// Optimized Neon SIMD Routines for fp16<->fp32 casts on ARMv8.2-a+
+#if defined(__ARM_FP16_FORMAT_IEEE)
+// Use ARM fp16<->fp32 vector intrinsics to optimize casting.
+// Clang/GCC auto-vectorization does not generate optimal code
+// leading to performance degradation.

 static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
         PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
         const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
         NpyAuxData *NPY_UNUSED(auxdata))
@@ -1064,7 +1066,7 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
             case NPY_@NAME2@:
                 /*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
-                #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+                #if defined(__ARM_FP16_FORMAT_IEEE)
                 // Check for Half <-> Float, aligned, contiguous
                 if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
                     if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {

From a8f472d6a82546cb1ff8818365d2003781fbb28f Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 00:13:05 +0530
Subject: [PATCH 05/11] Enable hardware cast on x86 when F16C is available

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index bc43b160354a..e3b0a141dffa 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -772,7 +772,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#ifdef __ARM_FP16_FORMAT_IEEE
+#if defined(__ARM_FP16_FORMAT_IEEE) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
     typedef _Float16 _npy_half;
 #else

From c4b14866232cd5d3f5c3c16072fd959c590a145d Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 10:52:39 +0530
Subject: [PATCH 06/11] Relax fp exceptions in Clang to enable vectorization
 for cast

---
 .../multiarray/lowlevel_strided_loops.c.src  | 83 ++-----------------
 1 file changed, 9 insertions(+), 74 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index e3b0a141dffa..d2a33bf064a8 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -25,70 +25,14 @@
 #include "umathmodule.h"

-#ifdef __ARM_NEON
-#include <arm_neon.h>
-#endif
-
-#if defined(__ARM_FP16_FORMAT_IEEE)
-// Use ARM fp16<->fp32 vector intrinsics to optimize casting.
-// Clang/GCC auto-vectorization does not generate optimal code
-// leading to performance degradation.
-
-static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
-        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
-        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
-        NpyAuxData *NPY_UNUSED(auxdata))
-{
-    const float16_t *src = (const float16_t *)args[0];
-    float *dst = (float *)args[1];
-    npy_intp N = dimensions[0];
-    npy_intp i = 0;
-    size_t num_chunks_8 = N / 8;
-
-    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
-        float16x8_t vec_f16 = vld1q_f16(src + i);
-        float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
-        float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
-        float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
-        float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
-        vst1q_f32(dst + i, vec_f32_low);
-        vst1q_f32(dst + i + 4, vec_f32_high);
-        i += 8;
-    }
-    for (; i < N; ++i) {
-        dst[i] = (float)src[i];
-    }
-    return 0;
-}
-
-static NPY_GCC_OPT_3 int neon_cast_f32_to_f16_contig_aligned(
-        PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
-        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
-        NpyAuxData *NPY_UNUSED(auxdata))
-{
-    const float *src = (const float *)args[0];
-    float16_t *dst = (float16_t *)args[1];
-    npy_intp N = dimensions[0];
-    npy_intp i = 0;
-    size_t num_chunks_8 = N / 8;
-
-    for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
-        float32x4_t vec_f32_low = vld1q_f32(src + i);
-        float32x4_t vec_f32_high = vld1q_f32(src + i + 4);
-        float16x4_t vec_f16_low = vcvt_f16_f32(vec_f32_low);
-        float16x4_t vec_f16_high = vcvt_f16_f32(vec_f32_high);
-        float16x8_t vec_f16_combined = vcombine_f16(vec_f16_low, vec_f16_high);
-        vst1q_f16(dst + i, vec_f16_combined);
-        i += 8;
-    }
-    for (; i < N; ++i) {
-        dst[i] = (float16_t)src[i];
-    }
-    return 0;
-}
+#if defined(__clang__) && __clang_major__ >= 12
+#define NPY_IGNORE_FP_EXCEPTIONS_ON _Pragma("clang fp exceptions(ignore)")
+#define NPY_IGNORE_FP_EXCEPTIONS_OFF _Pragma("clang fp exceptions(strict)")
+#else
+#define NPY_IGNORE_FP_EXCEPTIONS_ON
+#define NPY_IGNORE_FP_EXCEPTIONS_OFF
 #endif

-
 /*
  * x86 platform works with unaligned access but the compiler is allowed to
 * assume all data is aligned to its size by the C standard. This means it can
@@ -772,7 +716,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#if defined(__ARM_FP16_FORMAT_IEEE) || defined(NPY_HAVE_F16C)
+#if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
     typedef _Float16 _npy_half;
 #else
@@ -911,6 +855,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #endif

+NPY_IGNORE_FP_EXCEPTIONS_ON
 static NPY_GCC_OPT_3 int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
@@ -1004,6 +949,7 @@ static NPY_GCC_OPT_3 int
     }
     return 0;
 }
+NPY_IGNORE_FP_EXCEPTIONS_OFF

 #undef _CONVERT_FN
 #undef _TYPE2
@@ -1066,17 +1012,6 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
             case NPY_@NAME2@:
                 /*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
-                #if defined(__ARM_FP16_FORMAT_IEEE)
-                // Check for Half <-> Float, aligned, contiguous
-                if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
-                    if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {
-                        return &neon_cast_f16_to_f32_contig_aligned;
-                    }
-                    if (src_type_num == NPY_FLOAT && dst_type_num == NPY_HALF) {
-                        return &neon_cast_f32_to_f16_contig_aligned;
-                    }
-                }
-                #endif
 #            if NPY_USE_UNALIGNED_ACCESS
                 if (src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {

From 0fbe5ece95b67a3630ad83ab262cf39358b1a9aa Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 12:11:51 +0530
Subject: [PATCH 07/11] Ignore fp exceptions only for float casts

---
 .../multiarray/lowlevel_strided_loops.c.src  | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index d2a33bf064a8..8ddd3d9f3f6d 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -718,9 +718,11 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
+    #define NATIVE_FP16 1
     typedef _Float16 _npy_half;
 #else
     #define EMULATED_FP16 1
+    #define NATIVE_FP16 0
     typedef npy_half _npy_half;
 #endif
@@ -747,7 +749,8 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool1 = 1, 0*17#
- * #is_half1 = 0*11, EMULATED_FP16, 0*6#
+ * #is_emu_half1 = 0*11, EMULATED_FP16, 0*6#
+ * #is_native_half1 = 0*11, NATIVE_FP16, 0*6#
  * #is_float1 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double1 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex1 = 0*15, 1*3#
@@ -776,7 +779,8 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
  *          _npy_half, npy_float, npy_double, npy_longdouble,
  *          npy_float, npy_double, npy_longdouble#
  * #is_bool2 = 1, 0*17#
- * #is_half2 = 0*11, EMULATED_FP16, 0*6#
+ * #is_emu_half2 = 0*11, EMULATED_FP16, 0*6#
+ * #is_native_half2 = 0*11, NATIVE_FP16, 0*6#
  * #is_float2 = 0*12, 1, 0, 0, 1, 0, 0#
  * #is_double2 = 0*13, 1, 0, 0, 1, 0#
  * #is_complex2 = 0*15, 1*3#
@@ -790,8 +794,8 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #if !(NPY_USE_UNALIGNED_ACCESS && !@aligned@)

-/* For half types, don't use actual double/float types in conversion */
-#if @is_half1@ || @is_half2@
+/* For emulated half types, don't use actual double/float types in conversion */
+#if @is_emu_half1@ || @is_emu_half2@

 #  if @is_float1@
 #    define _TYPE1 npy_uint32
@@ -817,13 +821,13 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
 #endif

 /* Determine an appropriate casting conversion function */

-#if @is_half1@
+#if @is_emu_half1@

 #  if @is_float2@
 #    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
 #  elif @is_double2@
 #    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
-#  elif @is_half2@
+#  elif @is_emu_half2@
 #    define _CONVERT_FN(x) (x)
 #  elif @is_bool2@
 #    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
 #  else
 #    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
 #  endif

-#elif @is_half2@
+#elif @is_emu_half2@

 #  if @is_float1@
 #    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
 #  elif @is_double1@
 #    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
-#  elif @is_half1@
+#  elif @is_emu_half1@
 #    define _CONVERT_FN(x) (x)
 #  elif @is_bool1@
 #    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
@@ -855,7 +859,11 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #endif

-NPY_IGNORE_FP_EXCEPTIONS_ON
+#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
+    (@is_native_half2@ || @is_float2@ || @is_double2@)
+    // Enable Vectorization on Clang for floating point casts
+    NPY_IGNORE_FP_EXCEPTIONS_ON
+#endif
 static NPY_GCC_OPT_3 int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
@@ -949,7 +957,10 @@ static NPY_GCC_OPT_3 int
     }
     return 0;
 }
-NPY_IGNORE_FP_EXCEPTIONS_OFF
+#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
+    (@is_native_half2@ || @is_float2@ || @is_double2@)
+    NPY_IGNORE_FP_EXCEPTIONS_OFF
+#endif

 #undef _CONVERT_FN
 #undef _TYPE2

From a7ce139709410f80f41450f3c1420978cac17773 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 22 Apr 2025 13:38:34 +0530
Subject: [PATCH 08/11] Fix build

---
 .../multiarray/lowlevel_strided_loops.c.src  | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 8ddd3d9f3f6d..d4b665033556 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -25,14 +25,6 @@
 #include "umathmodule.h"

-#if defined(__clang__) && __clang_major__ >= 12
-#define NPY_IGNORE_FP_EXCEPTIONS_ON _Pragma("clang fp exceptions(ignore)")
-#define NPY_IGNORE_FP_EXCEPTIONS_OFF _Pragma("clang fp exceptions(strict)")
-#else
-#define NPY_IGNORE_FP_EXCEPTIONS_ON
-#define NPY_IGNORE_FP_EXCEPTIONS_OFF
-#endif
-
 /*
  * x86 platform works with unaligned access but the compiler is allowed to
 * assume all data is aligned to its size by the C standard. This means it can
@@ -859,11 +851,17 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 #endif

-#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
-    (@is_native_half2@ || @is_float2@ || @is_double2@)
-    // Enable Vectorization on Clang for floating point casts
-    NPY_IGNORE_FP_EXCEPTIONS_ON
+// Enable auto-vectorization for floating point casts with clang
+#if @is_native_half1@ || @is_float1@ || @is_double1@
+    #if @is_native_half2@ || @is_float2@ || @is_double2@
+        #if defined(__clang__) && !defined(__EMSCRIPTEN__)
+            #if __clang_major__ >= 12
+                _Pragma("clang fp exceptions(ignore)")
+            #endif
+        #endif
+    #endif
 #endif
+
 static NPY_GCC_OPT_3 int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
@@ -957,9 +955,15 @@ static NPY_GCC_OPT_3 int
     }
     return 0;
 }
-#if (@is_native_half1@ || @is_float1@ || @is_double1@) && \
-    (@is_native_half2@ || @is_float2@ || @is_double2@)
-    NPY_IGNORE_FP_EXCEPTIONS_OFF
+
+#if @is_native_half1@ || @is_float1@ || @is_double1@
+    #if @is_native_half2@ || @is_float2@ || @is_double2@
+        #if defined(__clang__) && !defined(__EMSCRIPTEN__)
+            #if __clang_major__ >= 12
+                _Pragma("clang fp exceptions(strict)")
+            #endif
+        #endif
+    #endif
 #endif

 #undef _CONVERT_FN
 #undef _TYPE2

From 2c17e2aeba1b8b365836b3e9f3a038b6a9335615 Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Wed, 23 Apr 2025 09:57:19 +0530
Subject: [PATCH 09/11] Attempt to fix test failure on ARM64 native

---
 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index d4b665033556..9181fad4b3ca 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -708,7 +708,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
+#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || defined(NPY_HAVE_F16C)
     #define EMULATED_FP16 0
     #define NATIVE_FP16 1
     typedef _Float16 _npy_half;

From de229c76a90cac3d92f49e23f63c2a967238900f Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Thu, 24 Apr 2025 11:16:37 +0530
Subject: [PATCH 10/11] Work around gcc bug for double->half casts

---
 .../multiarray/lowlevel_strided_loops.c.src  | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
index 1dc07f27ce10..01ffd225274f 100644
--- a/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
@@ -708,10 +708,10 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

 /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

-#if defined(__ARM_FP16_FORMAT_IEEE)
+#if defined(NPY_HAVE_NEON_FP16)
     #define EMULATED_FP16 0
     #define NATIVE_FP16 1
-    typedef __fp16 _npy_half;
+    typedef _Float16 _npy_half;
 #else
     #define EMULATED_FP16 1
     #define NATIVE_FP16 0
     typedef npy_half _npy_half;
@@ -862,7 +862,18 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
     #endif
 #endif

-static NPY_GCC_OPT_3 int
+// Work around GCC bug for double->half casts. For SVE and
+// OPT_LEVEL > 1, it implements this as double->single->half
+// which is incorrect as it introduces double rounding with
+// narrowing casts.
+#if (@is_double1@ && @is_native_half2@) && \
+    defined(NPY_HAVE_SVE) && defined(__GNUC__)
+    #define GCC_CAST_OPT_LEVEL __attribute__((optimize("O1")))
+#else
+    #define GCC_CAST_OPT_LEVEL NPY_GCC_OPT_3
+#endif
+
+static GCC_CAST_OPT_LEVEL int
 @prefix@_cast_@name1@_to_@name2@(
         PyArrayMethod_Context *context, char *const *args,
         const npy_intp *dimensions, const npy_intp *strides,
@@ -966,6 +977,7 @@ static NPY_GCC_OPT_3 int
     #endif
 #endif

+#undef GCC_CAST_OPT_LEVEL
 #undef _CONVERT_FN
 #undef _TYPE2
 #undef _TYPE1

From 5c32fee7f9dbfc73d7f0abe0c341df6c04bca53b Mon Sep 17 00:00:00 2001
From: Krishna B
Date: Tue, 29 Apr 2025 12:41:35 +0530
Subject: [PATCH 11/11] Add release note

---
 doc/release/upcoming_changes/28769.performance.rst | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 doc/release/upcoming_changes/28769.performance.rst

diff --git a/doc/release/upcoming_changes/28769.performance.rst b/doc/release/upcoming_changes/28769.performance.rst
new file mode 100644
index 000000000000..7fb8f02282f6
--- /dev/null
+++ b/doc/release/upcoming_changes/28769.performance.rst
@@ -0,0 +1,8 @@
+Performance improvements for ``np.float16`` casts
+--------------------------------------------------
+Previously, floating-point casts to and from ``np.float16`` were
+emulated in software on all platforms.
+
+Now, on ARM devices that support Neon float16 intrinsics (such as
+recent Apple Silicon), the native hardware float16 path is used
+instead for significantly better performance.
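
The behavior this series changes can be checked from Python without touching the C code. The script below is a minimal sketch, not part of the patches: the array size, repeat count, and the probe value 2049.0000001 are illustrative choices. The first part times the contiguous casts that hit the strided loops above; the second part shows, at user level, the double-rounding hazard that PATCH 10 prevents the compiler from introducing into the double->half loop.

# Illustrative verification sketch (not part of this patch series).
import timeit
import numpy as np

# Throughput: contiguous, aligned arrays exercise the specialized cast loops.
a16 = np.random.rand(1 << 20).astype(np.float16)
a32 = np.random.rand(1 << 20).astype(np.float32)
t_up = timeit.timeit(lambda: a16.astype(np.float32), number=200) / 200
t_dn = timeit.timeit(lambda: a32.astype(np.float16), number=200) / 200
print(f"f16->f32: {t_up * 1e6:.1f} us/call, f32->f16: {t_dn * 1e6:.1f} us/call")

# Correctness: double->half must round once. 2049.0000001 sits just above
# the midpoint of the adjacent float16 values 2048 and 2050, so a single
# correctly rounded cast gives 2050.0, while rounding through float32
# first collapses it to 2049.0 and then ties-to-even down to 2048.0.
x = np.array([2049.0000001], dtype=np.float64)
print(x.astype(np.float16))                     # single rounding: [2050.]
print(x.astype(np.float32).astype(np.float16))  # double rounding: [2048.]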