8000 ENH: Improve Floating Point Cast Performance on ARM by f2013519 · Pull Request #28769 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Improve Floating Point Cast Performance on ARM #28769

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Apr 29, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Relax FP exceptions in Clang to enable vectorization for casts
  • Loading branch information
f2013519 committed Apr 22, 2025
commit c4b14866232cd5d3f5c3c16072fd959c590a145d
83 changes: 9 additions & 74 deletions numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -25,70 +25,14 @@

#include "umathmodule.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

#if defined(__ARM_FP16_FORMAT_IEEE)
// Use ARM fp16<->fp32 vector intrinsics to optimize casting.
// Clang/GCC auto-vectorization does not generate optimal code
// leading to performance degradation.

/*
 * Widen a contiguous, aligned buffer of IEEE float16 values (args[0]) to
 * float32 (args[1]) using ARM NEON conversion intrinsics, 8 lanes at a time,
 * with a scalar loop for the 0-7 element tail. Always returns 0 (success).
 */
static NPY_GCC_OPT_3 int neon_cast_f16_to_f32_contig_aligned(
PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
NpyAuxData *NPY_UNUSED(auxdata))
{
    const float16_t *in = (const float16_t *)args[0];
    float *out = (float *)args[1];
    const npy_intp len = dimensions[0];
    npy_intp idx = 0;

    /* Vector body: load 8 half-precision lanes, convert each 4-lane half. */
    for (; idx + 8 <= len; idx += 8) {
        float16x8_t lanes = vld1q_f16(in + idx);
        vst1q_f32(out + idx, vcvt_f32_f16(vget_low_f16(lanes)));
        vst1q_f32(out + idx + 4, vcvt_f32_f16(vget_high_f16(lanes)));
    }
    /* Scalar tail: convert the remaining elements one at a time. */
    for (; idx < len; ++idx) {
        out[idx] = (float)in[idx];
    }
    return 0;
}

/*
 * Narrow a contiguous, aligned buffer of float32 values (args[0]) to IEEE
 * float16 (args[1]) using ARM NEON conversion intrinsics, 8 lanes at a time,
 * with a scalar loop for the 0-7 element tail. Always returns 0 (success).
 */
static NPY_GCC_OPT_3 int neon_cast_f32_to_f16_contig_aligned(
PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
NpyAuxData *NPY_UNUSED(auxdata))
{
    const float *in = (const float *)args[0];
    float16_t *out = (float16_t *)args[1];
    const npy_intp len = dimensions[0];
    npy_intp idx = 0;

    /* Vector body: narrow two 4-lane float32 loads and store 8 f16 lanes. */
    for (; idx + 8 <= len; idx += 8) {
        float16x4_t lo = vcvt_f16_f32(vld1q_f32(in + idx));
        float16x4_t hi = vcvt_f16_f32(vld1q_f32(in + idx + 4));
        vst1q_f16(out + idx, vcombine_f16(lo, hi));
    }
    /* Scalar tail: convert the remaining elements one at a time. */
    for (; idx < len; ++idx) {
        out[idx] = (float16_t)in[idx];
    }
    return 0;
}
#if defined(__clang__) && __clang_major__ >= 12
#define NPY_IGNORE_FP_EXCEPTIONS_ON _Pragma("clang fp exceptions(ignore)")
#define NPY_IGNORE_FP_EXCEPTIONS_OFF _Pragma("clang fp exceptions(strict)")
#else
#define NPY_IGNORE_FP_EXCEPTIONS_ON
#define NPY_IGNORE_FP_EXCEPTIONS_OFF
#endif


/*
* x86 platform works with unaligned access but the compiler is allowed to
* assume all data is aligned to its size by the C standard. This means it can
Expand Down Expand Up @@ -772,7 +716,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

#if defined(__ARM_FP16_FORMAT_IEEE) || defined(NPY_HAVE_F16C)
#if defined(NPY_HAVE_NEON_FP16) || defined(NPY_HAVE_F16C)
#define EMULATED_FP16 0
typedef _Float16 _npy_half;
#else
Expand Down Expand Up @@ -911,6 +855,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

#endif

NPY_IGNORE_FP_EXCEPTIONS_ON
static NPY_GCC_OPT_3 int
@prefix@_cast_@name1@_to_@name2@(
PyArrayMethod_Context *context, char *const *args,
Expand Down Expand Up @@ -1004,6 +949,7 @@ static NPY_GCC_OPT_3 int
}
return 0;
}
NPY_IGNORE_FP_EXCEPTIONS_OFF

#undef _CONVERT_FN
#undef _TYPE2
Expand Down Expand Up @@ -1066,17 +1012,6 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,

case NPY_@NAME2@:
/*printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);*/
#if defined(__ARM_FP16_FORMAT_IEEE)
// Check for Half <-> Float, aligned, contiguous
if (aligned && src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) {
if (src_type_num == NPY_HALF && dst_type_num == NPY_FLOAT) {
return &neon_cast_f16_to_f32_contig_aligned;
}
if (src_type_num == NPY_FLOAT && dst_type_num == NPY_HALF) {
return &neon_cast_f32_to_f16_contig_aligned;
}
}
#endif
# if NPY_USE_UNALIGNED_ACCESS
if (src_stride == sizeof(@type1@) &&
dst_stride == sizeof(@type2@)) {
Expand Down
Loading
0