Prototype: Use Neon SIMD for float16->float32 casts · numpy/numpy@9977b8a · GitHub
[go: up one dir, main page]

Skip to content

Commit 9977b8a

Browse files
committed
Prototype: Use Neon SIMD for float16->float32 casts
1 parent 8b226b4 commit 9977b8a

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

numpy/_core/src/multiarray/lowlevel_strided_loops.c.src

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
#define PY_SSIZE_T_CLEAN
1212
#include <Python.h>
1313

14+
#include <arm_neon.h>
1415
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
1516
#define _MULTIARRAYMODULE
1617
#define _UMATHMODULE
@@ -847,6 +848,26 @@ static NPY_GCC_OPT_3 int
847848
{
848849
npy_intp N = dimensions[0];
849850
char *src = args[0], *dst = args[1];
851+
#if @contig@ && @aligned@ && @is_half1@ && @is_float2@
852+
float16_t* input = (float16_t*)src;
853+
float* output = (float*)dst;
854+
npy_intp i = 0;
855+
size_t num_chunks_8 = N / 8;
856+
857+
for (size_t chunk = 0; chunk < num_chunks_8; ++chunk) {
858+
float16x8_t vec_f16 = vld1q_f16(input + i);
859+
float16x4_t vec_f16_low_half = vget_low_f16(vec_f16);
860+
float16x4_t vec_f16_high_half = vget_high_f16(vec_f16);
861+
float32x4_t vec_f32_low = vcvt_f32_f16(vec_f16_low_half);
862+
float32x4_t vec_f32_high = vcvt_f32_f16(vec_f16_high_half);
863+
vst1q_f32(output + i, vec_f32_low);
864+
vst1q_f32(output + i + 4, vec_f32_high);
865+
i += 8;
866+
}
867+
for (; i < N; ++i) {
868+
output[i] = (float)input[i];
869+
}
870+
#else
850871
#if !@contig@
851872
npy_intp src_stride = strides[0], dst_stride = strides[1];
852873
#endif
@@ -930,6 +951,7 @@ static NPY_GCC_OPT_3 int
930951
src += src_stride;
931952
#endif
932953
}
954+
#endif
933955
return 0;
934956
}
935957

0 commit comments

Comments
 (0)
0