8000 ENH: Implement the NumPy C SIMD vectorization interface by seiko2plus · Pull Request #16397 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Implement the NumPy C SIMD vectorization interface #16397

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 10, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ENH: [3/4] implement the NumPy C SIMD vectorization interface
  implement the same intrinsics as X86 for NEON
  • Loading branch information
seiko2plus committed Jul 8, 2020
commit 5eab8e00081ffd32ffbedea6324f00904ec30624
78 changes: 78 additions & 0 deletions numpy/core/src/common/simd/neon/arithmetic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_ARITHMETIC_H
#define _NPY_SIMD_NEON_ARITHMETIC_H

/***************************
* Addition
***************************/
// non-saturated
#define npyv_add_u8 vaddq_u8
#define npyv_add_s8 vaddq_s8
#define npyv_add_u16 vaddq_u16
#define npyv_add_s16 vaddq_s16
#define npyv_add_u32 vaddq_u32
#define npyv_add_s32 vaddq_s32
#define npyv_add_u64 vaddq_u64
#define npyv_add_s64 vaddq_s64
#define npyv_add_f32 vaddq_f32
#define npyv_add_f64 vaddq_f64

// saturated
#define npyv_adds_u8 vqaddq_u8
#define npyv_adds_s8 vqaddq_s8
#define npyv_adds_u16 vqaddq_u16
#define npyv_adds_s16 vqaddq_s16

/***************************
* Subtraction
***************************/
// non-saturated
#define npyv_sub_u8 vsubq_u8
#define npyv_sub_s8 vsubq_s8
#define npyv_sub_u16 vsubq_u16
#define npyv_sub_s16 vsubq_s16
#define npyv_sub_u32 vsubq_u32
#define npyv_sub_s32 vsubq_s32
#define npyv_sub_u64 vsubq_u64
#define npyv_sub_s64 vsubq_s64
#define npyv_sub_f32 vsubq_f32
#define npyv_sub_f64 vsubq_f64

// saturated
#define npyv_subs_u8 vqsubq_u8
#define npyv_subs_s8 vqsubq_s8
#define npyv_subs_u16 vqsubq_u16
#define npyv_subs_s16 vqsubq_s16

/***************************
* Multiplication
***************************/
// non-saturated
#define npyv_mul_u8 vmulq_u8
#define npyv_mul_s8 vmulq_s8
#define npyv_mul_u16 vmulq_u16
#define npyv_mul_s16 vmulq_s16
#define npyv_mul_u32 vmulq_u32
#define npyv_mul_s32 vmulq_s32
#define npyv_mul_f32 vmulq_f32
#define npyv_mul_f64 vmulq_f64

/***************************
* Division
***************************/
#ifdef __aarch64__
#define npyv_div_f32 vdivq_f32
#else
NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
{
float32x4_t recip = vrecpeq_f32(b);
recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
return vmulq_f32(a, recip);
}
#endif
#define npyv_div_f64 vdivq_f64

#endif // _NPY_SIMD_NEON_ARITHMETIC_H
32 changes: 32 additions & 0 deletions numpy/core/src/common/simd/neon/conversion.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_CVT_H
#define _NPY_SIMD_NEON_CVT_H

// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(A) A
#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
#define npyv_cvt_u16_b16(A) A
#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
#define npyv_cvt_u32_b32(A) A
#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
#define npyv_cvt_u64_b64(A) A
#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)

// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(BL) BL
#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
#define npyv_cvt_b16_u16(BL) BL
#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
#define npyv_cvt_b32_u32(BL) BL
#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
#define npyv_cvt_b64_u64(BL) BL
#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)

#endif // _NPY_SIMD_NEON_CVT_H
49 changes: 49 additions & 0 deletions numpy/core/src/common/simd/neon/memory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_MEMORY_H
#define _NPY_SIMD_NEON_MEMORY_H

/***************************
* load/store
***************************/
// GCC requires literal type definitions for pointers types otherwise it causes ambiguous errors
#define NPYV_IMPL_NEON_MEM(SFX, CTYPE) \
NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \
{ return vld1q_##SFX((const CTYPE*)ptr); } \
NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \
{ return vld1q_##SFX((const CTYPE*)ptr); } \
NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \
{ return vld1q_##SFX((const CTYPE*)ptr); } \
NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \
{ \
return vcombine_##SFX( \
vld1_##SFX((const CTYPE*)ptr), vdup_n_##SFX(0) \
); \
} \
NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
{ vst1q_##SFX((CTYPE*)ptr, vec); } \
NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
{ vst1q_##SFX((CTYPE*)ptr, vec); } \
NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
{ vst1q_##SFX((CTYPE*)ptr, vec); } \
NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
{ vst1_##SFX((CTYPE*)ptr, vget_low_##SFX(vec)); } \
NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
{ vst1_##SFX((CTYPE*)ptr, vget_high_##SFX(vec)); }

NPYV_IMPL_NEON_MEM(u8, uint8_t)
NPYV_IMPL_NEON_MEM(s8, int8_t)
NPYV_IMPL_NEON_MEM(u16, uint16_t)
NPYV_IMPL_NEON_MEM(s16, int16_t)
NPYV_IMPL_NEON_MEM(u32, uint32_t)
NPYV_IMPL_NEON_MEM(s32, int32_t)
NPYV_IMPL_NEON_MEM(u64, uint64_t)
NPYV_IMPL_NEON_MEM(s64, int64_t)
NPYV_IMPL_NEON_MEM(f32, float)
#if NPY_SIMD_F64
NPYV_IMPL_NEON_MEM(f64, double)
#endif

#endif // _NPY_SIMD_NEON_MEMORY_H
Loading
0