WIP::ENH:SIMD Improve the performance of comparison operators by seiko2plus · Pull Request #16960 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

WIP::ENH:SIMD Improve the performance of comparison operators #16960

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Prev Previous commit
Next Next commit
ENH:NPYV add logical intrinsics for boolean vectors
  • Loading branch information
seiko2plus committed Aug 3, 2020
commit a9292b5333adad39e240c6dae821d3017152e4e9
16 changes: 16 additions & 0 deletions numpy/core/src/common/simd/avx2/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,48 +53,64 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
// AND
#define npyv_and_u8 _mm256_and_si256
#define npyv_and_s8 _mm256_and_si256
#define npyv_and_b8 _mm256_and_si256
#define npyv_and_u16 _mm256_and_si256
#define npyv_and_s16 _mm256_and_si256
#define npyv_and_b16 _mm256_and_si256
#define npyv_and_u32 _mm256_and_si256
#define npyv_and_s32 _mm256_and_si256
#define npyv_and_b32 _mm256_and_si256
#define npyv_and_u64 _mm256_and_si256
#define npyv_and_s64 _mm256_and_si256
#define npyv_and_b64 _mm256_and_si256
#define npyv_and_f32 _mm256_and_ps
#define npyv_and_f64 _mm256_and_pd

// OR
#define npyv_or_u8 _mm256_or_si256
#define npyv_or_s8 _mm256_or_si256
#define npyv_or_b8 _mm256_or_si256
#define npyv_or_u16 _mm256_or_si256
#define npyv_or_s16 _mm256_or_si256
#define npyv_or_b16 _mm256_or_si256
#define npyv_or_u32 _mm256_or_si256
#define npyv_or_s32 _mm256_or_si256
#define npyv_or_b32 _mm256_or_si256
#define npyv_or_u64 _mm256_or_si256
#define npyv_or_s64 _mm256_or_si256
#define npyv_or_b64 _mm256_or_si256
#define npyv_or_f32 _mm256_or_ps
#define npyv_or_f64 _mm256_or_pd

// XOR
#define npyv_xor_u8 _mm256_xor_si256
#define npyv_xor_s8 _mm256_xor_si256
#define npyv_xor_b8 _mm256_xor_si256
#define npyv_xor_u16 _mm256_xor_si256
#define npyv_xor_s16 _mm256_xor_si256
#define npyv_xor_b16 _mm256_xor_si256
#define npyv_xor_u32 _mm256_xor_si256
#define npyv_xor_s32 _mm256_xor_si256
#define npyv_xor_b32 _mm256_xor_si256
#define npyv_xor_u64 _mm256_xor_si256
#define npyv_xor_s64 _mm256_xor_si256
#define npyv_xor_b64 _mm256_xor_si256
#define npyv_xor_f32 _mm256_xor_ps
#define npyv_xor_f64 _mm256_xor_pd

// NOT
#define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1))
#define npyv_not_s8 npyv_not_u8
#define npyv_not_b8 npyv_not_u8
#define npyv_not_u16 npyv_not_u8
#define npyv_not_s16 npyv_not_u8
#define npyv_not_b16 npyv_not_u8
#define npyv_not_u32 npyv_not_u8
#define npyv_not_s32 npyv_not_u8
#define npyv_not_b32 npyv_not_u8
#define npyv_not_u64 npyv_not_u8
#define npyv_not_s64 npyv_not_u8
#define npyv_not_b64 npyv_not_u8
#define npyv_not_f32(A) _mm256_xor_ps(A, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
#define npyv_not_f64(A) _mm256_xor_pd(A, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))

Expand Down
43 changes: 40 additions & 3 deletions numpy/core/src/common/simd/avx512/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,23 @@
#define npyv_and_s32 _mm512_and_si512
#define npyv_and_u64 _mm512_and_si512
#define npyv_and_s64 _mm512_and_si512
#define npyv_and_b32 _kand_mask16
#ifdef NPY_HAVE_AVX512BW
#define npyv_and_b8 _kand_mask64
#define npyv_and_b16 _kand_mask32
#else
#define npyv_and_b8 _mm512_and_si512
#define npyv_and_b16 _mm512_and_si512
#endif
#ifdef NPY_HAVE_AVX512DQ
#define npyv_and_b64 _kand_mask8
#define npyv_and_f32 _mm512_and_ps
#define npyv_and_f64 _mm512_and_pd
#else
#define npyv_and_b64(A, B) ((__mmask8)_kand_mask16((__mmask16)(A), (__mmask16)(B)))
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_and_f32, _mm512_and_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_and_f64, _mm512_and_si512)
#endif

// OR
#define npyv_or_u8 _mm512_or_si512
#define npyv_or_s8 _mm512_or_si512
Expand All @@ -100,14 +109,23 @@
#define npyv_or_s32 _mm512_or_si512
#define npyv_or_u64 _mm512_or_si512
#define npyv_or_s64 _mm512_or_si512
#define npyv_or_b32 _kor_mask16
#ifdef NPY_HAVE_AVX512BW
#define npyv_or_b8 _kor_mask64
#define npyv_or_b16 _kor_mask32
#else
#define npyv_or_b8 _mm512_or_si512
#define npyv_or_b16 _mm512_or_si512
#endif
#ifdef NPY_HAVE_AVX512DQ
#define npyv_or_b64 _kor_mask8
#define npyv_or_f32 _mm512_or_ps
#define npyv_or_f64 _mm512_or_pd
#else
#define npyv_or_b64(A, B) ((__mmask8)_kor_mask16((__mmask16)(A), (__mmask16)(B)))
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_or_f32, _mm512_or_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_or_f64, _mm512_or_si512)
#endif

// XOR
#define npyv_xor_u8 _mm512_xor_si512
#define npyv_xor_s8 _mm512_xor_si512

Expand All

@@ -117,14 +135,23 @@
#define npyv_xor_s32 _mm512_xor_si512
#define npyv_xor_u64 _mm512_xor_si512
#define npyv_xor_s64 _mm512_xor_si512
#define npyv_xor_b32 _kxor_mask16
#ifdef NPY_HAVE_AVX512BW
#define npyv_xor_b8 _kxor_mask64
#define npyv_xor_b16 _kxor_mask32
#else
#define npyv_xor_b8 _mm512_xor_si512
#define npyv_xor_b16 _mm512_xor_si512
#endif
#ifdef NPY_HAVE_AVX512DQ
#define npyv_xor_b64 _kxor_mask8
#define npyv_xor_f32 _mm512_xor_ps
#define npyv_xor_f64 _mm512_xor_pd
#else
#define npyv_xor_b64(A, B) ((__mmask8)_kxor_mask16((__mmask16)(A), (__mmask16)(B)))
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_xor_f32, _mm512_xor_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_xor_f64, _mm512_xor_si512)
#endif

// NOT
#define npyv_not_u8(A) _mm512_xor_si512(A, _mm512_set1_epi32(-1))
#define npyv_not_s8 npyv_not_u8
Expand All @@ -134,10 +161,20 @@
#define npyv_not_s32 npyv_not_u8
#define npyv_not_u64 npyv_not_u8
#define npyv_not_s64 npyv_not_u8
// NOT for boolean vectors and floats.
// b32 booleans are always AVX512F 16-bit mask registers.
#define npyv_not_b32 _knot_mask16
#ifdef NPY_HAVE_AVX512BW
// with AVX512BW, 8/16-bit booleans are 64/32-bit mask registers
#define npyv_not_b8  _knot_mask64
#define npyv_not_b16 _knot_mask32
#else
// without AVX512BW, 8/16-bit booleans are plain __m512i vectors
#define npyv_not_b8  npyv_not_u8
#define npyv_not_b16 npyv_not_u8
#endif
#ifdef NPY_HAVE_AVX512DQ
#define npyv_not_b64 _knot_mask8
#define npyv_not_f32(A) _mm512_xor_ps(A, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
#define npyv_not_f64(A) _mm512_xor_pd(A, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
#else
// BUGFIX: NOT is a unary operation — the original macro took (A, B) and
// passed two arguments to _knot_mask16, which accepts a single __mmask16
// operand, so any expansion would fail to compile. Widen b64 (__mmask8)
// to __mmask16, complement, and narrow back.
#define npyv_not_b64(A) ((__mmask8)_knot_mask16((__mmask16)(A)))
#define npyv_not_f32(A) _mm512_castsi512_ps(npyv_not_u32(_mm512_castps_si512(A)))
#define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
#endif
Expand Down
16 changes: 16 additions & 0 deletions numpy/core/src/common/simd/neon/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,16 @@
// AND
#define npyv_and_u8 vandq_u8
#define npyv_and_s8 vandq_s8
#define npyv_and_b8 vandq_u8
#define npyv_and_u16 vandq_u16
#define npyv_and_s16 vandq_s16
#define npyv_and_b16 vandq_u16
#define npyv_and_u32 vandq_u32
#define npyv_and_s32 vandq_s32
#define npyv_and_b32 vandq_u32
#define npyv_and_u64 vandq_u64
#define npyv_and_s64 vandq_s64
#define npyv_and_b64 vandq_u64
#define npyv_and_f32(A, B) \
vreinterpretq_f32_u8(vandq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_and_f64(A, B) \
Expand All @@ -62,12 +66,16 @@
// OR
#define npyv_or_u8 vorrq_u8
#define npyv_or_s8 vorrq_s8
#define npyv_or_b8 vorrq_u8
#define npyv_or_u16 vorrq_u16
#define npyv_or_s16 vorrq_s16
#define npyv_or_b16 vorrq_u16
#define npyv_or_u32 vorrq_u32
#define npyv_or_s32 vorrq_s32
#define npyv_or_b32 vorrq_u32
#define npyv_or_u64 vorrq_u64
#define npyv_or_s64 vorrq_s64
#define npyv_or_b64 vorrq_u64
#define npyv_or_f32(A, B) \
vreinterpretq_f32_u8(vorrq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_or_f64(A, B) \
Expand All @@ -76,12 +84,16 @@
// XOR
#define npyv_xor_u8 veorq_u8
#define npyv_xor_s8 veorq_s8
#define npyv_xor_b8 veorq_u8
#define npyv_xor_u16 veorq_u16
#define npyv_xor_s16 veorq_s16
#define npyv_xor_b16 veorq_u16
#define npyv_xor_u32 veorq_u32
#define npyv_xor_s32 veorq_s32
#define npyv_xor_b32 veorq_u32
#define npyv_xor_u64 veorq_u64
#define npyv_xor_s64 veorq_s64
#define npyv_xor_b64 veorq_u64
#define npyv_xor_f32(A, B) \
vreinterpretq_f32_u8(veorq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_xor_f64(A, B) \
Expand All @@ -90,12 +102,16 @@
// NOT
#define npyv_not_u8 vmvnq_u8
#define npyv_not_s8 vmvnq_s8
#define npyv_not_b8 vmvnq_u8
#define npyv_not_u16 vmvnq_u16
#define npyv_not_s16 vmvnq_s16
#define npyv_not_b16 vmvnq_u16
#define npyv_not_u32 vmvnq_u32
#define npyv_not_s32 vmvnq_s32
#define npyv_not_b32 vmvnq_u32
#define npyv_not_u64(A) vreinterpretq_u64_u8(vmvnq_u8(vreinterpretq_u8_u64(A)))
#define npyv_not_s64(A) vreinterpretq_s64_u8(vmvnq_u8(vreinterpretq_u8_s64(A)))
#define npyv_not_b64(A) vreinterpretq_u64_u8(vmvnq_u8(vreinterpretq_u8_u64(A)))
#define npyv_not_f32(A) vreinterpretq_f32_u8(vmvnq_u8(vreinterpretq_u8_f32(A)))
#define npyv_not_f64(A) vreinterpretq_f64_u8(vmvnq_u8(vreinterpretq_u8_f64(A)))

Expand Down
16 changes: 16 additions & 0 deletions numpy/core/src/common/simd/sse/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,48 +54,64 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
// AND
#define npyv_and_u8 _mm_and_si128
#define npyv_and_s8 _mm_and_si128
#define npyv_and_b8 _mm_and_si128
#define npyv_and_u16 _mm_and_si128
#define npyv_and_s16 _mm_and_si128
#define npyv_and_b16 _mm_and_si128
#define npyv_and_u32 _mm_and_si128
#define npyv_and_s32 _mm_and_si128
#define npyv_and_b32 _mm_and_si128
#define npyv_and_u64 _mm_and_si128
#define npyv_and_s64 _mm_and_si128
#define npyv_and_b64 _mm_and_si128
#define npyv_and_f32 _mm_and_ps
#define npyv_and_f64 _mm_and_pd

// OR
#define npyv_or_u8 _mm_or_si128
#define npyv_or_s8 _mm_or_si128
#define npyv_or_b8 _mm_or_si128
#define npyv_or_u16 _mm_or_si128
#define npyv_or_s16 _mm_or_si128
#define npyv_or_b16 _mm_or_si128
#define npyv_or_u32 _mm_or_si128
#define npyv_or_s32 _mm_or_si128
#define npyv_or_b32 _mm_or_si128
#define npyv_or_u64 _mm_or_si128
#define npyv_or_s64 _mm_or_si128
#define npyv_or_b64 _mm_or_si128
#define npyv_or_f32 _mm_or_ps
#define npyv_or_f64 _mm_or_pd

// XOR
#define npyv_xor_u8 _mm_xor_si128
#define npyv_xor_s8 _mm_xor_si128
#define npyv_xor_b8 _mm_xor_si128
#define npyv_xor_u16 _mm_xor_si128
#define npyv_xor_s16 _mm_xor_si128
#define npyv_xor_b16 _mm_xor_si128
#define npyv_xor_u32 _mm_xor_si128
#define npyv_xor_s32 _mm_xor_si128
#define npyv_xor_b32 _mm_xor_si128
#define npyv_xor_u64 _mm_xor_si128
#define npyv_xor_s64 _mm_xor_si128
#define npyv_xor_b64 _mm_xor_si128
#define npyv_xor_f32 _mm_xor_ps
#define npyv_xor_f64 _mm_xor_pd

// NOT
#define npyv_not_u8(A) _mm_xor_si128(A, _mm_set1_epi32(-1))
#define npyv_not_s8 npyv_not_u8
#define npyv_not_b8 npyv_not_u8
#define npyv_not_u16 npyv_not_u8
#define npyv_not_s16 npyv_not_u8
#define npyv_not_b16 npyv_not_u8
#define npyv_not_u32 npyv_not_u8
#define npyv_not_s32 npyv_not_u8
#define npyv_not_b32 npyv_not_u8
#define npyv_not_u64 npyv_not_u8
#define npyv_not_s64 npyv_not_u8
#define npyv_not_b64 npyv_not_u8
#define npyv_not_f32(A) _mm_xor_ps(A, _mm_castsi128_ps(_mm_set1_epi32(-1)))
#define npyv_not_f64(A) _mm_xor_pd(A, _mm_castsi128_pd(_mm_set1_epi32(-1)))

Expand Down
13 changes: 12 additions & 1 deletion numpy/core/src/common/simd/vsx/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,41 +48,52 @@
// AND
#define npyv_and_u8 vec_and
#define npyv_and_s8 vec_and
#define npyv_and_b8 vec_and
#define npyv_and_u16 vec_and
#define npyv_and_s16 vec_and
#define npyv_and_b16 vec_and
#define npyv_and_u32 vec_and
#define npyv_and_s32 vec_and
#define npyv_and_b32 vec_and
#define npyv_and_u64 vec_and
#define npyv_and_s64 vec_and
#define npyv_and_b64 vec_and
#define npyv_and_f32 vec_and
#define npyv_and_f64 vec_and

// OR
#define npyv_or_u8 vec_or
#define npyv_or_s8 vec_or
#define npyv_or_b8 vec_or
#define npyv_or_u16 vec_or
#define npyv_or_s16 vec_or
#define npyv_or_b16 vec_or
#define npyv_or_u32 vec_or
#define npyv_or_s32 vec_or
#define npyv_or_b32 vec_or
#define npyv_or_u64 vec_or
#define npyv_or_s64 vec_or
#define npyv_or_b64 vec_or
#define npyv_or_f32 vec_or
#define npyv_or_f64 vec_or

// XOR
#define npyv_xor_u8 vec_xor
#define npyv_xor_s8 vec_xor
#define npyv_xor_b8 vec_xor
#define npyv_xor_u16 vec_xor
#define npyv_xor_s16 vec_xor
#define npyv_xor_b16 vec_xor
#define npyv_xor_u32 vec_xor
#define npyv_xor_s32 vec_xor
#define npyv_xor_b32 vec_xor
#define npyv_xor_u64 vec_xor
#define npyv_xor_s64 vec_xor
#define npyv_xor_b64 vec_xor
#define npyv_xor_f32 vec_xor
#define npyv_xor_f64 vec_xor

// NOT
// note: we implement npyv_not_b* (boolean types) for internal use
#define NPYV_IMPL_VSX_NOT_INT(VEC_LEN) \
NPY_FINLINE npyv_u##VEC_LEN npyv_not_u##VEC_LEN(npyv_u##VEC_LEN a) \
{ return vec_nor(a, a); } \
Expand Down
0