WIP::ENH:SIMD Improve the performance of comparison operators by seiko2plus · Pull Request #16960 · numpy/numpy · GitHub

WIP::ENH:SIMD Improve the performance of comparison operators #16960


Closed
wants to merge 9 commits into from
tumbling down (test AVX2)
seiko2plus committed Aug 11, 2020
commit 8d4ae793660545526cd126d0c6e3b27f4653ef0f
5 changes: 2 additions & 3 deletions numpy/core/src/common/simd/avx2/conversion.h
@@ -38,11 +38,10 @@ NPY_FINLINE npyv_b8 npyv_pack_b16(npyv_b16 a, npyv_b16 b)
// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d)
{
-    const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
     __m256i ab = _mm256_packs_epi32(a, b);
     __m256i cd = _mm256_packs_epi32(c, d);
-    __m256i abcd = _mm256_packs_epi16(ab, cd);
-    return _mm256_permutevar8x32_epi32(abcd, perm);
+    __m256i abcd = npyv_pack_b16(ab, cd);
+    return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0));
}
// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b16 npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
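For reference, here is a scalar sketch (illustrative only, not part of the patch) of what the comment above says npyv_pack_b8_b32 should produce: the four 32-bit boolean inputs, each holding 8 elements that are 0 or -1, are narrowed into one vector of 32 byte-sized booleans in the order a, b, c, d. Because the AVX2 pack instructions work within 128-bit lanes, the vector version needs a permute or shuffle afterwards to restore that order.

#include <stdint.h>

/* Illustrative scalar reference for the intended semantics of
 * npyv_pack_b8_b32: truncating each 32-bit boolean (0 or -1) to a byte
 * keeps its value, and the output preserves the order a, b, c, d. */
static void scalar_pack_b8_b32(const int32_t a[8], const int32_t b[8],
                               const int32_t c[8], const int32_t d[8],
                               int8_t out[32])
{
    for (int i = 0; i < 8; ++i) {
        out[i]      = (int8_t)a[i];
        out[i + 8]  = (int8_t)b[i];
        out[i + 16] = (int8_t)c[i];
        out[i + 24] = (int8_t)d[i];
    }
}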
14 changes: 14 additions & 0 deletions numpy/core/src/common/simd/avx2/memory.h
@@ -117,6 +117,19 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, int stride)
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, int stride)
{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
//// 64
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, int stride)
+{
+    __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+    __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+    __m128d a01 = _mm_loadh_pd(a0, ptr + stride);
+    __m128d a23 = _mm_loadh_pd(a2, ptr + stride*3);
+    return _mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, int stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, int stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+/*
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, int stride)
{
const __m128i steps = _mm_setr_epi32(0, 1, 2, 3);
@@ -127,6 +140,7 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, int stride)
{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, int stride)
{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+*/

/***************************
* Non-contiguous Store
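The new npyv_loadn_f64 builds the 256-bit register from two half-width loads plus two _mm_loadh_pd inserts rather than a gather. A scalar sketch of what the non-contiguous load computes (illustrative only, not part of the patch): element i comes from ptr + i*stride.

/* Illustrative scalar equivalent of the strided (non-contiguous) load:
 * gather four doubles spaced `stride` elements apart. */
static void scalar_loadn_f64(const double *ptr, int stride, double out[4])
{
    for (int i = 0; i < 4; ++i) {
        out[i] = ptr[i * stride];
    }
}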
11 changes: 9 additions & 2 deletions numpy/core/src/common/simd/avx2/operators.h
@@ -99,7 +99,7 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_xor_f64 _mm256_xor_pd

// NOT
-#define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1))
+#define npyv_not_u8(A) _mm256_andnot_si256(A, _mm256_set1_epi32(-1))
#define npyv_not_s8 npyv_not_u8
#define npyv_not_b8 npyv_not_u8
#define npyv_not_u16 npyv_not_u8
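The NOT macros switch from xor-with-all-ones to andnot-with-all-ones; both spellings compute the bitwise complement, since _mm256_andnot_si256(A, B) evaluates (~A) & B. A quick scalar sanity sketch of the equivalence (illustrative, not from the patch):

#include <assert.h>
#include <stdint.h>

static void check_not_equivalence(uint32_t a)
{
    uint32_t ones = ~(uint32_t)0;
    uint32_t via_xor    = a ^ ones;      /* old spelling: A ^ 0xFFFFFFFF    */
    uint32_t via_andnot = (~a) & ones;   /* new spelling: (~A) & 0xFFFFFFFF */
    assert(via_xor == via_andnot);       /* both equal ~a                   */
}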
@@ -151,6 +151,7 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_cmpge_s64(A, B) npyv_not_s64(_mm256_cmpgt_epi64(B, A))

// unsigned greater than
+/*
#define NPYV_IMPL_AVX2_UNSIGNED_GT(LEN, SIGN) \
NPY_FINLINE __m256i npyv_cmpgt_u##LEN(__m256i a, __m256i b) \
{ \
@@ -163,7 +164,13 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
NPYV_IMPL_AVX2_UNSIGNED_GT(8, 0x80808080)
NPYV_IMPL_AVX2_UNSIGNED_GT(16, 0x80008000)
NPYV_IMPL_AVX2_UNSIGNED_GT(32, 0x80000000)

+*/
+NPY_FINLINE __m256i npyv_cmpgt_u8(__m256i a, __m256i b)
+{ return npyv_not_u8(_mm256_cmpeq_epi8(b, _mm256_max_epu8(b, a))); }
+NPY_FINLINE __m256i npyv_cmpgt_u16(__m256i a, __m256i b)
+{ return npyv_not_u16(_mm256_cmpeq_epi16(b, _mm256_max_epu16(b, a))); }
+NPY_FINLINE __m256i npyv_cmpgt_u32(__m256i a, __m256i b)
+{ return npyv_not_u32(_mm256_cmpeq_epi32(b, _mm256_max_epu32(b, a))); }
NPY_FINLINE __m256i npyv_cmpgt_u64(__m256i a, __m256i b)
{
const __m256i sbit = _mm256_set1_epi64x(0x8000000000000000);
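The replacement unsigned greater-than drops the sign-flip trick of the commented-out macro in favour of max/cmpeq: for unsigned a and b, a > b exactly when b != max(a, b), so cmpgt_u falls out of max, cmpeq, and not. A scalar sketch of that identity (illustrative, not from the patch):

#include <assert.h>
#include <stdint.h>

static void check_cmpgt_u8(uint8_t a, uint8_t b)
{
    uint8_t m = (a > b) ? a : b;   /* per-element max, like _mm256_max_epu8 */
    int direct  = (a > b);
    int via_max = (b != m);        /* not(cmpeq(b, max(b, a)))              */
    assert(direct == via_max);
}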