8000 tumbling down (test AVX2) · numpy/numpy@8d4ae79 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8d4ae79

Browse files
committed
tumbling down (test AVX2)
1 parent 95c485a commit 8d4ae79

File tree

4 files changed

+231
-247
lines changed

4 files changed

+231
-247
lines changed

numpy/core/src/common/simd/avx2/conversion.h

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -38,11 +38,10 @@ NPY_FINLINE npyv_b8 npyv_pack_b16(npyv_b16 a, npyv_b16 b)
3838
// pack four 32-bit boolean vectors into one 8-bit boolean vector
3939
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d)
4040
{
41-
const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); // removed: index table consumed by the permutevar8x32 call below
4241
__m256i ab = _mm256_packs_epi32(a, b); // narrow a and b from 32-bit to 16-bit elements (signed saturation, done per 128-bit half)
4342
__m256i cd = _mm256_packs_epi32(c, d); // same narrowing for c and d
44-
__m256i abcd = _mm256_packs_epi16(ab, cd); // removed: direct 16-bit -> 8-bit narrowing
45-
return _mm256_permutevar8x32_epi32(abcd, perm); // removed: reorder of the 32-bit groups across the whole register via perm
43+
__m256i abcd = npyv_pack_b16(ab, cd); // added: delegate the 16-bit -> 8-bit step to npyv_pack_b16
44+
return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)); // added: swap the two middle 32-bit groups inside each 128-bit half; NOTE(review): presumably this completes the reordering started in npyv_pack_b16 -- confirm against its body
4645
}
4746
// pack eight 64-bit boolean vectors into one 8-bit boolean vector
4847
NPY_FINLINE npyv_b16 npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,

numpy/core/src/common/simd/avx2/memory.h

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,19 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, int stride)
117117
// Strided float load: forwards to npyv_loadn_u32 and reinterprets the
// resulting integer vector as floats (bit cast only, no value conversion).
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, int stride)
118118
{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
119119
//// 64
120+
// Strided load of four doubles: gathers ptr[0], ptr[stride], ptr[stride*2]
// and ptr[stride*3] into one 256-bit vector, in that element order.
// `stride` is counted in elements (it is applied via pointer arithmetic on
// a `const double *`), not in bytes.
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, int stride)
121+
{
122+
__m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); // low 64 bits = ptr[0], upper 64 bits zeroed
123+
__m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))); // low 64 bits = ptr[stride*2], upper 64 bits zeroed
124+
__m128d a01 = _mm_loadh_pd(a0, ptr + stride); // keep the low half, fill the upper half with ptr[stride]
125+
__m128d a23 = _mm_loadh_pd(a2, ptr + stride*3); // keep the low half, fill the upper half with ptr[stride*3]
126+
return _mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1); // a01 -> lower 128 bits, a23 -> upper 128 bits
127+
}
128+
// Strided load of unsigned 64-bit integers: reuses npyv_loadn_f64 on the
// same addresses and reinterprets the loaded bits as an integer vector
// (bit cast only, no value conversion).
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, int stride)
129+
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
130+
// Strided load of signed 64-bit integers: reuses npyv_loadn_f64 on the
// same addresses and reinterprets the loaded bits as an integer vector
// (bit cast only, no value conversion).
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, int stride)
131+
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
132+
/*
120133
NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, int stride)
121134
{
122135
const __m128i steps = _mm_setr_epi32(0, 1, 2, 3);
@@ -127,6 +140,7 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, int stride)
127140
{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
128141
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, int stride)
129142
{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
143+
*/
130144

131145
/***************************
132146
* Non-contiguous Store

numpy/core/src/common/simd/avx2/operators.h

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
9999
#define npyv_xor_f64 _mm256_xor_pd
100100

101101
// NOT
102-
#define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1)) // removed form: A XOR all-ones
102+
#define npyv_not_u8(A) _mm256_andnot_si256(A, _mm256_set1_epi32(-1)) // added form: (~A) AND all-ones, i.e. the same bitwise NOT of A
103103
#define npyv_not_s8 npyv_not_u8
104104
#define npyv_not_b8 npyv_not_u8
105105
#define npyv_not_u16 npyv_not_u8
@@ -151,6 +151,7 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
151151
#define npyv_cmpge_s64(A, B) npyv_not_s64(_mm256_cmpgt_epi64(B, A)) // A >= B expressed as NOT(B > A) on 64-bit elements
152152

153153
// unsigned greater than
154+
/*
154155
#define NPYV_IMPL_AVX2_UNSIGNED_GT(LEN, SIGN) \
155156
NPY_FINLINE __m256i npyv_cmpgt_u##LEN(__m256i a, __m256i b) \
156157
{ \
@@ -163,7 +164,13 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
163164
NPYV_IMPL_AVX2_UNSIGNED_GT(8, 0x80808080)
164165
NPYV_IMPL_AVX2_UNSIGNED_GT(16, 0x80008000)
165166
NPYV_IMPL_AVX2_UNSIGNED_GT(32, 0x80000000)
166-
167+
*/
168+
// Unsigned a > b on 8-bit elements. b == max(b, a) holds exactly when
// a <= b, so negating that equality mask leaves the elements where a > b.
NPY_FINLINE __m256i npyv_cmpgt_u8(__m256i a, __m256i b)
169+
{ return npyv_not_u8(_mm256_cmpeq_epi8(b, _mm256_max_epu8(b, a))); }
170+
// Unsigned a > b on 16-bit elements. b == max(b, a) holds exactly when
// a <= b, so negating that equality mask leaves the elements where a > b.
NPY_FINLINE __m256i npyv_cmpgt_u16(__m256i a, __m256i b)
171+
{ return npyv_not_u16(_mm256_cmpeq_epi16(b, _mm256_max_epu16(b, a))); }
172+
// Unsigned a > b on 32-bit elements. b == max(b, a) holds exactly when
// a <= b, so negating that equality mask leaves the elements where a > b.
NPY_FINLINE __m256i npyv_cmpgt_u32(__m256i a, __m256i b)
173+
{ return npyv_not_u32(_mm256_cmpeq_epi32(b, _mm256_max_epu32(b, a))); }
167174
NPY_FINLINE __m256i npyv_cmpgt_u64(__m256i a, __m256i b)
168175
{
169176
const __m256i sbit = _mm256_set1_epi64x(0x8000000000000000);

0 commit comments

Comments
 (0)
0