@@ -116,12 +116,13 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
116
116
const __m128i shf1 = _mm512_castsi512_si128 (divisor .val [1 ]);
117
117
const __m128i shf2 = _mm512_castsi512_si128 (divisor .val [2 ]);
118
118
#ifdef NPY_HAVE_AVX512BW
119
+ const __m512i bmask = _mm512_set1_epi32 (0x00FF00FF );
119
120
const __m512i shf1b = _mm512_set1_epi8 (0xFFU >> _mm_cvtsi128_si32 (shf1 ));
120
121
const __m512i shf2b = _mm512_set1_epi8 (0xFFU >> _mm_cvtsi128_si32 (shf2 ));
121
122
// high part of unsigned multiplication
122
- __m512i mulhi_odd = _mm512_mulhi_epu16 (a , divisor .val [0 ]);
123
- __m512i mulhi_even = _mm512_mulhi_epu16 (_mm512_slli_epi16 (a , 8 ), divisor .val [0 ]);
123
+ __m512i mulhi_even = _mm512_mullo_epi16 (_mm512_and_si512 (a , bmask ), divisor .val [0 ]);
124
124
mulhi_even = _mm512_srli_epi16 (mulhi_even , 8 );
125
+ __m512i mulhi_odd = _mm512_mullo_epi16 (_mm512_srli_epi16 (a , 8 ), divisor .val [0 ]);
125
126
__m512i mulhi = _mm512_mask_mov_epi8 (mulhi_even , 0xAAAAAAAAAAAAAAAA , mulhi_odd );
126
127
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
127
128
__m512i q = _mm512_sub_epi8 (a , mulhi );
@@ -130,18 +131,18 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
130
131
q = _mm512_and_si512 (_mm512_srl_epi16 (q , shf2 ), shf2b );
131
132
return q ;
132
133
#else
133
- const __m256i bmask = _mm256_set1_epi32 (0xFF00FF00 );
134
+ const __m256i bmask = _mm256_set1_epi32 (0x00FF00FF );
134
135
const __m256i shf1b = _mm256_set1_epi8 (0xFFU >> _mm_cvtsi128_si32 (shf1 ));
135
136
const __m256i shf2b = _mm256_set1_epi8 (0xFFU >> _mm_cvtsi128_si32 (shf2 ));
136
137
const __m512i shf2bw = npyv512_combine_si256 (shf2b , shf2b );
137
138
const __m256i mulc = npyv512_lower_si256 (divisor .val [0 ]);
138
139
//// lower 256-bit
139
140
__m256i lo_a = npyv512_lower_si256 (a );
140
141
// high part of unsigned multiplication
141
- __m256i mulhi_odd = _mm256_mulhi_epu16 (lo_a , mulc );
142
- __m256i mulhi_even = _mm256_mulhi_epu16 (_mm256_slli_epi16 (lo_a , 8 ), mulc );
142
+ __m256i mulhi_even = _mm256_mullo_epi16 (_mm256_and_si256 (lo_a , bmask ), mulc );
143
143
mulhi_even = _mm256_srli_epi16 (mulhi_even , 8 );
144
- __m256i mulhi = _mm256_blendv_epi8 (mulhi_even , mulhi_odd , bmask );
144
+ __m256i mulhi_odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (lo_a , 8 ), mulc );
145
+ __m256i mulhi = _mm256_blendv_epi8 (mulhi_odd , mulhi_even , bmask );
145
146
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
146
147
__m256i lo_q = _mm256_sub_epi8 (lo_a , mulhi );
147
148
lo_q = _mm256_and_si256 (_mm256_srl_epi16 (lo_q , shf1 ), shf1b );
@@ -151,10 +152,10 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
151
152
//// higher 256-bit
152
153
__m256i hi_a = npyv512_higher_si256 (a );
153
154
// high part of unsigned multiplication
154
- mulhi_odd = _mm256_mulhi_epu16 (hi_a , mulc );
155
- mulhi_even = _mm256_mulhi_epu16 (_mm256_slli_epi16 (hi_a , 8 ), mulc );
155
+ mulhi_even = _mm256_mullo_epi16 (_mm256_and_si256 (hi_a , bmask ), mulc );
156
156
mulhi_even = _mm256_srli_epi16 (mulhi_even , 8 );
157
- mulhi = _mm256_blendv_epi8 (mulhi_even , mulhi_odd , bmask );
157
+ mulhi_odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (hi_a , 8 ), mulc );
158
+ mulhi = _mm256_blendv_epi8 (mulhi_odd , mulhi_even , bmask );
158
159
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
159
160
__m256i hi_q = _mm256_sub_epi8 (hi_a , mulhi );
160
161
hi_q = _mm256_and_si256 (_mm256_srl_epi16 (hi_q , shf1 ), shf1b );
0 commit comments