@@ -224,21 +224,18 @@ int avx2_double_compressstore32(void *left_addr,
224
224
typename avx2_vector<T>::reg_t reg)
225
225
{
226
226
using vtype = avx2_vector<T>;
227
- const __m256i oxff = _mm256_set1_epi32 (0xFFFFFFFF );
228
227
229
228
T *leftStore = (T *)left_addr;
230
229
T *rightStore = (T *)right_addr;
231
230
232
231
int32_t shortMask = convert_avx2_mask_to_int (k);
233
232
const __m256i &perm = _mm256_loadu_si256 (
234
233
(const __m256i *)avx2_compressstore_lut32_perm[shortMask].data ());
235
- const __m256i &left = _mm256_loadu_si256 (
236
- (const __m256i *)avx2_compressstore_lut32_left[shortMask].data ());
237
234
238
235
typename vtype::reg_t temp = vtype::permutevar (reg, perm);
239
236
240
- vtype::mask_storeu (leftStore, left , temp);
241
- vtype::mask_storeu (rightStore, _mm256_xor_si256 (oxff, left) , temp);
237
+ vtype::storeu (leftStore, temp);
238
+ vtype::storeu (rightStore, temp);
242
239
243
240
return _mm_popcnt_u32 (shortMask);
244
241
}
@@ -250,22 +247,19 @@ int32_t avx2_double_compressstore64(void *left_addr,
250
247
typename avx2_vector<T>::reg_t reg)
251
248
{
252
249
using vtype = avx2_vector<T>;
253
- const __m256i oxff = _mm256_set1_epi32 (0xFFFFFFFF );
254
250
255
251
T *leftStore = (T *)left_addr;
256
252
T *rightStore = (T *)right_addr;
257
253
258
254
int32_t shortMask = convert_avx2_mask_to_int_64bit (k);
259
255
const __m256i &perm = _mm256_loadu_si256 (
260
256
(const __m256i *)avx2_compressstore_lut64_perm[shortMask].data ());
261
- const __m256i &left = _mm256_loadu_si256 (
262
- (const __m256i *)avx2_compressstore_lut64_left[shortMask].data ());
263
257
264
258
typename vtype::reg_t temp = vtype::cast_from (
265
259
_mm256_permutevar8x32_epi32 (vtype::cast_to (reg), perm));
266
260
267
- vtype::mask_storeu (leftStore, left , temp);
268
- vtype::mask_storeu (rightStore, _mm256_xor_si256 (oxff, left) , temp);
261
+ vtype::storeu (leftStore, temp);
262
+ vtype::storeu (rightStore, temp);
269
263
270
264
return _mm_popcnt_u32 (shortMask);
271
265
}
0 commit comments