@@ -196,7 +196,12 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
     __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
     __m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -205,7 +210,12 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
     const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
     __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
-    return _mm256_maskload_epi32((const int*)ptr, mask);
+    __m256i ret = _mm256_maskload_epi32((const int*)ptr, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 //// 64
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
@@ -216,7 +226,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
@@ -225,7 +240,12 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_maskload_epi64((const long long*)ptr, mask);
+    __m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 
 //// 64-bit nlane
@@ -241,7 +261,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -251,19 +276,29 @@ NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    npy_int64 m = -((npy_int64)(nlane > 1));
+    npy_int64 m = -((npy_int64)(nlane > 1));
     __m256i mask = npyv_set_s64(-1, -1, m, m);
-    return _mm256_maskload_epi64((const long long*)ptr, mask);
+    __m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
                                          npy_int64 fill_lo, npy_int64 fill_hi)
 {
     const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
-    npy_int64 m = -((npy_int64)(nlane > 1));
-    __m256i mask = npyv_set_s64(-1, -1, m, m);
+    npy_int64 m = -((npy_int64)(nlane > 1));
+    __m256i mask = npyv_set_s64(-1, -1, m, m);
     __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 /*********************************
  * Non-contiguous partial load
@@ -277,9 +312,14 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
     const __m256i vfill = _mm256_set1_epi32(fill);
     const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
     const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
-    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
-    __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
-    return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
+    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+    __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+    __m256i ret = _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32
@@ -293,9 +333,14 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
     const __m256i vfill = npyv_setall_s64(fill);
     const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
-    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
-    __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+    __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+    __m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64
@@ -313,17 +358,22 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
     );
     const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
-    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
-    __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+    __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+    __m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
 
 //// 128-bit load over 64-bit stride
 NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
-                                          npy_int64 fill_lo, npy_int64 fill_hi)
+                                          npy_int64 fill_lo, npy_int64 fill_hi)
 {
     assert(nlane > 0);
     __m256i a = npyv_loadl_s64(ptr);
@@ -336,7 +386,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
     __m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
 #endif
     __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
-    return _mm256_inserti128_si256(a, b, 1);
+    __m256i ret = _mm256_inserti128_si256(a, b, 1);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
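Every hunk above applies the same guard: the result of the partial (masked) load is copied through a volatile local and OR-ed with itself before being returned. The sketch below is a minimal, self-contained illustration of that pattern, not code taken from this change; the function name sketch_load_partial_s32 and the fallback definition of NPY_SIMD_GUARD_PARTIAL_LOAD are assumptions made for the example. The volatile round-trip forces the compiler to materialize the masked-load result, which appears intended to stop optimizers from widening or hoisting the partial load into a full-width read past the last valid lane. Compile with AVX2 enabled (e.g. -mavx2) for the intrinsics to be available.

/* Minimal sketch of the guarded-partial-load pattern shown in the diff above.
 * Assumptions: NPY_SIMD_GUARD_PARTIAL_LOAD would normally be defined per
 * compiler by the build system; here it defaults to 1 just for the sketch. */
#include <immintrin.h>
#include <assert.h>
#include <stddef.h>

#ifndef NPY_SIMD_GUARD_PARTIAL_LOAD
#define NPY_SIMD_GUARD_PARTIAL_LOAD 1  /* assumption for this sketch */
#endif

/* Load the first `nlane` 32-bit lanes from `ptr`, zeroing the rest. */
static inline __m256i sketch_load_partial_s32(const int *ptr, size_t nlane)
{
    assert(nlane > 0);
    const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
    __m256i mask   = _mm256_cmpgt_epi32(vnlane, steps);   // lane i active iff i < nlane
    __m256i ret    = _mm256_maskload_epi32(ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
    // Round-trip through a volatile copy so the optimizer cannot replace the
    // masked load with a full-width load that reads past the valid lanes;
    // the OR with itself is a no-op that keeps `ret` in a normal register.
    volatile __m256i workaround = ret;
    ret = _mm256_or_si256(workaround, ret);
#endif
    return ret;
}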