SIMD: Refactor partial load Workaround for Clang · Python-Repository-Hub/numpy@83cec53 · GitHub

Commit 83cec53

SIMD: Refactor partial load Workaround for Clang
Clang optimizes aggressively, starting at the -O1 level, when the `-ftrapping-math` flag is not fully supported. When a vector register is partially loaded for an operation that requires the remaining lanes to be filled with specific values (e.g., divide operations need non-zero integers to prevent a divide-by-zero FP exception), Clang's optimizer recognizes that the full register is unnecessary for the subsequent partial store. Consequently, it optimizes out the step that fills the remaining elements with non-zero integers.

As a workaround, we apply the `volatile` keyword to the returned register, followed by a symmetric operation such as `or`, to inform the compiler that the full vector is needed.

This refactor moves the workaround from the source files into the universal intrinsic headers, which also guarantees that it is applied by all kernels. Furthermore, the workaround is disabled when the Clang compiler fully supports the `-ftrapping-math` flag.

This patch also enables the `-ftrapping-math` flag for clang-cl and suppresses floating-point exception warnings.
1 parent 0ed6bb1 commit 83cec53
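To illustrate the failure mode the commit message describes, here is a minimal, self-contained sketch (not taken from the NumPy sources) using plain AVX2 intrinsics. Without strict floating-point exception behavior, Clang may notice that only the first nlane lanes are consumed later and drop the blend that places 1.0f in the unused lanes, so a full-width divide can fault on garbage lanes; the volatile copy followed by a bitwise `or` keeps the fully filled register alive, mirroring the NPY_SIMD_GUARD_PARTIAL_LOAD blocks in the diffs below. The helper names (`load_till_f32_guarded`, `div_tail`) are made up for this example.

// Minimal sketch, not part of this commit. Compile with: clang -O2 -mavx2
#include <immintrin.h>
#include <stddef.h>

// Partially load nlane floats (nlane <= 8), filling the remaining lanes with 1.0f
// so a later full-width divide cannot raise a spurious divide-by-zero exception.
static __m256 load_till_f32_guarded(const float *ptr, size_t nlane)
{
    const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
    __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);        // -1 for valid lanes
    __m256 payload = _mm256_maskload_ps(ptr, mask);          // loads only valid lanes
    __m256 ret = _mm256_blendv_ps(_mm256_set1_ps(1.0f), payload,
                                  _mm256_castsi256_ps(mask));
    // The guard: a volatile copy plus a self-`or` forces the compiler to keep
    // the whole filled register instead of eliding the blend above.
    volatile __m256 workaround = ret;
    ret = _mm256_or_ps(workaround, ret);
    return ret;
}

// Divide only the nlane valid elements; the unused denominator lanes hold 1.0f.
static void div_tail(float *dst, const float *a, const float *b, size_t nlane)
{
    __m256 vr = _mm256_div_ps(load_till_f32_guarded(a, nlane),
                              load_till_f32_guarded(b, nlane));
    float tmp[8];
    _mm256_storeu_ps(tmp, vr);
    for (size_t i = 0; i < nlane && i < 8; ++i)
        dst[i] = tmp[i];                                      // partial store
}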

File tree

10 files changed: +332, -302 lines

meson.build

Lines changed: 27 additions & 5 deletions
@@ -55,11 +55,33 @@ add_project_arguments(
 #
 # Clang defaults to a non-strict floating error point model, but we need strict
 # behavior. `-ftrapping-math` is equivalent to `-ffp-exception-behavior=strict`.
-# Note that this is only supported on macOS arm64 as of XCode 14.3
-if cc.get_id() == 'clang'
-  add_project_arguments(
-    cc.get_supported_arguments('-ftrapping-math'), language: ['c', 'cpp'],
-  )
+# This flag is also required to prevent the activation of SIMD partial load workarounds.
+# For further clarification, refer to gh-24461.
+cc_id = cc.get_id()
+if cc_id.startswith('clang')
+  # Determine the compiler flags for trapping math exceptions.
+  trapping_math = {
+    'clang-cl': '/clang:-ftrapping-math',
+  }.get(cc_id, '-ftrapping-math')
+  # Check if the compiler supports the trapping math flag.
+  if cc.has_argument(trapping_math)
+    # TODO: Consider upgrading the vendored Meson to 1.3.0 to support the parameter `werror`
+    # Detect whether the compiler actually supports strict handling of floating-point exceptions
+    # by treating warnings as errors.
+    if cc.compiles('int main() { return 0; }', args: [trapping_math, '-Werror'])
+      trapping_math = [trapping_math, '-DNPY_HAVE_CLANG_FPSTRICT']
+    else
+      # Suppress warnings about unsupported floating-point optimization.
+      trapping_math = [trapping_math, '-Wno-unsupported-floating-point-opt']
+      # Inform the user about the workaround.
+      message(
+        'NumPy is being built against a version of Clang that does not strictly enforce ' +
+        'floating-point exception handling. Workarounds will be used, which may impact performance.\n' +
+        'Consider upgrading Clang to the latest version.'
+      )
+    endif
+    add_project_arguments(trapping_math, language: ['c', 'cpp'])
+  endif
 endif
 
 subdir('meson_cpu')
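The `-DNPY_HAVE_CLANG_FPSTRICT` define added above is what the SIMD headers consult when deciding whether the partial-load guard is needed. The actual definition of `NPY_SIMD_GUARD_PARTIAL_LOAD` lives in the common SIMD headers and is not part of the hunks shown on this page; a plausible sketch of the gating logic is:

/* Hypothetical sketch of the gating logic; the real definition is in the
 * common SIMD headers, which are not included in the hunks shown here. */
#if defined(__clang__) && !defined(NPY_HAVE_CLANG_FPSTRICT)
    /* Clang builds without strict FP-exception support need the workaround. */
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 1
#else
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 0
#endif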

numpy/core/meson.build

Lines changed: 2 additions & 6 deletions
@@ -841,9 +841,7 @@ foreach gen_mtargets : [
   [
     'loops_exponent_log.dispatch.h',
     src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
-    # Enabling SIMD on clang-cl raises spurious FP exceptions
-    # TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp
-    compiler_id == 'clang-cl' ? [] : [
+    [
       AVX512_SKX, AVX512F, [AVX2, FMA3]
     ]
   ],
@@ -887,9 +885,7 @@
   [
     'loops_trigonometric.dispatch.h',
     src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
-    # Enabling SIMD on clang-cl raises spurious FP exceptions
-    # TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos
-    compiler_id == 'clang-cl' ? [] : [
+    [
       AVX512F, [AVX2, FMA3],
       VSX4, VSX3, VSX2,
       NEON_VFPV4,

numpy/core/src/common/simd/avx2/memory.h

Lines changed: 76 additions & 21 deletions
@@ -196,7 +196,12 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
     __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
     __m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -205,7 +210,12 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
     const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
     __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
-    return _mm256_maskload_epi32((const int*)ptr, mask);
+    __m256i ret = _mm256_maskload_epi32((const int*)ptr, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 //// 64
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
@@ -216,7 +226,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
@@ -225,7 +240,12 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_maskload_epi64((const long long*)ptr, mask);
+    __m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 
 //// 64-bit nlane
@@ -241,7 +261,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
     __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -251,19 +276,29 @@ NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    npy_int64 m = -((npy_int64)(nlane > 1));
+    npy_int64 m  = -((npy_int64)(nlane > 1));
     __m256i mask = npyv_set_s64(-1, -1, m, m);
-    return _mm256_maskload_epi64((const long long*)ptr, mask);
+    __m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
                                          npy_int64 fill_lo, npy_int64 fill_hi)
 {
     const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
-    npy_int64 m = -((npy_int64)(nlane > 1));
-    __m256i mask = npyv_set_s64(-1, -1, m, m);
+    npy_int64 m     = -((npy_int64)(nlane > 1));
+    __m256i mask    = npyv_set_s64(-1, -1, m, m);
     __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
-    return _mm256_blendv_epi8(vfill, payload, mask);
+    __m256i ret =_mm256_blendv_epi8(vfill, payload, mask);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 /*********************************
  * Non-contiguous partial load
@@ -277,9 +312,14 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
     const __m256i vfill = _mm256_set1_epi32(fill);
     const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
     const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
-    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
-    __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
-    return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
+    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+    __m256i mask   = _mm256_cmpgt_epi32(vnlane, steps);
+    __m256i ret    = _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32
@@ -293,9 +333,14 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
     const __m256i vfill = npyv_setall_s64(fill);
     const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
-    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
-    __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+    __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
+    __m256i ret    = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64
@@ -313,17 +358,22 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
     );
     const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
-    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
-    __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+    __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
+    __m256i ret    = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
 
 //// 128-bit load over 64-bit stride
 NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
-                                          npy_int64 fill_lo, npy_int64 fill_hi)
+                                           npy_int64 fill_lo, npy_int64 fill_hi)
 {
     assert(nlane > 0);
     __m256i a = npyv_loadl_s64(ptr);
@@ -336,7 +386,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
     __m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
 #endif
     __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
-    return _mm256_inserti128_si256(a, b, 1);
+    __m256i ret = _mm256_inserti128_si256(a, b, 1);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m256i workaround = ret;
+    ret = _mm256_or_si256(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
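Because the guard is now applied inside `npyv_load_till_*` and friends, kernels that handle loop tails through the universal intrinsics no longer need to repeat the volatile trick themselves. The following hedged sketch shows such a tail loop; it assumes compilation inside the NumPy tree with `NPY_SIMD` enabled, and `add_tail_s32` is a made-up helper (a divide kernel would rely on the non-zero fill, for addition it is merely harmless):

#include "simd/simd.h"  // NumPy's universal intrinsics (in-tree header)

#if NPY_SIMD
// Illustrative only: process a contiguous tail with partial load/store.
static void add_tail_s32(npy_int32 *dst, const npy_int32 *src, npy_intp len)
{
    const int vstep = npyv_nlanes_s32;
    npy_intp i = 0;
    for (; i <= len - vstep; i += vstep) {
        npyv_s32 a = npyv_load_s32(src + i);
        npyv_s32 b = npyv_load_s32(dst + i);
        npyv_store_s32(dst + i, npyv_add_s32(a, b));
    }
    if (i < len) {
        npy_uintp rem = (npy_uintp)(len - i);
        // Fill value 1 is what a divide kernel would pass to keep the unused
        // lanes non-zero; the guard inside npyv_load_till_s32 keeps that fill
        // from being optimized away on affected Clang builds.
        npyv_s32 a = npyv_load_till_s32(src + i, rem, 1);
        npyv_s32 b = npyv_load_till_s32(dst + i, rem, 1);
        npyv_store_till_s32(dst + i, rem, npyv_add_s32(a, b));
    }
}
#endif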

numpy/core/src/common/simd/avx512/memory.h

Lines changed: 66 additions & 11 deletions
@@ -248,29 +248,49 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
     assert(nlane > 0);
     const __m512i vfill = _mm512_set1_epi32(fill);
     const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
-    return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
+    __m512i ret = _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
     const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
-    return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
+    __m512i ret = _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 //// 64
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
     const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
-    return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+    __m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
     const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
-    return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+    __m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 
 //// 64-bit nlane
@@ -280,7 +300,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
     assert(nlane > 0);
     const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
     const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
-    return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+    __m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -293,14 +318,24 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
     assert(nlane > 0);
     const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
     const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
-    return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+    __m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
     const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
-    return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+    __m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 /*********************************
  * Non-contiguous partial load
@@ -317,7 +352,12 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
     const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
     const __m512i vfill = _mm512_set1_epi32(fill);
     const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
-    return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
+    __m512i ret = _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32
@@ -334,7 +374,12 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
     );
     const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
-    return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+    __m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64
@@ -352,7 +397,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
     );
     const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
     const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
-    return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
+    __m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
@@ -369,7 +419,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
     );
     const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
     const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
-    return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+    __m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+#if NPY_SIMD_GUARD_PARTIAL_LOAD
+    volatile __m512i workaround = ret;
+    ret = _mm512_or_si512(workaround, ret);
+#endif
+    return ret;
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)

0 commit comments
