MAINT: Refactor partial load Workaround for Clang by charris · Pull Request #24648 · numpy/numpy
Merged · 1 commit · Sep 5, 2023
32 changes: 27 additions & 5 deletions meson.build
@@ -55,11 +55,33 @@ add_project_arguments(
#
# Clang defaults to a non-strict floating-point error model, but we need strict
# behavior. `-ftrapping-math` is equivalent to `-ffp-exception-behavior=strict`.
# Note that this is only supported on macOS arm64 as of Xcode 14.3
if cc.get_id() == 'clang'
add_project_arguments(
cc.get_supported_arguments('-ftrapping-math'), language: ['c', 'cpp'],
)
# This flag is also required to prevent the activation of SIMD partial load workarounds.
# For further clarification, refer to gh-24461.
cc_id = cc.get_id()
if cc_id.startswith('clang')
# Determine the compiler flags for trapping math exceptions.
trapping_math = {
'clang-cl': '/clang:-ftrapping-math',
}.get(cc_id, '-ftrapping-math')
# Check if the compiler supports the trapping math flag.
if cc.has_argument(trapping_math)
# TODO: Consider upgrading the vendored Meson to 1.3.0 to support the parameter `werror`
# Detect whether the compiler actually supports strict handling of floating-point exceptions
# by treating warnings as errors.
if cc.compiles('int main() { return 0; }', args: [trapping_math, '-Werror'])
trapping_math = [trapping_math, '-DNPY_HAVE_CLANG_FPSTRICT']
else
# Suppress warnings about unsupported floating-point optimization.
trapping_math = [trapping_math, '-Wno-unsupported-floating-point-opt']
# Inform the user about the workaround.
message(
'NumPy is being built against a version of Clang that does not strictly enforce ' +
'floating-point exception handling. Workarounds will be used, which may impact performance.\n' +
'Consider upgrading Clang to the latest version.'
)
endif
add_project_arguments(trapping_math, language: ['c', 'cpp'])
endif
endif

subdir('meson_cpu')
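Side note for reviewers: the `-DNPY_HAVE_CLANG_FPSTRICT` define added above is the signal the SIMD layer uses to decide whether the partial-load guards are needed. Below is a minimal sketch of how a guard macro could be derived from it; the macro names come from this PR, but the exact header, condition, and spelling shown here are assumptions for illustration, not a copy of the NumPy sources.

/* Hedged sketch (not copied from the NumPy sources): enable the partial-load
 * guard only for Clang builds that could not be given strict FP-exception
 * semantics via -ftrapping-math / /clang:-ftrapping-math. */
#if defined(__clang__) && !defined(NPY_HAVE_CLANG_FPSTRICT)
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 1
#else
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 0
#endif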
8 changes: 2 additions & 6 deletions numpy/core/meson.build
@@ -838,9 +838,7 @@ foreach gen_mtargets : [
[
'loops_exponent_log.dispatch.h',
src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
# Enabling SIMD on clang-cl raises spurious FP exceptions
# TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp
compiler_id == 'clang-cl' ? [] : [
[
AVX512_SKX, AVX512F, [AVX2, FMA3]
]
],
@@ -884,9 +882,7 @@ foreach gen_mtargets : [
[
'loops_trigonometric.dispatch.h',
src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
# Enabling SIMD on clang-cl raises spurious FP exceptions
# TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos
compiler_id == 'clang-cl' ? [] : [
[
AVX512F, [AVX2, FMA3],
VSX4, VSX3, VSX2,
NEON_VFPV4,
97 changes: 76 additions & 21 deletions numpy/core/src/common/simd/avx2/memory.h
@@ -196,7 +196,12 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
__m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -205,7 +210,12 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
return _mm256_maskload_epi32((const int*)ptr, mask);
__m256i ret = _mm256_maskload_epi32((const int*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
@@ -216,7 +226,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
@@ -225,7 +240,12 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_maskload_epi64((const long long*)ptr, mask);
__m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}

//// 64-bit nlane
@@ -241,7 +261,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -251,19 +276,29 @@ NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
return _mm256_maskload_epi64((const long long*)ptr, mask);
__m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
npy_int64 fill_lo, npy_int64 fill_hi)
{
const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
/*********************************
* Non-contiguous partial load
@@ -277,9 +312,14 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i vfill = _mm256_set1_epi32(fill);
const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
__m256i ret = _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32
@@ -293,9 +333,14 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i vfill = npyv_setall_s64(fill);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
__m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -313,17 +358,22 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
__m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }

//// 128-bit load over 64-bit stride
NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
npy_int64 fill_lo, npy_int64 fill_hi)
{
assert(nlane > 0);
__m256i a = npyv_loadl_s64(ptr);
Expand All @@ -336,7 +386,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
__m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
#endif
__m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
return _mm256_inserti128_si256(a, b, 1);
__m256i ret = _mm256_inserti128_si256(a, b, 1);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
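Editorial note on the repeated volatile/OR pattern above: per gh-24461 and the meson.build comment earlier in this PR, Clang's default non-strict FP model can optimize these masked loads and gathers in ways that touch lanes beyond nlane and raise spurious floating-point exceptions. Copying the result into a volatile temporary and OR-ing it back forces the compiler to treat the masked result as opaque, which blocks that transformation. The following is a self-contained sketch of the same idiom, assuming an AVX2 target; it is illustrative and not part of the PR.

#include <immintrin.h>
#include <stddef.h>

/* Load the first n (1..8) 32-bit ints with a mask, then round-trip the
 * result through a volatile temporary so the compiler cannot widen or
 * fold the masked load. Compile with AVX2 enabled (e.g. -mavx2). */
static inline __m256i load_first_n_epi32(const int *ptr, size_t n)
{
    const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i vn   = _mm256_set1_epi32(n > 8 ? 8 : (int)n);
    __m256i mask = _mm256_cmpgt_epi32(vn, steps);       // lane i active if i < n
    __m256i ret  = _mm256_maskload_epi32(ptr, mask);
    volatile __m256i guard = ret;        /* opaque copy the optimizer must keep */
    ret = _mm256_or_si256(guard, ret);   /* reading it back pins the masked result */
    return ret;
}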
77 changes: 66 additions & 11 deletions numpy/core/src/common/simd/avx512/memory.h
@@ -248,29 +248,49 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi32(fill);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}

//// 64-bit nlane
@@ -280,7 +300,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
assert(nlane > 0);
const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -293,14 +318,24 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
assert(nlane > 0);
const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
/*********************************
* Non-contiguous partial load
@@ -317,7 +352,12 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
const __m512i vfill = _mm512_set1_epi32(fill);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
__m512i ret = _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32
@@ -334,7 +374,12 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -352,7 +397,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
);
const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
@@ -369,7 +419,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
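For context on how these helpers are consumed: the partial-load/store family lets SIMD loops process a tail shorter than the vector width without reading or writing past the end of a buffer. The sketch below is a hedged usage example, not code from this PR; it assumes it would be compiled inside NumPy where the npyv_* universal intrinsics (npyv_nlanes_s32, npyv_load_tillz_s32, npyv_store_till_s32, npyv_add_s32, npyv_setall_s32) are available, and the helper name add_one_tail is hypothetical.

/* Hedged example: add 1 to the trailing (len % npyv_nlanes_s32) elements
 * using the partial load/store helpers, so no lane past `len` is touched. */
static void add_one_tail(npy_int32 *dst, npy_uintp len)
{
    const npy_uintp vstep = npyv_nlanes_s32;
    npy_uintp i = len - (len % vstep);               /* first element of the tail */
    if (i < len) {
        npy_uintp nlane = len - i;                   /* 0 < nlane < vstep */
        npyv_s32 v = npyv_load_tillz_s32(dst + i, nlane);  /* rest lanes = 0 */
        v = npyv_add_s32(v, npyv_setall_s32(1));
        npyv_store_till_s32(dst + i, nlane, v);      /* writes only nlane lanes */
    }
}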