SIMD: add fast integer division intrinsics for all supported platforms by seiko2plus · Pull Request #18178 · numpy/numpy · GitHub
SIMD: add fast integer division intrinsics for all supported platforms #18178


Merged · 8 commits · Mar 8, 2021
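
In rough terms, the new API splits a vector division into two universal intrinsics: npyv_divisor_<sfx> precomputes a multiplier plus shift counts (packed into a three-vector tuple) once per scalar divisor, and npyv_divc_<sfx> then divides a whole vector by that precomputed divisor using only multiplies and shifts. The following is a minimal usage sketch, not code from this PR: the helper name divide_all_u32, the include path, and the loop structure are illustrative assumptions; the npyv_* names come from the diff below and from NumPy's existing universal-intrinsic layer.

#include "simd/simd.h" // NumPy's universal SIMD layer (path as used inside numpy/core sources)

// Illustrative helper (not part of the PR): divide every element by the same scalar divisor.
static void divide_all_u32(npy_uint32 *dst, const npy_uint32 *src,
                           npy_intp len, npy_uint32 d)
{
#if NPY_SIMD
    // precompute multiplier + shift counts once for the invariant divisor
    const npyv_u32x3 divisor = npyv_divisor_u32(d);
    npy_intp i = 0;
    for (; i + npyv_nlanes_u32 <= len; i += npyv_nlanes_u32) {
        npyv_u32 a = npyv_load_u32(src + i);
        // fast division by the precomputed divisor (multiply-high + shifts)
        npyv_store_u32(dst + i, npyv_divc_u32(a, divisor));
    }
    for (; i < len; ++i)   // scalar tail
        dst[i] = src[i] / d;
#else
    for (npy_intp i = 0; i < len; ++i)
        dst[i] = src[i] / d;
#endif
}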
19 changes: 16 additions & 3 deletions numpy/core/src/_simd/_simd.dispatch.c.src
@@ -15,8 +15,8 @@
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
* #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
@@ -27,6 +27,7 @@
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -354,6 +355,11 @@ SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
#endif // div_sup

#if @intdiv_sup@
SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@)
SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
#endif // intdiv_sup

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
@@ -442,14 +448,15 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/


//#########################################################################
//## Attach module functions
//#########################################################################
static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -461,6 +468,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -568,6 +576,11 @@ SIMD_INTRIN_DEF(mul_@sfx@)
SIMD_INTRIN_DEF(div_@sfx@)
#endif // div_sup

#if @intdiv_sup@
SIMD_INTRIN_DEF(divisor_@sfx@)
SIMD_INTRIN_DEF(divc_@sfx@)
#endif // intdiv_sup

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
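Before moving on to the AVX2 implementation, a note for readers unfamiliar with the .c.src templating: the rows added above (#intdiv_sup= 1, 1, ...) are substituted per suffix, so the guard survives only for the integer lanes. A hand-reconstructed sketch of what one iteration of the repeat block expands to for sfx = u32 (not copied from the generated file):

// expansion for sfx = u32, where @intdiv_sup@ -> 1
#if 1 // intdiv_sup
SIMD_IMPL_INTRIN_1(divisor_u32, vu32x3, u32)     // exposes npyv_divisor_u32 to the Python _simd module used by the SIMD tests
SIMD_IMPL_INTRIN_2(divc_u32, vu32, vu32, vu32x3) // exposes npyv_divc_u32
#endif // intdiv_sup
// for f32/f64 the row reads 0, so the whole block compiles away.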
160 changes: 158 additions & 2 deletions numpy/core/src/common/simd/avx2/arithmetic.h
@@ -66,6 +66,164 @@
// saturated
// TODO: after implment Packs intrins

/***************************
* Integer Division
***************************/
// See simd/intdiv.h for more clarification
// divide each unsigned 8-bit element by a precomputed divisor
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
{
const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
// high part of unsigned multiplication
__m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]);
__m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
__m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi8(a, mulhi);
q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b);
q = _mm256_add_epi8(mulhi, q);
q = _mm256_and_si256(_mm256_srl_epi16(q, shf2), shf2b);
return q;
}
// divide each signed 8-bit element by a precomputed divisor (round towards zero)
NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor);
NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
{
const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
// instead of _mm256_cvtepi8_epi16/_mm256_packs_epi16 to wrap around overflow
__m256i divc_even = npyv_divc_s16(_mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8), divisor);
__m256i divc_odd = npyv_divc_s16(_mm256_srai_epi16(a, 8), divisor);
divc_odd = _mm256_slli_epi16(divc_odd, 8);
return _mm256_blendv_epi8(divc_odd, divc_even, bmask);
}
// divide each unsigned 16-bit element by a precomputed divisor
NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
// high part of unsigned multiplication
__m256i mulhi = _mm256_mulhi_epu16(a, divisor.val[0]);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi16(a, mulhi);
q = _mm256_srl_epi16(q, shf1);
q = _mm256_add_epi16(mulhi, q);
q = _mm256_srl_epi16(q, shf2);
return q;
}
// divide each signed 16-bit element by a precomputed divisor (round towards zero)
NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
// high part of signed multiplication
__m256i mulhi = _mm256_mulhi_epi16(a, divisor.val[0]);
// q = ((a + mulhi) >> sh1) - XSIGN(a)
// trunc(a/d) = (q ^ dsign) - dsign
__m256i q = _mm256_sra_epi16(_mm256_add_epi16(a, mulhi), shf1);
q = _mm256_sub_epi16(q, _mm256_srai_epi16(a, 15));
q = _mm256_sub_epi16(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
return q;
}
// divide each unsigned 32-bit element by a precomputed divisor
NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
// high part of unsigned multiplication
__m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epu32(a, divisor.val[0]), 32);
__m256i mulhi_odd = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), divisor.val[0]);
__m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi32(a, mulhi);
q = _mm256_srl_epi32(q, shf1);
q = _mm256_add_epi32(mulhi, q);
q = _mm256_srl_epi32(q, shf2);
return q;
}
// divide each signed 32-bit element by a precomputed divisor (round towards zero)
NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
// high part of signed multiplication
__m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epi32(a, divisor.val[0]), 32);
__m256i mulhi_odd = _mm256_mul_epi32(_mm256_srli_epi64(a, 32), divisor.val[0]);
__m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
// q = ((a + mulhi) >> sh1) - XSIGN(a)
// trunc(a/d) = (q ^ dsign) - dsign
__m256i q = _mm256_sra_epi32(_mm256_add_epi32(a, mulhi), shf1);
q = _mm256_sub_epi32(q, _mm256_srai_epi32(a, 31));
q = _mm256_sub_epi32(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
return q;
}
// returns the high 64 bits of unsigned 64-bit multiplication
// xref https://stackoverflow.com/a/28827013
NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b)
{
__m256i lomask = npyv_setall_s64(0xffffffff);
__m256i a_hi = _mm256_srli_epi64(a, 32); // a0l, a0h, a1l, a1h
__m256i b_hi = _mm256_srli_epi64(b, 32); // b0l, b0h, b1l, b1h
// compute partial products
__m256i w0 = _mm256_mul_epu32(a, b); // a0l*b0l, a1l*b1l
__m256i w1 = _mm256_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h
__m256i w2 = _mm256_mul_epu32(a_hi, b); // a0h*b0l, a1h*b1l
__m256i w3 = _mm256_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h
// sum partial products
__m256i w0h = _mm256_srli_epi64(w0, 32);
__m256i s1 = _mm256_add_epi64(w1, w0h);
__m256i s1l = _mm256_and_si256(s1, lomask);
__m256i s1h = _mm256_srli_epi64(s1, 32);

__m256i s2 = _mm256_add_epi64(w2, s1l);
__m256i s2h = _mm256_srli_epi64(s2, 32);

__m256i hi = _mm256_add_epi64(w3, s1h);
hi = _mm256_add_epi64(hi, s2h);
return hi;
}
// divide each unsigned 64-bit element by a divisor
NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
// high part of unsigned multiplication
__m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi64(a, mulhi);
q = _mm256_srl_epi64(q, shf1);
q = _mm256_add_epi64(mulhi, q);
q = _mm256_srl_epi64(q, shf2);
return q;
}
// divide each signed 64-bit element by a precomputed divisor (round towards zero)
NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
// high part of unsigned multiplication
__m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
// convert unsigned to signed high multiplication
// mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0);
__m256i asign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
__m256i msign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), divisor.val[0]);
__m256i m_asign = _mm256_and_si256(divisor.val[0], asign);
__m256i a_msign = _mm256_and_si256(a, msign);
mulhi = _mm256_sub_epi64(mulhi, m_asign);
mulhi = _mm256_sub_epi64(mulhi, a_msign);
// q = (a + mulhi) >> sh
__m256i q = _mm256_add_epi64(a, mulhi);
// emulate arithmetic right shift
const __m256i sigb = npyv_setall_s64(1LL << 63);
q = _mm256_srl_epi64(_mm256_add_epi64(q, sigb), shf1);
q = _mm256_sub_epi64(q, _mm256_srl_epi64(sigb, shf1));
// q = q - XSIGN(a)
// trunc(a/d) = (q ^ dsign) - dsign
q = _mm256_sub_epi64(q, asign);
q = _mm256_sub_epi64(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
return q;
}
/***************************
* Division
***************************/
@@ -176,5 +334,3 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
}

#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
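
For reference, the floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 comments above are the classic round-up multiply-and-shift scheme for division by an invariant unsigned divisor; the npyv_divisor_* precomputation itself lives in simd/intdiv.h, as noted in the first hunk. Below is a scalar sketch of the same arithmetic for 32-bit lanes with a brute-force self-check. Everything in it (the struct, the helper names, the d >= 2 restriction) is an illustrative assumption, not NumPy code:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Scalar model of the unsigned multiply-and-shift division used above.
// Assumes 2 <= d; d == 1 and other edge cases are handled separately by the real npyv_divisor_*.
typedef struct { uint32_t m, sh1, sh2; } divisor_u32_t;   // hypothetical, for illustration only

static divisor_u32_t precompute_divisor_u32(uint32_t d)
{
    uint32_t l = 0;                                   // l = ceil(log2(d))
    while (((uint64_t)1 << l) < d)
        ++l;
    uint64_t l2 = (uint64_t)1 << l;                   // 2^l
    divisor_u32_t div;
    div.m   = (uint32_t)((((l2 - d) << 32) / d) + 1); // magic multiplier
    div.sh1 = 1;                                      // first shift count
    div.sh2 = l - 1;                                  // second shift count
    return div;
}

static uint32_t divc_u32_scalar(uint32_t a, divisor_u32_t div)
{
    uint32_t mulhi = (uint32_t)(((uint64_t)a * div.m) >> 32); // high half of a*m
    // floor(a/d) = (mulhi + ((a - mulhi) >> sh1)) >> sh2
    return (mulhi + ((a - mulhi) >> div.sh1)) >> div.sh2;
}

int main(void)
{
    for (uint32_t d = 2; d < 1000; ++d) {
        divisor_u32_t div = precompute_divisor_u32(d);
        for (uint32_t a = 0; a < 100000; a += 7)
            assert(divc_u32_scalar(a, div) == a / d);
    }
    puts("scalar model matches a / d");
    return 0;
}

The signed lanes above reuse the same multiply-high core but round toward zero: q = ((a + mulhi) >> sh1) - XSIGN(a), then (q ^ dsign) - dsign flips the sign when the divisor is negative, and the 64-bit path additionally emulates the missing arithmetic right shift by adding the sign bit before a logical shift and subtracting the shifted sign bit afterwards.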

