SIMD: add fast integer division intrinsics for all supported platforms by seiko2plus · Pull Request #18178 · numpy/numpy
19 changes: 16 additions & 3 deletions numpy/core/src/_simd/_simd.dispatch.c.src
@@ -15,8 +15,8 @@
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
* #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
@@ -27,6 +27,7 @@
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -354,6 +355,11 @@ SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
#endif // div_sup

#if @intdiv_sup@
SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@)
SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
#endif // intdiv_sup
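
For context: the templating above expands each block once per suffix in #sfx#, with the new #intdiv_sup# row gating the integer-only lane types, so this pair becomes divisor_u8/divc_u8, divisor_s8/divc_s8, and so on. The intrinsics are meant to be used together: npyv_divisor_@sfx@ precomputes a (multiplier, shift, sign) triple from a scalar divisor, and npyv_divc_@sfx@ applies it lane-wise. A minimal usage sketch, assuming the existing npyv load/store helpers (the function name and loop are illustrative, not part of this diff):

#if NPY_SIMD
// divide a contiguous unsigned 32-bit array by a scalar divisor
static void divide_by_scalar_u32(npy_uint32 *dst, const npy_uint32 *src,
                                 npy_intp len, npy_uint32 d)
{
    const npyv_u32x3 divisor = npyv_divisor_u32(d); // precompute once per divisor
    npy_intp i = 0;
    for (; i + npyv_nlanes_u32 <= len; i += npyv_nlanes_u32) {
        npyv_u32 a = npyv_load_u32(src + i);
        npyv_store_u32(dst + i, npyv_divc_u32(a, divisor));
    }
    for (; i < len; ++i) {
        dst[i] = src[i] / d; // scalar tail
    }
}
#endif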

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
@@ -442,14 +448,15 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/


//#########################################################################
//## Attach module functions
//#########################################################################
static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -461,6 +468,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -568,6 +576,11 @@ SIMD_INTRIN_DEF(mul_@sfx@)
SIMD_INTRIN_DEF(div_@sfx@)
#endif // div_sup

#if @intdiv_sup@
SIMD_INTRIN_DEF(divisor_@sfx@)
SIMD_INTRIN_DEF(divc_@sfx@)
#endif // intdiv_sup

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
160 changes: 158 additions & 2 deletions numpy/core/src/common/simd/avx2/arithmetic.h
@@ -66,6 +66,164 @@
// saturated
// TODO: after implementing Packs intrins

/***************************
* Integer Division
***************************/
// See simd/intdiv.h for more clarification
// divide each unsigned 8-bit element by a precomputed divisor
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
{
const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
// high part of unsigned multiplication
__m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]);
__m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
__m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi8(a, mulhi);
q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b);
q = _mm256_add_epi8(mulhi, q);
q = _mm256_and_si256(_mm256_srl_epi16(q, shf2), shf2b);
return q;
}
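The floor(a/d) recurrence above is the Granlund-Montgomery "round-up" method: with l = ceil(log2(d)), m = floor(2^8 * (2^l - d) / d) + 1, sh1 = min(l, 1) and sh2 = max(l - 1, 0), splitting the shift in two keeps every intermediate inside 8 bits. A scalar sketch of one lane with hand-derived constants for d = 7 (the real precomputation is npyv_divisor_u8 in simd/intdiv.h):

#include <stdint.h>
// scalar model of one unsigned 8-bit lane, d = 7:
// l = 3, m = floor(256 * (8 - 7) / 7) + 1 = 37, sh1 = 1, sh2 = 2
static uint8_t divc_u8_by7_scalar(uint8_t a)
{
    uint8_t mulhi = (uint8_t)((a * 37u) >> 8);         // high byte of a*m
    uint8_t q = (uint8_t)(mulhi + ((a - mulhi) >> 1)); // (a - mulhi) >> sh1, add back
    return (uint8_t)(q >> 2);                          // >> sh2
}
// divc_u8_by7_scalar(100) == 14 and divc_u8_by7_scalar(255) == 36, i.e. a / 7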
// divide each signed 8-bit element by a precomputed divisor (round towards zero)
// forward declaration: npyv_divc_s8 below reuses the 16-bit path
NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor);
NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
{
const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
// instead of _mm256_cvtepi8_epi16/_mm256_packs_epi16 to wrap around overflow
__m256i divc_even = npyv_divc_s16(_mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8), divisor);
__m256i divc_odd = npyv_divc_s16(_mm256_srai_epi16(a, 8), divisor);
divc_odd = _mm256_slli_epi16(divc_odd, 8);
return _mm256_blendv_epi8(divc_odd, divc_even, bmask);
}
// divide each unsigned 16-bit element by a precomputed divisor
NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
// high part of unsigned multiplication
__m256i mulhi = _mm256_mulhi_epu16(a, divisor.val[0]);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi16(a, mulhi);
q = _mm256_srl_epi16(q, shf1);
q = _mm256_add_epi16(mulhi, q);
q = _mm256_srl_epi16(q, shf2);
return q;
}
// divide each signed 16-bit element by a precomputed divisor (round towards zero)
NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
// high part of signed multiplication
__m256i mulhi = _mm256_mulhi_epi16(a, divisor.val[0]);
// q = ((a + mulhi) >> sh1) - XSIGN(a)
// trunc(a/d) = (q ^ dsign) - dsign
__m256i q = _mm256_sra_epi16(_mm256_add_epi16(a, mulhi), shf1);
q = _mm256_sub_epi16(q, _mm256_srai_epi16(a, 15));
q = _mm256_sub_epi16(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
return q;
}
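The signed path needs only one shift: with sh = ceil(log2(|d|)) - 1 and m = floor(2^(16+sh) / |d|) + 1 taken modulo 2^16, q = ((a + mulhi) >> sh) - XSIGN(a) truncates towards zero for positive d, and the final xor/subtract with the divisor's sign mask flips the result for negative d. A scalar sketch of one lane with hand-derived constants for d = 7 (npyv_divisor_s16 in simd/intdiv.h computes them; assumes two's complement and arithmetic right shift, as on all targets numpy supports):

#include <stdint.h>
// scalar model of one signed 16-bit lane, d = 7:
// sh = ceil(log2(7)) - 1 = 2, m = 2^18/7 + 1 = 37450, i.e. -28086 as int16
static int16_t divc_s16_by7_scalar(int16_t a)
{
    const int16_t m = -28086;
    int16_t mulhi = (int16_t)(((int32_t)a * m) >> 16); // signed high half of a*m
    int16_t q = (int16_t)((a + mulhi) >> 2);           // (a + mulhi) >> sh
    q = (int16_t)(q - (a >> 15));                      // - XSIGN(a): +1 when a < 0
    return q; // d > 0 here, so the final dsign xor/subtract is a no-op
}
// divc_s16_by7_scalar(100) == 14, divc_s16_by7_scalar(-100) == -14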
// divide each unsigned 32-bit element by a precomputed divisor
NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
// high part of unsigned multiplication
__m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epu32(a, divisor.val[0]), 32);
__m256i mulhi_odd = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), divisor.val[0]);
__m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi32(a, mulhi);
q = _mm256_srl_epi32(q, shf1);
q = _mm256_add_epi32(mulhi, q);
q = _mm256_srl_epi32(q, shf2);
return q;
}
// divide each signed 32-bit element by a precomputed divisor (round towards zero)
NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
// high part of signed multiplication
__m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epi32(a, divisor.val[0]), 32);
__m256i mulhi_odd = _mm256_mul_epi32(_mm256_srli_epi64(a, 32), divisor.val[0]);
__m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
// q = ((a + mulhi) >> sh1) - XSIGN(a)
// trunc(a/d) = (q ^ dsign) - dsign
__m256i q = _mm256_sra_epi32(_mm256_add_epi32(a, mulhi), shf1);
q = _mm256_sub_epi32(q, _mm256_srai_epi32(a, 31));
q = _mm256_sub_epi32(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
return q;
}
// returns the high 64 bits of unsigned 64-bit multiplication
// xref https://stackoverflow.com/a/28827013
NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b)
{
__m256i lomask = npyv_setall_s64(0xffffffff);
__m256i a_hi = _mm256_srli_epi64(a, 32); // a0l, a0h, a1l, a1h
__m256i b_hi = _mm256_srli_epi64(b, 32); // b0l, b0h, b1l, b1h
// compute partial products
__m256i w0 = _mm256_mul_epu32(a, b); // a0l*b0l, a1l*b1l
__m256i w1 = _mm256_mul_epu32(a, b_hi); // a0l*b0h, a1l*b1h
__m256i w2 = _mm256_mul_epu32(a_hi, b); // a0h*b0l, a1h*b1l
__m256i w3 = _mm256_mul_epu32(a_hi, b_hi); // a0h*b0h, a1h*b1h
// sum partial products
__m256i w0h = _mm256_srli_epi64(w0, 32);
__m256i s1 = _mm256_add_epi64(w1, w0h);
__m256i s1l = _mm256_and_si256(s1, lomask);
__m256i s1h = _mm256_srli_epi64(s1, 32);

__m256i s2 = _mm256_add_epi64(w2, s1l);
__m256i s2h = _mm256_srli_epi64(s2, 32);

__m256i hi = _mm256_add_epi64(w3, s1h);
hi = _mm256_add_epi64(hi, s2h);
return hi;
}
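No carry handling beyond the 32-bit shifts is needed because each partial sum stays below 2^64. A scalar rendering of the same dataflow, one lane at a time; on compilers with a 128-bit type it matches (uint64_t)(((unsigned __int128)a * b) >> 64):

#include <stdint.h>
// scalar model of npyv__mullhi_u64: 64x64 -> high 64 bits via 32-bit halves
static uint64_t mullhi_u64_scalar(uint64_t a, uint64_t b)
{
    uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
    uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
    uint64_t w0 = a_lo * b_lo;                      // low  x low
    uint64_t s1 = a_lo * b_hi + (w0 >> 32);         // w1 + carry out of w0
    uint64_t s2 = a_hi * b_lo + (s1 & 0xffffffffu); // w2 + low half of s1
    return a_hi * b_hi + (s1 >> 32) + (s2 >> 32);   // w3 + both carries
}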
// divide each unsigned 64-bit element by a precomputed divisor
NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
// high part of unsigned multiplication
__m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi64(a, mulhi);
q = _mm256_srl_epi64(q, shf1);
q = _mm256_add_epi64(mulhi, q);
q = _mm256_srl_epi64(q, shf2);
return q;
}
// divide each signed 64-bit element by a precomputed divisor (round towards zero)
NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
// high part of unsigned multiplication
__m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
// convert unsigned to signed high multiplication
// mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0);
__m256i asign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
__m256i msign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), divisor.val[0]);
__m256i m_asign = _mm256_and_si256(divisor.val[0], asign);
__m256i a_msign = _mm256_and_si256(a, msign);
mulhi = _mm256_sub_epi64(mulhi, m_asign);
mulhi = _mm256_sub_epi64(mulhi, a_msign);
// q = (a + mulhi) >> sh
__m256i q = _mm256_add_epi64(a, mulhi);
// emulate arithmetic right shift
const __m256i sigb = npyv_setall_s64(1LL << 63);
q = _mm256_srl_epi64(_mm256_add_epi64(q, sigb), shf1);
q = _mm256_sub_epi64(q, _mm256_srl_epi64(sigb, shf1));
// q = q - XSIGN(a)
// trunc(a/d) = (q ^ dsign) - dsign
q = _mm256_sub_epi64(q, asign);
q = _mm256_sub_epi64(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
return q;
}
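AVX2 has no 64-bit arithmetic right shift (_mm256_srai_epi64 is AVX-512), hence the bias trick above: adding 2^63 maps signed values to offset binary, the logical shift then lands 2^63 >> s too high, and subtracting that shifted bias yields the arithmetic result. A scalar check of the identity (a sketch; assumes two's complement):

#include <stdint.h>
// emulate x >> s (arithmetic) with only logical shifts, as the code above does
static int64_t sra64_emulated(int64_t x, unsigned s) // s in [0, 63]
{
    const uint64_t bias = 1ULL << 63;
    return (int64_t)((((uint64_t)x + bias) >> s) - (bias >> s));
}
// sra64_emulated(-8, 2) == -2, matching -8 >> 2 on two's-complement targets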
/***************************
* Division
***************************/
@@ -176,5 +334,3 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
}

#endif // _NPY_SIMD_AVX2_ARITHMETIC_H

