MAINT, SIMD: Pass divisor by reference in npyv_divc_* by ganesh-k13 · Pull Request #19114 · numpy/numpy

MAINT, SIMD: Pass divisor by reference in npyv_divc_* #19114

Closed · wants to merge 8 commits
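
The npyv_divc_* universal intrinsics divide each lane by a divisor that was precomputed into a three-vector struct (the multiplier plus two shift counts, see simd/intdiv.h). Previously every npyv_divc_* signature took that struct by value, which can copy three vector registers per call wherever the helper is not inlined; this PR switches the signatures, and the _simd testing module that wraps them, to take a const pointer instead. A minimal sketch of the change and a hypothetical caller (npyv_divisor_u32, npyv_load_u32, npyv_store_u32 and npyv_nlanes_u32 are existing NPYV intrinsics; the loop itself is illustrative, not code from this PR):

/* old: npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor);  */
/* new: npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 *divisor); */

void divide_by_7(npy_uint32 *dst, const npy_uint32 *src, npy_intp len)
{
    // precompute the multiplier and shifts once per divisor
    const npyv_u32x3 divisor = npyv_divisor_u32(7);
    // assumes len is a multiple of npyv_nlanes_u32, for brevity
    for (npy_intp i = 0; i < len; i += npyv_nlanes_u32) {
        npyv_u32 a = npyv_load_u32(src + i);
        npyv_store_u32(dst + i, npyv_divc_u32(a, &divisor));
    }
}
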
2 changes: 1 addition & 1 deletion numpy/core/src/_simd/_simd.dispatch.c.src
@@ -357,7 +357,7 @@ SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
 
 #if @intdiv_sup@
 SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@)
-SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
+SIMD_IMPL_INTRIN_2_REFERNCE2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
 #endif // intdiv_sup
 
 #if @fused_sup@
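
For a concrete suffix such as @sfx@ = u32, numpy's .c.src template expansion turns the changed line above into the following instantiation (hand-expanded here for illustration):

SIMD_IMPL_INTRIN_1(divisor_u32, vu32x3, u32)
SIMD_IMPL_INTRIN_2_REFERNCE2(divc_u32, vu32, vu32, vu32x3)
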
22 changes: 22 additions & 0 deletions numpy/core/src/_simd/_simd_easyintrin.inc
@@ -73,6 +73,28 @@
         return simd_arg_to_obj(&ret); \
     }
 
+#define SIMD_IMPL_INTRIN_2_REFERNCE2(NAME, RET, IN0, IN1) \
+    static PyObject *simd__intrin_##NAME                  \
+    (PyObject* NPY_UNUSED(self), PyObject *args)          \
+    {                                                     \
+        simd_arg arg1 = {.dtype = simd_data_##IN0};       \
+        simd_arg arg2 = {.dtype = simd_data_##IN1};       \
+        if (!PyArg_ParseTuple(                            \
+            args, "O&O&:"NPY_TOSTRING(NAME),              \
+            simd_arg_converter, &arg1,                    \
+            simd_arg_converter, &arg2                     \
+        )) return NULL;                                   \
+        simd_data data = {.RET = npyv_##NAME(             \
+            arg1.data.IN0, &arg2.data.IN1                 \
+        )};                                               \
+        simd_arg_free(&arg1);                             \
+        simd_arg_free(&arg2);                             \
+        simd_arg ret = {                                  \
+            .data = data, .dtype = simd_data_##RET        \
+        };                                                \
+        return simd_arg_to_obj(&ret);                     \
+    }
+
 #define SIMD__REPEAT_2IMM(C, NAME, IN0) \
     C == arg2.data.u8 ? NPY_CAT(npyv_, NAME)(arg1.data.IN0, C) :
 
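
Hand-expanding that instantiation through the new macro shows the only behavioral difference from SIMD_IMPL_INTRIN_2: the & that passes the second operand's address. A sketch of the preprocessor output for SIMD_IMPL_INTRIN_2_REFERNCE2(divc_u32, vu32, vu32, vu32x3), lightly reformatted:

static PyObject *simd__intrin_divc_u32(PyObject* NPY_UNUSED(self), PyObject *args)
{
    simd_arg arg1 = {.dtype = simd_data_vu32};
    simd_arg arg2 = {.dtype = simd_data_vu32x3};
    if (!PyArg_ParseTuple(args, "O&O&:divc_u32",
                          simd_arg_converter, &arg1,
                          simd_arg_converter, &arg2)) {
        return NULL;
    }
    // arg2 is handed to the intrinsic by address, matching the new signatures
    simd_data data = {.vu32 = npyv_divc_u32(arg1.data.vu32, &arg2.data.vu32x3)};
    simd_arg_free(&arg1);
    simd_arg_free(&arg2);
    simd_arg ret = {.data = data, .dtype = simd_data_vu32};
    return simd_arg_to_obj(&ret);
}
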
70 changes: 35 additions & 35 deletions numpy/core/src/common/simd/avx2/arithmetic.h
@@ -71,17 +71,17 @@
  ***************************/
 // See simd/intdiv.h for more clarification
 // divide each unsigned 8-bit element by a precomputed divisor
-NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
+NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 *divisor)
 {
     const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
-    const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
-    const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+    const __m128i shf1 = _mm256_castsi256_si128(divisor->val[1]);
+    const __m128i shf2 = _mm256_castsi256_si128(divisor->val[2]);
     const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
     const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
     // high part of unsigned multiplication
-    __m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(a, bmask), divisor.val[0]);
+    __m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(a, bmask), divisor->val[0]);
     mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
-    __m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(a, 8), divisor.val[0]);
+    __m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(a, 8), divisor->val[0]);
     __m256i mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
     // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m256i q = _mm256_sub_epi8(a, mulhi);
@@ -91,8 +91,8 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
     return q;
 }
 // divide each signed 8-bit element by a precomputed divisor (round towards zero)
-NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor);
-NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 *divisor);
+NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 *divisor)
 {
     const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
     // instead of _mm256_cvtepi8_epi16/_mm256_packs_epi16 to wrap around overflow
@@ -102,12 +102,12 @@ NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
     return _mm256_blendv_epi8(divc_odd, divc_even, bmask);
 }
 // divide each unsigned 16-bit element by a precomputed divisor
-NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
+NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 *divisor)
 {
-    const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
-    const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+    const __m128i shf1 = _mm256_castsi256_si128(divisor->val[1]);
+    const __m128i shf2 = _mm256_castsi256_si128(divisor->val[2]);
     // high part of unsigned multiplication
-    __m256i mulhi = _mm256_mulhi_epu16(a, divisor.val[0]);
+    __m256i mulhi = _mm256_mulhi_epu16(a, divisor->val[0]);
     // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m256i q = _mm256_sub_epi16(a, mulhi);
     q = _mm256_srl_epi16(q, shf1);
@@ -116,26 +116,26 @@ NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
     return q;
 }
 // divide each signed 16-bit element by a precomputed divisor (round towards zero)
-NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
+NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 *divisor)
 {
-    const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+    const __m128i shf1 = _mm256_castsi256_si128(divisor->val[1]);
     // high part of signed multiplication
-    __m256i mulhi = _mm256_mulhi_epi16(a, divisor.val[0]);
+    __m256i mulhi = _mm256_mulhi_epi16(a, divisor->val[0]);
     // q = ((a + mulhi) >> sh1) - XSIGN(a)
     // trunc(a/d) = (q ^ dsign) - dsign
     __m256i q = _mm256_sra_epi16(_mm256_add_epi16(a, mulhi), shf1);
     q = _mm256_sub_epi16(q, _mm256_srai_epi16(a, 15));
-    q = _mm256_sub_epi16(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
+    q = _mm256_sub_epi16(_mm256_xor_si256(q, divisor->val[2]), divisor->val[2]);
     return q;
 }
 // divide each unsigned 32-bit element by a precomputed divisor
-NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
+NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 *divisor)
 {
-    const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
-    const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+    const __m128i shf1 = _mm256_castsi256_si128(divisor->val[1]);
+    const __m128i shf2 = _mm256_castsi256_si128(divisor->val[2]);
     // high part of unsigned multiplication
-    __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epu32(a, divisor.val[0]), 32);
-    __m256i mulhi_odd = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), divisor.val[0]);
+    __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epu32(a, divisor->val[0]), 32);
+    __m256i mulhi_odd = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), divisor->val[0]);
     __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
     // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m256i q = _mm256_sub_epi32(a, mulhi);
@@ -145,18 +145,18 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
     return q;
 }
 // divide each signed 32-bit element by a precomputed divisor (round towards zero)
-NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
+NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 *divisor)
 {
-    const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+    const __m128i shf1 = _mm256_castsi256_si128(divisor->val[1]);
     // high part of signed multiplication
-    __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epi32(a, divisor.val[0]), 32);
-    __m256i mulhi_odd = _mm256_mul_epi32(_mm256_srli_epi64(a, 32), divisor.val[0]);
+    __m256i mulhi_even = _mm256_srli_epi64(_mm256_mul_epi32(a, divisor->val[0]), 32);
+    __m256i mulhi_odd = _mm256_mul_epi32(_mm256_srli_epi64(a, 32), divisor->val[0]);
     __m256i mulhi = _mm256_blend_epi32(mulhi_even, mulhi_odd, 0xAA);
     // q = ((a + mulhi) >> sh1) - XSIGN(a)
     // trunc(a/d) = (q ^ dsign) - dsign
     __m256i q = _mm256_sra_epi32(_mm256_add_epi32(a, mulhi), shf1);
     q = _mm256_sub_epi32(q, _mm256_srai_epi32(a, 31));
-    q = _mm256_sub_epi32(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
+    q = _mm256_sub_epi32(_mm256_xor_si256(q, divisor->val[2]), divisor->val[2]);
     return q;
 }
 // returns the high 64 bits of unsigned 64-bit multiplication
@@ -185,12 +185,12 @@ NPY_FINLINE npyv_u64 npyv__mullhi_u64(npyv_u64 a, npyv_u64 b)
     return hi;
 }
 // divide each unsigned 64-bit element by a divisor
-NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
+NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 *divisor)
 {
-    const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
-    const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
+    const __m128i shf1 = _mm256_castsi256_si128(divisor->val[1]);
+    const __m128i shf2 = _mm256_castsi256_si128(divisor->val[2]);
     // high part of unsigned multiplication
-    __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+    __m256i mulhi = npyv__mullhi_u64(a, divisor->val[0]);
     // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m256i q = _mm256_sub_epi64(a, mulhi);
     q = _mm256_srl_epi64(q, shf1);
@@ -199,16 +199,16 @@ NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
     return q;
 }
 // divide each signed 64-bit element by a divisor (round towards zero)
-NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
+NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 *divisor)
 {
-    const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
+    const __m128i shf1 = _mm256_castsi256_si128(divisor->val[1]);
     // high part of unsigned multiplication
-    __m256i mulhi = npyv__mullhi_u64(a, divisor.val[0]);
+    __m256i mulhi = npyv__mullhi_u64(a, divisor->val[0]);
     // convert unsigned to signed high multiplication
     // mulhi - ((a < 0) ? m : 0) - ((m < 0) ? a : 0);
     __m256i asign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
-    __m256i msign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), divisor.val[0]);
-    __m256i m_asign = _mm256_and_si256(divisor.val[0], asign);
+    __m256i msign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), divisor->val[0]);
+    __m256i m_asign = _mm256_and_si256(divisor->val[0], asign);
     __m256i a_msign = _mm256_and_si256(a, msign);
     mulhi = _mm256_sub_epi64(mulhi, m_asign);
     mulhi = _mm256_sub_epi64(mulhi, a_msign);
@@ -221,7 +221,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     // q = q - XSIGN(a)
     // trunc(a/d) = (q ^ dsign) - dsign
     q = _mm256_sub_epi64(q, asign);
-    q = _mm256_sub_epi64(_mm256_xor_si256(q, divisor.val[2]), divisor.val[2]);
+    q = _mm256_sub_epi64(_mm256_xor_si256(q, divisor->val[2]), divisor->val[2]);
     return q;
 }
 /***************************
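
All of the kernels above vectorize the same scalar identity quoted in the comments: with a multiplier m and shift counts sh1/sh2 precomputed per divisor (numpy derives them in simd/intdiv.h), floor(a/d) == (mulhi + ((a - mulhi) >> sh1)) >> sh2, where mulhi is the high half of the product a*m. Below is a self-contained scalar sketch for the unsigned 8-bit case; the parameter derivation follows the Hacker's Delight / libdivide recipe and is an assumption about what intdiv.h computes, not code from this PR:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    for (uint32_t d = 1; d < 256; ++d) {
        uint32_t l = 0;
        while ((1u << l) < d) {
            ++l;                                        // l = ceil(log2(d))
        }
        uint32_t m   = ((1u << (8 + l)) / d) - 256 + 1; // round-up "magic" multiplier
        uint32_t sh1 = l < 1 ? l : 1;                   // sh1 = min(l, 1)
        uint32_t sh2 = l > 1 ? l - 1 : 0;               // sh2 = max(l - 1, 0)
        for (uint32_t a = 0; a < 256; ++a) {
            uint32_t mulhi = (a * m) >> 8;              // high 8 bits of the product
            uint32_t q = (mulhi + ((a - mulhi) >> sh1)) >> sh2;
            if (q != a / d) {
                printf("mismatch: %u/%u -> %u, expected %u\n", a, d, q, a / d);
                return 1;
            }
        }
    }
    puts("floor(a/d) verified for every 8-bit dividend and divisor");
    return 0;
}

The signed variants reuse the same multiply-and-shift core and then repair the sign with the (q ^ dsign) - dsign step visible in npyv_divc_s16/s32/s64 above.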