SIMD: add fast integer division intrinsics for all supported platforms by seiko2plus · Pull Request #18178 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

SIMD: add fast integer division intrinsics for all supported platforms #18178

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 8, 2021
Prev Previous commit
Next Next commit
SIMD, TST: add test cases for NPYV fast integer division
  • Loading branch information
seiko2plus committed Mar 8, 2021
commit 6c94b4c2c7d48acf08a0f4d2d5844f7d7f3669de
19 changes: 16 additions & 3 deletions numpy/core/src/_simd/_simd.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
* #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
Expand All @@ -27,6 +27,7 @@
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
Expand Down Expand Up @@ -354,6 +355,11 @@ SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
#endif // div_sup

// Integer division by a precomputed divisor. Only generated for the
// suffixes whose intdiv_sup entry in the repeat table is non-zero.
#if @intdiv_sup@
// divisor_<sfx>: pairs a scalar <sfx> with a v<sfx>x3 triple.
// NOTE(review): presumably "scalar in, triple out" -- confirm the argument
// order against the SIMD_IMPL_INTRIN_1 definition.
SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@)
// divc_<sfx>: pairs a v<sfx> vector and a v<sfx>x3 triple with a v<sfx>
// result, i.e. it consumes the triple produced by divisor_<sfx>.
SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
#endif // intdiv_sup

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
Expand Down Expand Up @@ -442,14 +448,15 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/


//#########################################################################
//## Attach module functions
//#########################################################################
static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
Expand All @@ -461,6 +468,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
Expand Down Expand Up @@ -568,6 +576,11 @@ SIMD_INTRIN_DEF(mul_@sfx@)
SIMD_INTRIN_DEF(div_@sfx@)
#endif // div_sup

// Method-table entries for the integer division intrinsics, guarded by the
// same intdiv_sup flag that guards their SIMD_IMPL_INTRIN_* wrappers.
#if @intdiv_sup@
SIMD_INTRIN_DEF(divisor_@sfx@)
SIMD_INTRIN_DEF(divc_@sfx@)
#endif // intdiv_sup

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
Expand Down
70 changes: 70 additions & 0 deletions numpy/core/tests/test_simd.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,76 @@ def test_arithmetic_div(self):
div = self.div(vdata_a, vdata_b)
assert div == data_div

def test_arithmetic_intdiv(self):
    """
    Test integer division intrinsics:
        npyv_divisor_##sfx
        npyv_divc_##sfx
    """
    # These intrinsics are checked for integer suffixes only.
    if self._is_fp():
        return

    def trunc_div(a, d):
        """
        Reference quotient rounded towards zero, equivalent to int(a/d),
        but computed with integers only so it stays exact above 2^53.
        """
        neg_a, neg_d = a < 0, d < 0
        if a != 0 and neg_a != neg_d:
            # Opposite signs: `//` floors, so nudge the dividend by one
            # (the booleans act as 0/1) and add one back to the quotient
            # to obtain truncation instead.
            return (a + neg_d - neg_a) // d + 1
        return a // d

    if self._is_signed():
        int_min = self._int_min()
    else:
        # Unsigned lanes use 1 here; the value is also used as a divisor
        # below, so it must be non-zero.
        int_min = 1
    int_max = self._int_max()

    # Start values handed to self._data() to build the dividend lanes.
    starts = (
        0, 1, self.nlanes, int_max - self.nlanes,
        int_min, int_min//2 + 1
    )
    divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2)

    # zip() pairs every start value with the divisor at the same index.
    for start, d in zip(starts, divisors):
        lanes = self._data(start)
        vec = self.load(lanes)
        quotient = self.divc(vec, self.divisor(d))
        assert quotient == [trunc_div(a, d) for a in lanes]

    # Everything below exercises negative operands.
    if not self._is_signed():
        return

    def safe_neg(x):
        # Negate, but back off by one when the result would exceed
        # int_max (as happens for -int_min).
        negated = -x
        return negated - 1 if negated > int_max else negated

    # test round division for signed integers
    for start, d in zip(starts, divisors):
        d_neg = safe_neg(d)
        lanes = self._data(start)
        lanes_neg = [safe_neg(a) for a in lanes]
        vec = self.load(lanes)
        vec_neg = self.load(lanes_neg)
        vdiv = self.divisor(d)
        vdiv_neg = self.divisor(d_neg)

        # round towards zero: original dividends with the negated divisor
        quotient = self.divc(vec, vdiv_neg)
        assert quotient == [trunc_div(a, d_neg) for a in lanes]
        # ... and negated dividends with the original divisor
        quotient = self.divc(vec_neg, vdiv)
        assert quotient == [trunc_div(a, d) for a in lanes_neg]

    # test truncate sign if the dividend is zero
    vzero = self.zero()
    for d in (-1, -10, -100, int_min//2, int_min):
        assert self.divc(vzero, self.divisor(d)) == vzero

    # test overflow: int_min / -1 is expected to come back as int_min
    vmin = self.setall(int_min)
    assert self.divc(vmin, self.divisor(-1)) == vmin

def test_arithmetic_reduce_sum(self):
"""
Test reduce sum intrinics:
Expand Down
0