8000 BUG, SIMD: Fix unexpected result of uint8 division on X86 by seiko2plus · Pull Request #19046 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

BUG, SIMD: Fix unexpected result of uint8 division on X86 #19046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions numpy/core/src/common/simd/avx2/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,16 @@
// divide each unsigned 8-bit element by a precomputed divisor
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
{
const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]);
const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]);
const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
// high part of unsigned multiplication
__m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]);
__m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]);
__m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(a, bmask), divisor.val[0]);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
__m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
__m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(a, 8), divisor.val[0]);
__m256i mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i q = _mm256_sub_epi8(a, mulhi);
q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b);
Expand Down
19 changes: 10 additions & 9 deletions numpy/core/src/common/simd/avx512/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,13 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]);
const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]);
#ifdef NPY_HAVE_AVX512BW
const __m512i bmask = _mm512_set1_epi32(0x00FF00FF);
const __m512i shf1b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m512i shf2b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
// high part of unsigned multiplication
__m512i mulhi_odd = _mm512_mulhi_epu16(a, divisor.val[0]);
__m512i mulhi_even = _mm512_mulhi_epu16(_mm512_slli_epi16(a, 8), divisor.val[0]);
__m512i mulhi_even = _mm512_mullo_epi16(_mm512_and_si512(a, bmask), divisor.val[0]);
mulhi_even = _mm512_srli_epi16(mulhi_even, 8);
__m512i mulhi_odd = _mm512_mullo_epi16(_mm512_srli_epi16(a, 8), divisor.val[0]);
__m512i mulhi = _mm512_mask_mov_epi8(mulhi_even, 0xAAAAAAAAAAAAAAAA, mulhi_odd);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m512i q = _mm512_sub_epi8(a, mulhi);
Expand All @@ -130,18 +131,18 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
q = _mm512_and_si512(_mm512_srl_epi16(q, shf2), shf2b);
return q;
#else
const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
const __m512i shf2bw= npyv512_combine_si256(shf2b, shf2b);
const __m256i mulc = npyv512_lower_si256(divisor.val[0]);
//// lower 256-bit
__m256i lo_a = npyv512_lower_si256(a);
// high part of unsigned multiplication
__m256i mulhi_odd = _mm256_mulhi_epu16(lo_a, mulc);
__m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(lo_a, 8), mulc);
__m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(lo_a, bmask), mulc);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
__m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
__m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(lo_a, 8), mulc);
__m256i mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i lo_q = _mm256_sub_epi8(lo_a, mulhi);
lo_q = _mm256_and_si256(_mm256_srl_epi16(lo_q, shf1), shf1b);
Expand All @@ -151,10 +152,10 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
//// higher 256-bit
__m256i hi_a = npyv512_higher_si256(a);
// high part of unsigned multiplication
mulhi_odd = _mm256_mulhi_epu16(hi_a, mulc);
mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(hi_a, 8), mulc);
mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(hi_a, bmask), mulc);
mulhi_even = _mm256_srli_epi16(mulhi_even, 8);
mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(hi_a, 8), mulc);
mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m256i hi_q = _mm256_sub_epi8(hi_a, mulhi);
hi_q = _mm256_and_si256(_mm256_srl_epi16(hi_q, shf1), shf1b);
Expand Down
4 changes: 3 additions & 1 deletion numpy/core/src/common/simd/intdiv.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,14 +204,16 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
sh1 = 1; sh2 = l - 1; // shift counts
}
npyv_u8x3 divisor;
divisor.val[0] = npyv_setall_u8(m);
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[0] = npyv_setall_u16(m);
divisor.val[1] = npyv_set_u8(sh1);
divisor.val[2] = npyv_set_u8(sh2);
#elif defined(NPY_HAVE_VSX2)
divisor.val[0] = npyv_setall_u8(m);
divisor.val[1] = npyv_setall_u8(sh1);
divisor.val[2] = npyv_setall_u8(sh2);
#elif defined(NPY_HAVE_NEON)
divisor.val[0] = npyv_setall_u8(m);
divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
#else
Expand Down
8 changes: 4 additions & 4 deletions numpy/core/src/common/simd/sse/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,14 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// divide each unsigned 8-bit element by a precomputed divisor
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
{
const __m128i bmask = _mm_set1_epi32(0xFF00FF00);
const __m128i bmask = _mm_set1_epi32(0x00FF00FF);
const __m128i shf1b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[1]));
const __m128i shf2b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[2]));
// high part of unsigned multiplication
__m128i mulhi_odd = _mm_mulhi_epu16(a, divisor.val[0]);
__m128i mulhi_even = _mm_mulhi_epu16(_mm_slli_epi16(a, 8), divisor.val[0]);
__m128i mulhi_even = _mm_mullo_epi16(_mm_and_si128(a, bmask), divisor.val[0]);
__m128i mulhi_odd = _mm_mullo_epi16(_mm_srli_epi16(a, 8), divisor.val[0]);
mulhi_even = _mm_srli_epi16(mulhi_even, 8);
__m128i mulhi = npyv_select_u8(bmask, mulhi_odd, mulhi_even);
__m128i mulhi = npyv_select_u8(bmask, mulhi_even, mulhi_odd);
// floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2
__m128i q = _mm_sub_epi8(a, mulhi);
q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[1]), shf1b);
Expand Down
11 changes: 7 additions & 4 deletions numpy/core/tests/test_simd.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
# may be involved in their functionality.
import pytest, math, re
import itertools
from numpy.core._simd import targets
from numpy.core._multiarray_umath import __cpu_baseline__

Expand Down Expand Up @@ -820,8 +821,10 @@ def test_arithmetic_intdiv(self):
def trunc_div(a, d):
"""
Divide towards zero works with large integers > 2^53,
equivalent to int(a/d)
and wrap around overflow similar to what C does.
"""
if d == -1 and a == int_min:
return a
sign_a, sign_d = a < 0, d < 0
if a == 0 or sign_a == sign_d:
return a // d
Expand All @@ -833,9 +836,9 @@ def trunc_div(a, d):
0, 1, self.nlanes, int_max-self.nlanes,
int_min, int_min//2 + 1
)
divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2)
divisors = (1, 2, 9, 13, self.nlanes, int_min, int_max, int_max//2)

for x, d in zip(rdata, divisors):
for x, d in itertools.product(rdata, divisors):
data = self._data(x)
vdata = self.load(data)
data_divc = [trunc_div(a, d) for a in data]
Expand All @@ -848,7 +851,7 @@ def trunc_div(a, d):

safe_neg = lambda x: -x-1 if -x > int_max else -x
# test round divison for signed integers
for x, d in zip(rdata, divisors):
for x, d in itertools.product(rdata, divisors):
d_neg = safe_neg(d)
data = self._data(x)
data_neg = [safe_neg(a) for a in data]
Expand Down
127 changes: 100 additions & 27 deletions numpy/core/tests/test_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest
import sys
from fractions import Fraction
from functools import reduce

import numpy.core.umath as ncu
from numpy.core import _umath_tests as ncu_tests
Expand Down Expand Up @@ -249,43 +250,115 @@ def test_division_int(self):
assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])

@pytest.mark.parametrize("input_dtype",
np.sctypes['int'] + np.sctypes['uint'])
def test_division_int_boundary(self, input_dtype):
iinfo = np.iinfo(input_dtype)

# Unsigned:
# Create list with 0, 25th, 50th, 75th percentile and max
if iinfo.min == 0:
lst = [0, iinfo.max//4, iinfo.max//2,
int(iinfo.max/1.33), iinfo.max]
divisors = [iinfo.max//4, iinfo.max//2,
int(iinfo.max/1.33), iinfo.max]
# Signed:
# Create list with min, 25th percentile, 0, 75th percentile, max
else:
lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
a = np.array(lst, dtype=input_dtype)
@pytest.mark.parametrize("dtype,ex_val", itertools.product(
np.sctypes['int'] + np.sctypes['uint'], (
(
# dividend
"np.arange(fo.max-lsize, fo.max, dtype=dtype),"
# divisors
"np.arange(lsize, dtype=dtype),"
# scalar divisors
"range(15)"
),
(
# dividend
"np.arange(fo.min, fo.min+lsize, dtype=dtype),"
# divisors
"np.arange(lsize//-2, lsize//2, dtype=dtype),"
# scalar divisors
"range(fo.min, fo.min + 15)"
), (
# dividend
"np.arange(fo.max-lsize, fo.max, dtype=dtype),"
# divisors
"np.arange(lsize, dtype=dtype),"
# scalar divisors
"[1,3,9,13,neg, fo.min+1, fo.min//2, fo.max//3, fo.max//4]"
)
)
))
def test_division_int_boundary(self, dtype, ex_val):
fo = np.iinfo(dtype)
neg = -1 if fo.min < 0 else 1
# Large enough to test SIMD loops and remaind elements
lsize = 512 + 7
a, b, divisors = eval(ex_val)
a_lst, b_lst = a.tolist(), b.tolist()

c_div = lambda n, d: (
0 if d == 0 or (n and n == fo.min and d == -1) else n//d
)
with np.errstate(divide='ignore'):
ac = a.copy()
ac //= b
div_ab = a // b
div_lst = [c_div(x, y) for x, y in zip(a_lst, b_lst)]

msg = "Integer arrays floor division check (//)"
assert all(div_ab == div_lst), msg
msg_eq = "Integer arrays floor division check (//=)"
assert all(ac == div_lst), msg_eq

for divisor in divisors:
div_a = a // divisor
b = a.copy(); b //= divisor
div_lst = [i // divisor for i in lst]
ac = a.copy()
with np.errstate(divide='ignore'):
div_a = a // divisor
ac //= divisor
div_lst = [c_div(i, divisor) for i in a_lst]

msg = "Integer arrays floor division check (//)"
assert all(div_a == div_lst), msg

msg = "Integer arrays floor division check (//=)"
assert all(div_a == b), msg
assert all(ac == div_lst), msg_eq

with np.errstate(divide='raise'):
if 0 in b or (fo.min and -1 in b and fo.min in a):
# Verify overflow case
with pytest.raises(FloatingPointError):
a // b
else:
a // b
if fo.min and fo.min in a:
with pytest.raises(FloatingPointError):
a // -1
elif fo.min:
a // -1
with pytest.raises(FloatingPointError):
a // 0
with pytest.raises(FloatingPointError):
a //= 0
ac = a.copy()
ac //= 0

np.array([], dtype=dtype) // 0

@pytest.mark.parametrize("dtype,ex_val", itertools.product(
np.sctypes['int'] + np.sctypes['uint'], (
"np.array([fo.max, 1, 2, 1, 1, 2, 3], dtype=dtype)",
"np.array([fo.min, 1, -2, 1, 1, 2, -3], dtype=dtype)",
"np.arange(fo.min, fo.min+(100*10), 10, dtype=dtype)",
"np.arange(fo.max-(100*7), fo.max, 7, dtype=dtype)",
)
))
def test_division_int_reduce(self, dtype, ex_val):
fo = np.iinfo(dtype)
a = eval(ex_val)
lst = a.tolist()
c_div = lambda n, d: (
0 if d == 0 or (n and n == fo.min and d == -1) else n//d
)

with np.errstate(divide='ignore'):
div_a = np.floor_divide.reduce(a)
div_lst = reduce(c_div, lst)
msg = "Reduce floor integer division check"
assert div_a == div_lst, msg

np.array([], dtype=input_dtype) // 0
with np.errstate(divide='raise'):
with pytest.raises(FloatingPointError):
np.floor_divide.reduce(np.arange(-100, 100, dtype=dtype))
if fo.min:
with pytest.raises(FloatingPointError):
np.floor_divide.reduce(
np.array([fo.min, 1, -1], dtype=dtype)
)

@pytest.mark.parametrize(
"dividend,divisor,quotient",
Expand Down
0