From 6f205497e01bc68113fc4b5bb7589e56de8d0620 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Wed, 19 May 2021 19:24:05 +0200 Subject: [PATCH 1/2] TST, SIMD: Improve test cases of integer division --- numpy/core/tests/test_simd.py | 11 +-- numpy/core/tests/test_umath.py | 127 ++++++++++++++++++++++++++------- 2 files changed, 107 insertions(+), 31 deletions(-) diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 3be28c3bb1dd..ea5bbe103900 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -1,6 +1,7 @@ # NOTE: Please avoid the use of numpy.testing since NPYV intrinsics # may be involved in their functionality. import pytest, math, re +import itertools from numpy.core._simd import targets from numpy.core._multiarray_umath import __cpu_baseline__ @@ -820,8 +821,10 @@ def test_arithmetic_intdiv(self): def trunc_div(a, d): """ Divide towards zero works with large integers > 2^53, - equivalent to int(a/d) + and wrap around overflow similar to what C does. """ + if d == -1 and a == int_min: + return a sign_a, sign_d = a < 0, d < 0 if a == 0 or sign_a == sign_d: return a // d @@ -833,9 +836,9 @@ def trunc_div(a, d): 0, 1, self.nlanes, int_max-self.nlanes, int_min, int_min//2 + 1 ) - divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2) + divisors = (1, 2, 9, 13, self.nlanes, int_min, int_max, int_max//2) - for x, d in zip(rdata, divisors): + for x, d in itertools.product(rdata, divisors): data = self._data(x) vdata = self.load(data) data_divc = [trunc_div(a, d) for a in data] @@ -848,7 +851,7 @@ def trunc_div(a, d): safe_neg = lambda x: -x-1 if -x > int_max else -x # test round divison for signed integers - for x, d in zip(rdata, divisors): + for x, d in itertools.product(rdata, divisors): d_neg = safe_neg(d) data = self._data(x) data_neg = [safe_neg(a) for a in data] diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 345aeb6c022a..9d1b13b53a86 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -5,6 +5,7 @@ import pytest import sys from fractions import Fraction +from functools import reduce import numpy.core.umath as ncu from numpy.core import _umath_tests as ncu_tests @@ -249,43 +250,115 @@ def test_division_int(self): assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2]) assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80]) - @pytest.mark.parametrize("input_dtype", - np.sctypes['int'] + np.sctypes['uint']) - def test_division_int_boundary(self, input_dtype): - iinfo = np.iinfo(input_dtype) - - # Unsigned: - # Create list with 0, 25th, 50th, 75th percentile and max - if iinfo.min == 0: - lst = [0, iinfo.max//4, iinfo.max//2, - int(iinfo.max/1.33), iinfo.max] - divisors = [iinfo.max//4, iinfo.max//2, - int(iinfo.max/1.33), iinfo.max] - # Signed: - # Create list with min, 25th percentile, 0, 75th percentile, max - else: - lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max] - divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max] - a = np.array(lst, dtype=input_dtype) + @pytest.mark.parametrize("dtype,ex_val", itertools.product( + np.sctypes['int'] + np.sctypes['uint'], ( + ( + # dividend + "np.arange(fo.max-lsize, fo.max, dtype=dtype)," + # divisors + "np.arange(lsize, dtype=dtype)," + # scalar divisors + "range(15)" + ), + ( + # dividend + "np.arange(fo.min, fo.min+lsize, dtype=dtype)," + # divisors + "np.arange(lsize//-2, lsize//2, dtype=dtype)," + # scalar divisors + "range(fo.min, fo.min + 15)" + ), ( + # dividend + "np.arange(fo.max-lsize, fo.max, dtype=dtype)," + # divisors + "np.arange(lsize, dtype=dtype)," + # scalar divisors + "[1,3,9,13,neg, fo.min+1, fo.min//2, fo.max//3, fo.max//4]" + ) + ) + )) + def test_division_int_boundary(self, dtype, ex_val): + fo = np.iinfo(dtype) + neg = -1 if fo.min < 0 else 1 + # Large enough to test SIMD loops and remaind elements + lsize = 512 + 7 + a, b, divisors = eval(ex_val) + a_lst, b_lst = a.tolist(), b.tolist() + + c_div = lambda n, d: ( + 0 if d == 0 or (n and n == fo.min and d == -1) else n//d + ) + with np.errstate(divide='ignore'): + ac = a.copy() + ac //= b + div_ab = a // b + div_lst = [c_div(x, y) for x, y in zip(a_lst, b_lst)] + + msg = "Integer arrays floor division check (//)" + assert all(div_ab == div_lst), msg + msg_eq = "Integer arrays floor division check (//=)" + assert all(ac == div_lst), msg_eq for divisor in divisors: - div_a = a // divisor - b = a.copy(); b //= divisor - div_lst = [i // divisor for i in lst] + ac = a.copy() + with np.errstate(divide='ignore'): + div_a = a // divisor + ac //= divisor + div_lst = [c_div(i, divisor) for i in a_lst] - msg = "Integer arrays floor division check (//)" assert all(div_a == div_lst), msg - - msg = "Integer arrays floor division check (//=)" - assert all(div_a == b), msg + assert all(ac == div_lst), msg_eq with np.errstate(divide='raise'): + if 0 in b or (fo.min and -1 in b and fo.min in a): + # Verify overflow case + with pytest.raises(FloatingPointError): + a // b + else: + a // b + if fo.min and fo.min in a: + with pytest.raises(FloatingPointError): + a // -1 + elif fo.min: + a // -1 with pytest.raises(FloatingPointError): a // 0 with pytest.raises(FloatingPointError): - a //= 0 + ac = a.copy() + ac //= 0 + + np.array([], dtype=dtype) // 0 + + @pytest.mark.parametrize("dtype,ex_val", itertools.product( + np.sctypes['int'] + np.sctypes['uint'], ( + "np.array([fo.max, 1, 2, 1, 1, 2, 3], dtype=dtype)", + "np.array([fo.min, 1, -2, 1, 1, 2, -3], dtype=dtype)", + "np.arange(fo.min, fo.min+(100*10), 10, dtype=dtype)", + "np.arange(fo.max-(100*7), fo.max, 7, dtype=dtype)", + ) + )) + def test_division_int_reduce(self, dtype, ex_val): + fo = np.iinfo(dtype) + a = eval(ex_val) + lst = a.tolist() + c_div = lambda n, d: ( + 0 if d == 0 or (n and n == fo.min and d == -1) else n//d + ) + + with np.errstate(divide='ignore'): + div_a = np.floor_divide.reduce(a) + div_lst = reduce(c_div, lst) + msg = "Reduce floor integer division check" + assert div_a == div_lst, msg - np.array([], dtype=input_dtype) // 0 + with np.errstate(divide='raise'): + with pytest.raises(FloatingPointError): + np.floor_divide.reduce(np.arange(-100, 100, dtype=dtype)) + if fo.min: + with pytest.raises(FloatingPointError): + np.floor_divide.reduce( + np.array([fo.min, 1, -1], dtype=dtype) + ) @pytest.mark.parametrize( "dividend,divisor,quotient", From 519ab995e59b33b68ec28ac0c635158f3acc5447 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Wed, 19 May 2021 19:25:12 +0200 Subject: [PATCH 2/2] BUG, SIMD: Fix unexpected result of uint8 division on X86 The bug can occur in special cases e.g. when the divisor is scalar and equal to 9 or 13 and the dividend is array contains consecutive duplicate values of 233. --- numpy/core/src/common/simd/avx2/arithmetic.h | 8 ++++---- .../core/src/common/simd/avx512/arithmetic.h | 19 ++++++++++--------- numpy/core/src/common/simd/intdiv.h | 4 +++- numpy/core/src/common/simd/sse/arithmetic.h | 8 ++++---- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index b1e297988d51..e1b170863a34 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -73,16 +73,16 @@ // divide each unsigned 8-bit element by a precomputed divisor NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) { - const __m256i bmask = _mm256_set1_epi32(0xFF00FF00); + const __m256i bmask = _mm256_set1_epi32(0x00FF00FF); const __m128i shf1 = _mm256_castsi256_si128(divisor.val[1]); const __m128i shf2 = _mm256_castsi256_si128(divisor.val[2]); const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); // high part of unsigned multiplication - __m256i mulhi_odd = _mm256_mulhi_epu16(a, divisor.val[0]); - __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]); + __m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(a, bmask), divisor.val[0]); mulhi_even = _mm256_srli_epi16(mulhi_even, 8); - __m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask); + __m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(a, 8), divisor.val[0]); + __m256i mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask); // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 __m256i q = _mm256_sub_epi8(a, mulhi); q = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b); diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 8a2790e93b17..f8632e701790 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -116,12 +116,13 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) const __m128i shf1 = _mm512_castsi512_si128(divisor.val[1]); const __m128i shf2 = _mm512_castsi512_si128(divisor.val[2]); #ifdef NPY_HAVE_AVX512BW + const __m512i bmask = _mm512_set1_epi32(0x00FF00FF); const __m512i shf1b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); const __m512i shf2b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); // high part of unsigned multiplication - __m512i mulhi_odd = _mm512_mulhi_epu16(a, divisor.val[0]); - __m512i mulhi_even = _mm512_mulhi_epu16(_mm512_slli_epi16(a, 8), divisor.val[0]); + __m512i mulhi_even = _mm512_mullo_epi16(_mm512_and_si512(a, bmask), divisor.val[0]); mulhi_even = _mm512_srli_epi16(mulhi_even, 8); + __m512i mulhi_odd = _mm512_mullo_epi16(_mm512_srli_epi16(a, 8), divisor.val[0]); __m512i mulhi = _mm512_mask_mov_epi8(mulhi_even, 0xAAAAAAAAAAAAAAAA, mulhi_odd); // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 __m512i q = _mm512_sub_epi8(a, mulhi); @@ -130,7 +131,7 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) q = _mm512_and_si512(_mm512_srl_epi16(q, shf2), shf2b); return q; #else - const __m256i bmask = _mm256_set1_epi32(0xFF00FF00); + const __m256i bmask = _mm256_set1_epi32(0x00FF00FF); const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1)); const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2)); const __m512i shf2bw= npyv512_combine_si256(shf2b, shf2b); @@ -138,10 +139,10 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) //// lower 256-bit __m256i lo_a = npyv512_lower_si256(a); // high part of unsigned multiplication - __m256i mulhi_odd = _mm256_mulhi_epu16(lo_a, mulc); - __m256i mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(lo_a, 8), mulc); + __m256i mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(lo_a, bmask), mulc); mulhi_even = _mm256_srli_epi16(mulhi_even, 8); - __m256i mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask); + __m256i mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(lo_a, 8), mulc); + __m256i mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask); // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 __m256i lo_q = _mm256_sub_epi8(lo_a, mulhi); lo_q = _mm256_and_si256(_mm256_srl_epi16(lo_q, shf1), shf1b); @@ -151,10 +152,10 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) //// higher 256-bit __m256i hi_a = npyv512_higher_si256(a); // high part of unsigned multiplication - mulhi_odd = _mm256_mulhi_epu16(hi_a, mulc); - mulhi_even = _mm256_mulhi_epu16(_mm256_slli_epi16(hi_a, 8), mulc); + mulhi_even = _mm256_mullo_epi16(_mm256_and_si256(hi_a, bmask), mulc); mulhi_even = _mm256_srli_epi16(mulhi_even, 8); - mulhi = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask); + mulhi_odd = _mm256_mullo_epi16(_mm256_srli_epi16(hi_a, 8), mulc); + mulhi = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask); // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 __m256i hi_q = _mm256_sub_epi8(hi_a, mulhi); hi_q = _mm256_and_si256(_mm256_srl_epi16(hi_q, shf1), shf1b); diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h index a6c293d87f64..1ce3b4df834d 100644 --- a/numpy/core/src/common/simd/intdiv.h +++ b/numpy/core/src/common/simd/intdiv.h @@ -204,14 +204,16 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) sh1 = 1; sh2 = l - 1; // shift counts } npyv_u8x3 divisor; - divisor.val[0] = npyv_setall_u8(m); #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 + divisor.val[0] = npyv_setall_u16(m); divisor.val[1] = npyv_set_u8(sh1); divisor.val[2] = npyv_set_u8(sh2); #elif defined(NPY_HAVE_VSX2) + divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_setall_u8(sh1); divisor.val[2] = npyv_setall_u8(sh2); #elif defined(NPY_HAVE_NEON) + divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1)); divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2)); #else diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index 1b02a4107b45..bced35108116 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -92,14 +92,14 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) // divide each unsigned 8-bit element by a precomputed divisor NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) { - const __m128i bmask = _mm_set1_epi32(0xFF00FF00); + const __m128i bmask = _mm_set1_epi32(0x00FF00FF); const __m128i shf1b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[1])); const __m128i shf2b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[2])); // high part of unsigned multiplication - __m128i mulhi_odd = _mm_mulhi_epu16(a, divisor.val[0]); - __m128i mulhi_even = _mm_mulhi_epu16(_mm_slli_epi16(a, 8), divisor.val[0]); + __m128i mulhi_even = _mm_mullo_epi16(_mm_and_si128(a, bmask), divisor.val[0]); + __m128i mulhi_odd = _mm_mullo_epi16(_mm_srli_epi16(a, 8), divisor.val[0]); mulhi_even = _mm_srli_epi16(mulhi_even, 8); - __m128i mulhi = npyv_select_u8(bmask, mulhi_odd, mulhi_even); + __m128i mulhi = npyv_select_u8(bmask, mulhi_even, mulhi_odd); // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 __m128i q = _mm_sub_epi8(a, mulhi); q = _mm_and_si128(_mm_srl_epi16(q, divisor.val[1]), shf1b);