From 6f205497e01bc68113fc4b5bb7589e56de8d0620 Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Wed, 19 May 2021 19:24:05 +0200
Subject: [PATCH 1/2] TST, SIMD: Improve test cases of integer division

---
 numpy/core/tests/test_simd.py  |  11 +--
 numpy/core/tests/test_umath.py | 127 ++++++++++++++++++++++++++-------
 2 files changed, 107 insertions(+), 31 deletions(-)

diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 3be28c3bb1dd..ea5bbe103900 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -1,6 +1,7 @@
 # NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
 # may be involved in their functionality.
 import pytest, math, re
+import itertools
 from numpy.core._simd import targets
 from numpy.core._multiarray_umath import __cpu_baseline__
 
@@ -820,8 +821,10 @@ def test_arithmetic_intdiv(self):
         def trunc_div(a, d):
             """
             Divide towards zero works with large integers > 2^53,
-            equivalent to int(a/d)
+            and wrap around overflow similar to what C does.
             """
+            if d == -1 and a == int_min:
+                return a
             sign_a, sign_d = a < 0, d < 0
             if a == 0 or sign_a == sign_d:
                 return a // d
@@ -833,9 +836,9 @@ def trunc_div(a, d):
             0, 1, self.nlanes, int_max-self.nlanes,
             int_min, int_min//2 + 1
         )
-        divisors = (1, 2, self.nlanes, int_min, int_max, int_max//2)
+        divisors = (1, 2, 9, 13, self.nlanes, int_min, int_max, int_max//2)
 
-        for x, d in zip(rdata, divisors):
+        for x, d in itertools.product(rdata, divisors):
             data = self._data(x)
             vdata = self.load(data)
             data_divc = [trunc_div(a, d) for a in data]
@@ -848,7 +851,7 @@ def trunc_div(a, d):
 
         safe_neg = lambda x: -x-1 if -x > int_max else -x
         # test round divison for signed integers
-        for x, d in zip(rdata, divisors):
+        for x, d in itertools.product(rdata, divisors):
             d_neg = safe_neg(d)
             data = self._data(x)
             data_neg = [safe_neg(a) for a in data]
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 345aeb6c022a..9d1b13b53a86 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -5,6 +5,7 @@
 import pytest
 import sys
 from fractions import Fraction
+from functools import reduce
 
 import numpy.core.umath as ncu
 from numpy.core import _umath_tests as ncu_tests
@@ -249,43 +250,115 @@ def test_division_int(self):
         assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
         assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])
 
-    @pytest.mark.parametrize("input_dtype",
-            np.sctypes['int'] + np.sctypes['uint'])
-    def test_division_int_boundary(self, input_dtype):
-        iinfo = np.iinfo(input_dtype)
-
-        # Unsigned:
-        # Create list with 0, 25th, 50th, 75th percentile and max
-        if iinfo.min == 0:
-            lst = [0, iinfo.max//4, iinfo.max//2,
-                    int(iinfo.max/1.33), iinfo.max]
-            divisors = [iinfo.max//4, iinfo.max//2,
-                    int(iinfo.max/1.33), iinfo.max]
-        # Signed:
-        # Create list with min, 25th percentile, 0, 75th percentile, max
-        else:
-            lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
-            divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
-        a = np.array(lst, dtype=input_dtype)
+    @pytest.mark.parametrize("dtype,ex_val", itertools.product(
+        np.sctypes['int'] + np.sctypes['uint'], (
+            (
+                # dividend
+                "np.arange(fo.max-lsize, fo.max, dtype=dtype),"
+                # divisors
+                "np.arange(lsize, dtype=dtype),"
+                # scalar divisors
+                "range(15)"
+            ),
+            (
+                # dividend
+                "np.arange(fo.min, fo.min+lsize, dtype=dtype),"
+                # divisors
+                "np.arange(lsize//-2, lsize//2, dtype=dtype),"
+                # scalar divisors
+                "range(fo.min, fo.min + 15)"
+            ), (
+                # dividend
+                "np.arange(fo.max-lsize, fo.max, dtype=dtype),"
+                # divisors
+                "np.arange(lsize, dtype=dtype),"
+                # scalar divisors
+                "[1,3,9,13,neg, fo.min+1, fo.min//2, fo.max//3, fo.max//4]"
+            )
+        )
+    ))
+    def test_division_int_boundary(self, dtype, ex_val):
+        fo = np.iinfo(dtype)
+        neg = -1 if fo.min < 0 else 1
+        # Large enough to test SIMD loops and remaind elements
+        lsize = 512 + 7
+        a, b, divisors = eval(ex_val)
+        a_lst, b_lst = a.tolist(), b.tolist()
+
+        c_div = lambda n, d: (
+            0 if d == 0 or (n and n == fo.min and d == -1) else n//d
+        )
+        with np.errstate(divide='ignore'):
+            ac = a.copy()
+            ac //= b
+            div_ab = a // b
+        div_lst = [c_div(x, y) for x, y in zip(a_lst, b_lst)]
+
+        msg = "Integer arrays floor division check (//)"
+        assert all(div_ab == div_lst), msg
+        msg_eq = "Integer arrays floor division check (//=)"
+        assert all(ac == div_lst), msg_eq
 
         for divisor in divisors:
-            div_a = a // divisor
-            b = a.copy(); b //= divisor
-            div_lst = [i // divisor for i in lst]
+            ac = a.copy()
+            with np.errstate(divide='ignore'):
+                div_a = a // divisor
+                ac //= divisor
+            div_lst = [c_div(i, divisor) for i in a_lst]
 
-            msg = "Integer arrays floor division check (//)"
             assert all(div_a == div_lst), msg
-
-            msg = "Integer arrays floor division check (//=)"
-            assert all(div_a == b), msg
+            assert all(ac == div_lst), msg_eq
 
         with np.errstate(divide='raise'):
+            if 0 in b or (fo.min and -1 in b and fo.min in a):
+                # Verify overflow case
+                with pytest.raises(FloatingPointError):
+                    a // b
+            else:
+                a // b
+            if fo.min and fo.min in a:
+                with pytest.raises(FloatingPointError):
+                    a // -1
+            elif fo.min:
+                a // -1
             with pytest.raises(FloatingPointError):
                 a // 0
             with pytest.raises(FloatingPointError):
-                a //= 0
+                ac = a.copy()
+                ac //= 0
+
+            np.array([], dtype=dtype) // 0
+
+    @pytest.mark.parametrize("dtype,ex_val", itertools.product(
+        np.sctypes['int'] + np.sctypes['uint'], (
+            "np.array([fo.max, 1, 2, 1, 1, 2, 3], dtype=dtype)",
+            "np.array([fo.min, 1, -2, 1, 1, 2, -3], dtype=dtype)",
+            "np.arange(fo.min, fo.min+(100*10), 10, dtype=dtype)",
+            "np.arange(fo.max-(100*7), fo.max, 7, dtype=dtype)",
+        )
+    ))
+    def test_division_int_reduce(self, dtype, ex_val):
+        fo = np.iinfo(dtype)
+        a = eval(ex_val)
+        lst = a.tolist()
+        c_div = lambda n, d: (
+            0 if d == 0 or (n and n == fo.min and d == -1) else n//d
+        )
+
+        with np.errstate(divide='ignore'):
+            div_a = np.floor_divide.reduce(a)
+        div_lst = reduce(c_div, lst)
+        msg = "Reduce floor integer division check"
+        assert div_a == div_lst, msg
 
-            np.array([], dtype=input_dtype) // 0
+        with np.errstate(divide='raise'):
+            with pytest.raises(FloatingPointError):
+                np.floor_divide.reduce(np.arange(-100, 100, dtype=dtype))
+            if fo.min:
+                with pytest.raises(FloatingPointError):
+                    np.floor_divide.reduce(
+                        np.array([fo.min, 1, -1], dtype=dtype)
+                    )
 
     @pytest.mark.parametrize(
             "dividend,divisor,quotient",

From 519ab995e59b33b68ec28ac0c635158f3acc5447 Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Wed, 19 May 2021 19:25:12 +0200
Subject: [PATCH 2/2] BUG, SIMD: Fix unexpected result of uint8 division on X86

  The bug can occur in special cases e.g. when the divisor is scalar and equal
  to 9 or 13 and the dividend is array contains consecutive duplicate values of 233.
---
 numpy/core/src/common/simd/avx2/arithmetic.h  |  8 ++++----
 .../core/src/common/simd/avx512/arithmetic.h  | 19 ++++++++++---------
 numpy/core/src/common/simd/intdiv.h           |  4 +++-
 numpy/core/src/common/simd/sse/arithmetic.h   |  8 ++++----
 4 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index b1e297988d51..e1b170863a34 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -73,16 +73,16 @@
 // divide each unsigned 8-bit element by a precomputed divisor
 NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
 {
-    const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
+    const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
     const __m128i shf1  = _mm256_castsi256_si128(divisor.val[1]);
     const __m128i shf2  = _mm256_castsi256_si128(divisor.val[2]);
     const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
     const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
     // high part of unsigned multiplication
-    __m256i mulhi_odd   = _mm256_mulhi_epu16(a, divisor.val[0]);
-    __m256i mulhi_even  = _mm256_mulhi_epu16(_mm256_slli_epi16(a, 8), divisor.val[0]);
+    __m256i mulhi_even  = _mm256_mullo_epi16(_mm256_and_si256(a, bmask), divisor.val[0]);
             mulhi_even  = _mm256_srli_epi16(mulhi_even, 8);
-    __m256i mulhi       = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
+    __m256i mulhi_odd   = _mm256_mullo_epi16(_mm256_srli_epi16(a, 8), divisor.val[0]);
+    __m256i mulhi       = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
     // floor(a/d)       = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m256i q           = _mm256_sub_epi8(a, mulhi);
             q           = _mm256_and_si256(_mm256_srl_epi16(q, shf1), shf1b);
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 8a2790e93b17..f8632e701790 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -116,12 +116,13 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
     const __m128i shf1  = _mm512_castsi512_si128(divisor.val[1]);
     const __m128i shf2  = _mm512_castsi512_si128(divisor.val[2]);
 #ifdef NPY_HAVE_AVX512BW
+    const __m512i bmask = _mm512_set1_epi32(0x00FF00FF);
     const __m512i shf1b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
     const __m512i shf2b = _mm512_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
     // high part of unsigned multiplication
-    __m512i mulhi_odd   = _mm512_mulhi_epu16(a, divisor.val[0]);
-    __m512i mulhi_even  = _mm512_mulhi_epu16(_mm512_slli_epi16(a, 8), divisor.val[0]);
+    __m512i mulhi_even  = _mm512_mullo_epi16(_mm512_and_si512(a, bmask), divisor.val[0]);
             mulhi_even  = _mm512_srli_epi16(mulhi_even, 8);
+    __m512i mulhi_odd   = _mm512_mullo_epi16(_mm512_srli_epi16(a, 8), divisor.val[0]);
     __m512i mulhi       = _mm512_mask_mov_epi8(mulhi_even, 0xAAAAAAAAAAAAAAAA, mulhi_odd);
     // floor(a/d)       = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m512i q           = _mm512_sub_epi8(a, mulhi);
@@ -130,7 +131,7 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
             q           = _mm512_and_si512(_mm512_srl_epi16(q, shf2), shf2b);
     return  q;
 #else
-    const __m256i bmask = _mm256_set1_epi32(0xFF00FF00);
+    const __m256i bmask = _mm256_set1_epi32(0x00FF00FF);
     const __m256i shf1b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf1));
     const __m256i shf2b = _mm256_set1_epi8(0xFFU >> _mm_cvtsi128_si32(shf2));
     const __m512i shf2bw= npyv512_combine_si256(shf2b, shf2b);
@@ -138,10 +139,10 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
     //// lower 256-bit
     __m256i lo_a        = npyv512_lower_si256(a);
     // high part of unsigned multiplication
-    __m256i mulhi_odd   = _mm256_mulhi_epu16(lo_a, mulc);
-    __m256i mulhi_even  = _mm256_mulhi_epu16(_mm256_slli_epi16(lo_a, 8), mulc);
+    __m256i mulhi_even  = _mm256_mullo_epi16(_mm256_and_si256(lo_a, bmask), mulc);
             mulhi_even  = _mm256_srli_epi16(mulhi_even, 8);
-    __m256i mulhi       = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
+    __m256i mulhi_odd   = _mm256_mullo_epi16(_mm256_srli_epi16(lo_a, 8), mulc);
+    __m256i mulhi       = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
     // floor(a/d)       = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m256i lo_q        = _mm256_sub_epi8(lo_a, mulhi);
             lo_q        = _mm256_and_si256(_mm256_srl_epi16(lo_q, shf1), shf1b);
@@ -151,10 +152,10 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
     //// higher 256-bit
     __m256i hi_a        = npyv512_higher_si256(a);
     // high part of unsigned multiplication
-            mulhi_odd   = _mm256_mulhi_epu16(hi_a, mulc);
-            mulhi_even  = _mm256_mulhi_epu16(_mm256_slli_epi16(hi_a, 8), mulc);
+            mulhi_even  = _mm256_mullo_epi16(_mm256_and_si256(hi_a, bmask), mulc);
             mulhi_even  = _mm256_srli_epi16(mulhi_even, 8);
-            mulhi       = _mm256_blendv_epi8(mulhi_even, mulhi_odd, bmask);
+            mulhi_odd   = _mm256_mullo_epi16(_mm256_srli_epi16(hi_a, 8), mulc);
+            mulhi       = _mm256_blendv_epi8(mulhi_odd, mulhi_even, bmask);
     // floor(a/d)       = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m256i hi_q        = _mm256_sub_epi8(hi_a, mulhi);
             hi_q        = _mm256_and_si256(_mm256_srl_epi16(hi_q, shf1), shf1b);
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
index a6c293d87f64..1ce3b4df834d 100644
--- a/numpy/core/src/common/simd/intdiv.h
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -204,14 +204,16 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
         sh1 = 1;  sh2 = l - 1;                     // shift counts
     }
     npyv_u8x3 divisor;
-    divisor.val[0] = npyv_setall_u8(m);
 #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+    divisor.val[0] = npyv_setall_u16(m);
     divisor.val[1] = npyv_set_u8(sh1);
     divisor.val[2] = npyv_set_u8(sh2);
 #elif defined(NPY_HAVE_VSX2)
+    divisor.val[0] = npyv_setall_u8(m);
     divisor.val[1] = npyv_setall_u8(sh1);
     divisor.val[2] = npyv_setall_u8(sh2);
 #elif defined(NPY_HAVE_NEON)
+    divisor.val[0] = npyv_setall_u8(m);
     divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
     divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
 #else
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 1b02a4107b45..bced35108116 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -92,14 +92,14 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
 // divide each unsigned 8-bit element by a precomputed divisor
 NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
 {
-    const __m128i bmask = _mm_set1_epi32(0xFF00FF00);
+    const __m128i bmask = _mm_set1_epi32(0x00FF00FF);
     const __m128i shf1b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[1]));
     const __m128i shf2b = _mm_set1_epi8(0xFFU >> _mm_cvtsi128_si32(divisor.val[2]));
     // high part of unsigned multiplication
-    __m128i mulhi_odd   = _mm_mulhi_epu16(a, divisor.val[0]);
-    __m128i mulhi_even  = _mm_mulhi_epu16(_mm_slli_epi16(a, 8), divisor.val[0]);
+    __m128i mulhi_even  = _mm_mullo_epi16(_mm_and_si128(a, bmask), divisor.val[0]);
+    __m128i mulhi_odd   = _mm_mullo_epi16(_mm_srli_epi16(a, 8), divisor.val[0]);
             mulhi_even  = _mm_srli_epi16(mulhi_even, 8);
-    __m128i mulhi       = npyv_select_u8(bmask, mulhi_odd, mulhi_even);
+    __m128i mulhi       = npyv_select_u8(bmask, mulhi_even, mulhi_odd);
     // floor(a/d)       = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     __m128i q           = _mm_sub_epi8(a, mulhi);
             q           = _mm_and_si128(_mm_srl_epi16(q, divisor.val[1]), shf1b);