From 24b2a2d36a7e8356310cd16dbe60abd9d0e682dc Mon Sep 17 00:00:00 2001
From: Jaime Fernandez <jaimefrio@google.com>
Date: Mon, 28 Mar 2016 01:23:42 +0200
Subject: [PATCH 1/4] BUG: shift operator cycles, fixes #2449

---
 numpy/core/src/umath/loops.c.src      | 114 ++++++++++++++++++++++++--
 numpy/core/src/umath/scalarmath.c.src |  45 ++++++++--
 numpy/core/tests/test_scalarmath.py   |  32 ++++++++
 3 files changed, 180 insertions(+), 11 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0d9806f5d6ab..9b90b67be38c 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -772,6 +772,7 @@ BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
  * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
  *          npy_double, npy_double, npy_double, npy_double#
+ * #issigned = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
  */
 
 #define @TYPE@_floor_divide @TYPE@_divide
@@ -824,15 +825,15 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
 
 /**begin repeat1
  * Arithmetic
- * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
- *          left_shift, right_shift#
- * #OP = +, -,*, &, |, ^, <<, >>#
+ * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = +, -,*, &, |, ^#
  */
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps,
+              void *NPY_UNUSED(func))
 {
-    if(IS_BINARY_REDUCE) {
+    if (IS_BINARY_REDUCE) {
         BINARY_REDUCE_LOOP(@type@) {
             io1 @OP@= *(@type@ *)ip2;
         }
@@ -845,6 +846,109 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
 
 /**end repeat1**/
 
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ *       which is undefined in C.
+ */
+
+#define LEFT_SHIFT_OP                               \
+    do {                                            \
+        if (NPY_LIKELY(in2 < sizeof(@type@) * 8)) { \
+            *out = in1 << in2;                      \
+        }                                           \
+        else {                                      \
+            *out = 0;                               \
+        }                                           \
+    } while (0)
+
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_left_shift(char **args, npy_intp *dimensions, npy_intp *steps,
+                  void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            @type@ ip2_val = *(@type@ *)ip2;
+
+            if (NPY_LIKELY(ip2_val < sizeof(@type@) * 8)) {
+                io1 <<= ip2_val;
+            }
+            else {
+                io1 = 0;
+            }
+        }
+        *((@type@ *)iop1) = io1;
+    }
+    else {
+        BINARY_LOOP_FAST(@type@, @type@, LEFT_SHIFT_OP);
+    }
+}
+
+#undef LEFT_SHIFT_OP
+
+#define RIGHT_SHIFT_OP_SIGNED                       \
+    do {                                            \
+        if (NPY_LIKELY(in2 < sizeof(@type@) * 8)) { \
+            *out = in1 >> in2;                      \
+        }                                           \
+        else if (in1 < 0) {                         \
+            *out = -1;                              \
+        }                                           \
+        else {                                      \
+            *out = 0;                               \
+        }                                           \
+    } while (0)
+
+#define RIGHT_SHIFT_OP_UNSIGNED                     \
+    do {                                            \
+        if (NPY_LIKELY(in2 < sizeof(@type@) * 8)) { \
+            *out = in1 >> in2;                      \
+        }                                           \
+        else {                                      \
+            *out = 0;                               \
+        }                                           \
+    } while (0)
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_right_shift(char **args, npy_intp *dimensions, npy_intp *steps,
+                   void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            @type@ ip2_val = *(@type@ *)ip2;
+
+            if (NPY_LIKELY(ip2_val < sizeof(@type@) * 8)) {
+                io1 >>= ip2_val;
+            }
+#if @issigned@
+            else if (io1 < 0) {
+                io1 = -1;
+            }
+#endif
+            else {
+                io1 = 0;
+            }
+        }
+        *((@type@ *)iop1) = io1;
+    }
+    else {
+#if @issigned@
+        BINARY_LOOP_FAST(@type@, @type@, RIGHT_SHIFT_OP_SIGNED);
+#else
+        BINARY_LOOP_FAST(@type@, @type@, RIGHT_SHIFT_OP_UNSIGNED);
+#endif
+    }
+}
+
+#undef RIGHT_SHIFT_OP_SIGNED
+#undef RIGHT_SHIFT_OP_UNSIGNED
+
+
 /**begin repeat1
  * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
  *         logical_and, logical_or#
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index c651383eb17f..84dc8e0f7ebe 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -245,25 +245,58 @@ static void
 /**end repeat**/
 
 
-
-/* QUESTION:  Should we check for overflow / underflow in (l,r)shift? */
-
 /**begin repeat
  * #name = byte, ubyte, short, ushort, int, uint,
  *         long, ulong, longlong, ulonglong#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #issigned = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
  */
 
 /**begin repeat1
- * #oper = and, xor, or, lshift, rshift#
- * #op = &, ^, |, <<, >>#
+ * #oper = and, xor, or#
+ * #op = &, ^, |#
  */
 
 #define @name@_ctype_@oper@(arg1, arg2, out) *(out) = (arg1) @op@ (arg2)
 
 /**end repeat1**/
 
+#define @name@_ctype_lshift(arg1, arg2, out)           \
+    do {                                               \
+        if (NPY_LIKELY((arg2) < sizeof(@type@) * 8)) { \
+            *(out) = (arg1) << (arg2);                 \
+        }                                              \
+        else {                                         \
+            *(out) = 0;                                \
+        }                                              \
+    } while (0)
+
+#if @issigned@
+    #define @name@_ctype_rshift(arg1, arg2, out)           \
+        do {                                               \
+            if (NPY_LIKELY((arg2) < sizeof(@type@) * 8)) { \
+                *(out) = (arg1) >> (arg2);                 \
+            }                                              \
+            else if ((arg1) < 0) {                         \
+                *(out) = -1;                               \
+            }                                              \
+            else {                                         \
+                *(out) = 0;                                \
+            }                                              \
+        } while (0)
+#else
+    #define @name@_ctype_rshift(arg1, arg2, out)           \
+        do {                                               \
+            if (NPY_LIKELY((arg2) < sizeof(@type@) * 8)) { \
+                *(out) = (arg1) >> (arg2);                 \
+            }                                              \
+            else {                                         \
+                *(out) = 0;                                \
+            }                                              \
+        } while (0)
+#endif
+
 /**end repeat**/
 
 /**begin repeat
@@ -575,7 +608,7 @@ static void
  * 1) Convert the types to the common type if both are scalars (0 return)
  * 2) If both are not scalars use ufunc machinery (-2 return)
  * 3) If both are scalars but cannot be cast to the right type
- * return NotImplmented (-1 return)
+ * return NotImplemented (-1 return)
  *
  * 4) Perform the function on the C-type.
  * 5) If an error condition occurred, check to see
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index b8f4388b16b4..52c9d3bc63f6 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -525,5 +525,37 @@ def test_numpy_abs(self):
         self._test_abs_func(np.abs)
 
 
+class TestBitShifts(TestCase):
+
+    def test_left_shift(self):
+        # gh-2449
+        for dt in np.typecodes['AllInteger']:
+            arr = np.array([5, -5], dtype=dt)
+            scl_pos, scl_neg = arr
+            for shift in np.array([arr.dtype.itemsize * 8], dtype=dt):
+                res_pos = scl_pos << shift
+                res_neg = scl_neg << shift
+                assert_equal(res_pos, 0)
+                assert_equal(res_neg, 0)
+                # Result on scalars should be the same as on arrays
+                assert_array_equal(arr << shift, [res_pos, res_neg])
+
+    def test_right_shift(self):
+        # gh-2449
+        for dt in np.typecodes['AllInteger']:
+            arr = np.array([5, -5], dtype=dt)
+            scl_pos, scl_neg = arr
+            for shift in np.array([arr.dtype.itemsize * 8], dtype=dt):
+                res_pos = scl_pos >> shift
+                res_neg = scl_neg >> shift
+                assert_equal(res_pos, 0)
+                if dt in np.typecodes['UnsignedInteger']:
+                    assert_equal(res_neg, 0)
+                else:
+                    assert_equal(res_neg, -1)
+                # Result on scalars should be the same as on arrays
+                assert_array_equal(arr >> shift, [res_pos, res_neg], dt)
+
+
 if __name__ == "__main__":
     run_module_suite()

From fca077c7e2fc8f21c639ba479267a34eb16a2810 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 8 Jun 2019 21:13:09 -0700
Subject: [PATCH 2/4] MAINT: Respond to review comments on gh-7473

This:
* Inlines the macros in loops.c.src
* Replaces 8 with `CHAR_BIT `. The `NPY_SIZEOF_*` macros are not used here because it's too much work to apply them to the signed types, and they expand to the same thing anyway.
* Removes the reduce loop specializations which likely no one cares about
* Uses pytest.mark.parametrize to shorten the test
---
 numpy/core/src/umath/loops.c.src      | 104 +++++++-------------------
 numpy/core/src/umath/scalarmath.c.src |  56 +++++++-------
 numpy/core/tests/test_scalarmath.py   |  46 ++++++------
 3 files changed, 77 insertions(+), 129 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 2cf43d10f1e0..eccb9d82cdf6 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -777,7 +777,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 /**begin repeat2
  * Arithmetic
  * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
- * #OP = +, -,*, &, |, ^#
+ * #OP = +, -, *, &, |, ^#
  */
 
 #if @CHK@
@@ -808,98 +808,48 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
  *       which is undefined in C.
  */
 
-#define LEFT_SHIFT_OP                               \
-    do {                                            \
-        if (NPY_LIKELY(in2 < sizeof(@type@) * 8)) { \
-            *out = in1 << in2;                      \
-        }                                           \
-        else {                                      \
-            *out = 0;                               \
-        }                                           \
-    } while (0)
-
-
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 @TYPE@_left_shift@isa@(char **args, npy_intp *dimensions, npy_intp *steps,
                   void *NPY_UNUSED(func))
 {
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(@type@) {
-            @type@ ip2_val = *(@type@ *)ip2;
-
-            if (NPY_LIKELY(ip2_val < sizeof(@type@) * 8)) {
-                io1 <<= ip2_val;
-            }
-            else {
-                io1 = 0;
-            }
+    BINARY_LOOP_FAST(@type@, @type@,
+        if (NPY_LIKELY(in2 < sizeof(@type@) * CHAR_BIT)) {
+            *out = in1 << in2;
         }
-        *((@type@ *)iop1) = io1;
-    }
-    else {
-        BINARY_LOOP_FAST(@type@, @type@, LEFT_SHIFT_OP);
-    }
-}
-
-#undef LEFT_SHIFT_OP
-
-#define RIGHT_SHIFT_OP_SIGNED                       \
-    do {                                            \
-        if (NPY_LIKELY(in2 < sizeof(@type@) * 8)) { \
-            *out = in1 >> in2;                      \
-        }                                           \
-        else if (in1 < 0) {                         \
-            *out = -1;                              \
-        }                                           \
-        else {                                      \
-            *out = 0;                               \
-        }                                           \
-    } while (0)
-
-#define RIGHT_SHIFT_OP_UNSIGNED                     \
-    do {                                            \
-        if (NPY_LIKELY(in2 < sizeof(@type@) * 8)) { \
-            *out = in1 >> in2;                      \
-        }                                           \
-        else {                                      \
-            *out = 0;                               \
-        }                                           \
-    } while (0)
+        else {
+            *out = 0;
+        }
+    );
+}
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 @TYPE@_right_shift@isa@(char **args, npy_intp *dimensions, npy_intp *steps,
                    void *NPY_UNUSED(func))
 {
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(@type@) {
-            @type@ ip2_val = *(@type@ *)ip2;
-
-            if (NPY_LIKELY(ip2_val < sizeof(@type@) * 8)) {
-                io1 >>= ip2_val;
-            }
 #if @SIGNED@
-            else if (io1 < 0) {
-                io1 = -1;
-            }
-#endif
-            else {
-                io1 = 0;
-            }
+    BINARY_LOOP_FAST(@type@, @type@, {
+        if (NPY_LIKELY(in2 < sizeof(@type@) * CHAR_BIT)) {
+            *out = in1 >> in2;
         }
-        *((@type@ *)iop1) = io1;
-    }
-    else {
-#if @SIGNED@
-        BINARY_LOOP_FAST(@type@, @type@, RIGHT_SHIFT_OP_SIGNED);
+        else if (in1 < 0) {
+            *out = (@type@)-1;  /* shift right preserves the sign bit */
+        }
+        else {
+            *out = 0;
+        }
+    });
 #else
-        BINARY_LOOP_FAST(@type@, @type@, RIGHT_SHIFT_OP_UNSIGNED);
+    BINARY_LOOP_FAST(@type@, @type@, {
+        if (NPY_LIKELY(in2 < sizeof(@type@) * CHAR_BIT)) {
+            *out = in1 >> in2;
+        }
+        else {
+            *out = 0;
+        }
+    });
 #endif
-    }
 }
 
-#undef RIGHT_SHIFT_OP_SIGNED
-#undef RIGHT_SHIFT_OP_UNSIGNED
-
 
 /**begin repeat2
  * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 8b586caf6f1f..b691115e7ab6 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -263,38 +263,40 @@ static void
 
 /**end repeat1**/
 
-#define @name@_ctype_lshift(arg1, arg2, out)           \
-    do {                                               \
-        if (NPY_LIKELY((arg2) < sizeof(@type@) * 8)) { \
-            *(out) = (arg1) << (arg2);                 \
-        }                                              \
-        else {                                         \
-            *(out) = 0;                                \
-        }                                              \
+/* Note: these need to be kept in sync with the shift ufuncs */
+
+#define @name@_ctype_lshift(arg1, arg2, out)                  \
+    do {                                                      \
+        if (NPY_LIKELY((arg2) < sizeof(@type@) * CHAR_BIT)) { \
+            *(out) = (arg1) << (arg2);                        \
+        }                                                     \
+        else {                                                \
+            *(out) = 0;                                       \
+        }                                                     \
     } while (0)
 
 #if @issigned@
-    #define @name@_ctype_rshift(arg1, arg2, out)           \
-        do {                                               \
-            if (NPY_LIKELY((arg2) < sizeof(@type@) * 8)) { \
-                *(out) = (arg1) >> (arg2);                 \
-            }                                              \
-            else if ((arg1) < 0) {                         \
-                *(out) = -1;                               \
-            }                                              \
-            else {                                         \
-                *(out) = 0;                                \
-            }                                              \
+    #define @name@_ctype_rshift(arg1, arg2, out)                  \
+        do {                                                      \
+            if (NPY_LIKELY((arg2) < sizeof(@type@) * CHAR_BIT)) { \
+                *(out) = (arg1) >> (arg2);                        \
+            }                                                     \
+            else if ((arg1) < 0) {                                \
+                *(out) = -1;                                      \
+            }                                                     \
+            else {                                                \
+                *(out) = 0;                                       \
+            }                                                     \
         } while (0)
 #else
-    #define @name@_ctype_rshift(arg1, arg2, out)           \
-        do {                                               \
-            if (NPY_LIKELY((arg2) < sizeof(@type@) * 8)) { \
-                *(out) = (arg1) >> (arg2);                 \
-            }                                              \
-            else {                                         \
-                *(out) = 0;                                \
-            }                                              \
+    #define @name@_ctype_rshift(arg1, arg2, out)                  \
+        do {                                                      \
+            if (NPY_LIKELY((arg2) < sizeof(@type@) * CHAR_BIT)) { \
+                *(out) = (arg1) >> (arg2);                        \
+            }                                                     \
+            else {                                                \
+                *(out) = 0;                                       \
+            }                                                     \
         } while (0)
 #endif
 
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index 7a1771ed8b4a..854df5590157 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -668,31 +668,27 @@ def test_numpy_abs(self):
 
 class TestBitShifts(object):
 
-    def test_left_shift(self):
+    @pytest.mark.parametrize('type_code', np.typecodes['AllInteger'])
+    @pytest.mark.parametrize('op',
+        [operator.rshift, operator.lshift], ids=['>>', '<<'])
+    def test_shift_all_bits(self, type_code, op):
+        """ Shifts where the shift amount is the width of the type or wider """
         # gh-2449
-        for dt in np.typecodes['AllInteger']:
-            arr = np.array([5, -5], dtype=dt)
-            scl_pos, scl_neg = arr
-            for shift in np.array([arr.dtype.itemsize * 8], dtype=dt):
-                res_pos = scl_pos << shift
-                res_neg = scl_neg << shift
-                assert_equal(res_pos, 0)
-                assert_equal(res_neg, 0)
-                # Result on scalars should be the same as on arrays
-                assert_array_equal(arr << shift, [res_pos, res_neg])
-
-    def test_right_shift(self):
-        # gh-2449
-        for dt in np.typecodes['AllInteger']:
-            arr = np.array([5, -5], dtype=dt)
-            scl_pos, scl_neg = arr
-            for shift in np.array([arr.dtype.itemsize * 8], dtype=dt):
-                res_pos = scl_pos >> shift
-                res_neg = scl_neg >> shift
-                assert_equal(res_pos, 0)
-                if dt in np.typecodes['UnsignedInteger']:
-                    assert_equal(res_neg, 0)
+        dt = np.dtype(type_code)
+        nbits = dt.itemsize * 8
+        for val in [5, -5]:
+            for shift in [nbits, nbits + 4]:
+                val_scl = dt.type(val)
+                shift_scl = dt.type(shift)
+                res_scl = op(val_scl, shift_scl)
+                if val_scl < 0 and op is operator.rshift:
+                    # sign bit is preserved
+                    assert_equal(res_scl, -1)
                 else:
-                    assert_equal(res_neg, -1)
+                    assert_equal(res_scl, 0)
+
                 # Result on scalars should be the same as on arrays
-                assert_array_equal(arr >> shift, [res_pos, res_neg], dt)
+                val_arr = np.array([val]*32, dtype=dt)
+                shift_arr = np.array([shift]*32, dtype=dt)
+                res_arr = op(val_arr, shift_arr)
+                assert_equal(res_arr, res_scl)

From 79d7bc276afbe89c746e462d28d4bfbb4fc56148 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Tue, 11 Jun 2019 21:49:15 -0700
Subject: [PATCH 3/4] MAINT: Move shift implementation to npy_math

This avoids duplication, as suggested in review.
The suffices `l`, `ull` etc are for consistency with `strtoull` and friends.
`h` and `uhh` are inspired from the C99 printf specifiers.
---
 numpy/core/include/numpy/npy_math.h           | 22 +++++++++
 .../core/src/npymath/npy_math_internal.h.src  | 41 +++++++++++++++++
 numpy/core/src/umath/loops.c.src              | 45 +++++++------------
 numpy/core/src/umath/scalarmath.c.src         | 40 ++---------------
 4 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 6a78ff3c2b54..3f3532d3601e 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -162,6 +162,28 @@ NPY_INPLACE npy_long npy_lcml(npy_long a, npy_long b);
 NPY_INPLACE npy_longlong npy_gcdll(npy_longlong a, npy_longlong b);
 NPY_INPLACE npy_longlong npy_lcmll(npy_longlong a, npy_longlong b);
 
+NPY_INPLACE npy_ubyte npy_rshiftuhh(npy_ubyte a, npy_ubyte b);
+NPY_INPLACE npy_ubyte npy_lshiftuhh(npy_ubyte a, npy_ubyte b);
+NPY_INPLACE npy_ushort npy_rshiftuh(npy_ushort a, npy_ushort b);
+NPY_INPLACE npy_ushort npy_lshiftuh(npy_ushort a, npy_ushort b);
+NPY_INPLACE npy_uint npy_rshiftu(npy_uint a, npy_uint b);
+NPY_INPLACE npy_uint npy_lshiftu(npy_uint a, npy_uint b);
+NPY_INPLACE npy_ulong npy_rshiftul(npy_ulong a, npy_ulong b);
+NPY_INPLACE npy_ulong npy_lshiftul(npy_ulong a, npy_ulong b);
+NPY_INPLACE npy_ulonglong npy_rshiftull(npy_ulonglong a, npy_ulonglong b);
+NPY_INPLACE npy_ulonglong npy_lshiftull(npy_ulonglong a, npy_ulonglong b);
+
+NPY_INPLACE npy_byte npy_rshifthh(npy_byte a, npy_byte b);
+NPY_INPLACE npy_byte npy_lshifthh(npy_byte a, npy_byte b);
+NPY_INPLACE npy_short npy_rshifth(npy_short a, npy_short b);
+NPY_INPLACE npy_short npy_lshifth(npy_short a, npy_short b);
+NPY_INPLACE npy_int npy_rshift(npy_int a, npy_int b);
+NPY_INPLACE npy_int npy_lshift(npy_int a, npy_int b);
+NPY_INPLACE npy_long npy_rshiftl(npy_long a, npy_long b);
+NPY_INPLACE npy_long npy_lshiftl(npy_long a, npy_long b);
+NPY_INPLACE npy_longlong npy_rshiftll(npy_longlong a, npy_longlong b);
+NPY_INPLACE npy_longlong npy_lshiftll(npy_longlong a, npy_longlong b);
+
 /*
  * C99 double math funcs
  */
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index fa820baac3b8..18b6d1434ef4 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -716,3 +716,44 @@ npy_@func@@c@(@type@ a, @type@ b)
     return npy_@func@u@c@(a < 0 ? -a : a, b < 0 ? -b : b);
 }
 /**end repeat**/
+
+/* Unlike LCM and GCD, we need byte and short variants for the shift operators,
+ * since the result is dependent on the width of the type
+ */
+/**begin repeat
+ *
+ * #type = byte, short, int, long, longlong#
+ * #c = hh,h,,l,ll#
+ */
+/**begin repeat1
+ *
+ * #u         = u,#
+ * #is_signed = 0,1#
+ */
+NPY_INPLACE npy_@u@@type@
+npy_lshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return a << b;
+    }
+    else {
+        return 0;
+    }
+}
+NPY_INPLACE npy_@u@@type@
+npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return a >> b;
+    }
+#if @is_signed@
+    else if (a < 0) {
+        return (npy_@u@@type@)-1;  /* preserve the sign bit */
+    }
+#endif
+    else {
+        return 0;
+    }
+}
+/**end repeat1**/
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index eccb9d82cdf6..3d0b41318c5b 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -699,6 +699,7 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
  * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
  *          npy_double, npy_double, npy_double, npy_double#
  * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
  */
 
 #define @TYPE@_floor_divide @TYPE@_divide
@@ -808,46 +809,30 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
  *       which is undefined in C.
  */
 
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 @TYPE@_left_shift@isa@(char **args, npy_intp *dimensions, npy_intp *steps,
                   void *NPY_UNUSED(func))
 {
-    BINARY_LOOP_FAST(@type@, @type@,
-        if (NPY_LIKELY(in2 < sizeof(@type@) * CHAR_BIT)) {
-            *out = in1 << in2;
-        }
-        else {
-            *out = 0;
-        }
-    );
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 
+#undef INT_left_shift_needs_clear_floatstatus
+#undef UINT_left_shift_needs_clear_floatstatus
+
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 @TYPE@_right_shift@isa@(char **args, npy_intp *dimensions, npy_intp *steps,
                    void *NPY_UNUSED(func))
 {
-#if @SIGNED@
-    BINARY_LOOP_FAST(@type@, @type@, {
-        if (NPY_LIKELY(in2 < sizeof(@type@) * CHAR_BIT)) {
-            *out = in1 >> in2;
-        }
-        else if (in1 < 0) {
-            *out = (@type@)-1;  /* shift right preserves the sign bit */
-        }
-        else {
-            *out = 0;
-        }
-    });
-#else
-    BINARY_LOOP_FAST(@type@, @type@, {
-        if (NPY_LIKELY(in2 < sizeof(@type@) * CHAR_BIT)) {
-            *out = in1 >> in2;
-        }
-        else {
-            *out = 0;
-        }
-    });
-#endif
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
 }
 
 
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index b691115e7ab6..df440e09569d 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -251,7 +251,7 @@ static void
  *         long, ulong, longlong, ulonglong#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
- * #issigned = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #suffix = hh,uhh,h,uh,,u,l,ul,ll,ull#
  */
 
 /**begin repeat1
@@ -263,42 +263,8 @@ static void
 
 /**end repeat1**/
 
-/* Note: these need to be kept in sync with the shift ufuncs */
-
-#define @name@_ctype_lshift(arg1, arg2, out)                  \
-    do {                                                      \
-        if (NPY_LIKELY((arg2) < sizeof(@type@) * CHAR_BIT)) { \
-            *(out) = (arg1) << (arg2);                        \
-        }                                                     \
-        else {                                                \
-            *(out) = 0;                                       \
-        }                                                     \
-    } while (0)
-
-#if @issigned@
-    #define @name@_ctype_rshift(arg1, arg2, out)                  \
-        do {                                                      \
-            if (NPY_LIKELY((arg2) < sizeof(@type@) * CHAR_BIT)) { \
-                *(out) = (arg1) >> (arg2);                        \
-            }                                                     \
-            else if ((arg1) < 0) {                                \
-                *(out) = -1;                                      \
-            }                                                     \
-            else {                                                \
-                *(out) = 0;                                       \
-            }                                                     \
-        } while (0)
-#else
-    #define @name@_ctype_rshift(arg1, arg2, out)                  \
-        do {                                                      \
-            if (NPY_LIKELY((arg2) < sizeof(@type@) * CHAR_BIT)) { \
-                *(out) = (arg1) >> (arg2);                        \
-            }                                                     \
-            else {                                                \
-                *(out) = 0;                                       \
-            }                                                     \
-        } while (0)
-#endif
+#define @name@_ctype_lshift(arg1, arg2, out) *(out) = npy_lshift@suffix@(arg1, arg2)
+#define @name@_ctype_rshift(arg1, arg2, out) *(out) = npy_rshift@suffix@(arg1, arg2)
 
 /**end repeat**/
 

From 6cf6ece43589670a28b765fd03402cc08ada61f0 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Wed, 12 Jun 2019 01:13:07 -0700
Subject: [PATCH 4/4] BUG: Disable -O3 on right_shift on compilers which emit
 an internal error

Needed to make CI pass
---
 numpy/core/setup.py              |  6 +++++
 numpy/core/setup_common.py       | 39 ++++++++++++++++++++++++++++++++
 numpy/core/src/umath/loops.c.src |  6 ++++-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 338502791383..e6088a48a805 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -463,6 +463,12 @@ def generate_config_h(ext, build_dir):
             rep = check_long_double_representation(config_cmd)
             moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1))
 
+            if check_for_right_shift_internal_compiler_error(config_cmd):
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_LONG_right_shift')
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONG_right_shift')
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_LONGLONG_right_shift')
+                moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONGLONG_right_shift')
+
             # Py3K check
             if sys.version_info[0] == 3:
                 moredefs.append(('NPY_PY3K', 1))
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 6e3109ab5786..6168e3f8deca 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -5,6 +5,7 @@
 import warnings
 import copy
 import binascii
+import textwrap
 
 from numpy.distutils.misc_util import mingw32
 
@@ -414,3 +415,41 @@ def long_double_representation(lines):
     else:
         # We never detected the after_sequence
         raise ValueError("Could not lock sequences (%s)" % saw)
+
+
+def check_for_right_shift_internal_compiler_error(cmd):
+    """
+    On our arm CI, this fails with an internal compilation error
+
+    The failure looks like the following, and can be reproduced on ARM64 GCC 5.4:
+
+        <source>: In function 'right_shift':
+        <source>:4:20: internal compiler error: in expand_shift_1, at expmed.c:2349
+               ip1[i] = ip1[i] >> in2;
+                      ^
+        Please submit a full bug report,
+        with preprocessed source if appropriate.
+        See <http://gcc.gnu.org/bugs.html> for instructions.
+        Compiler returned: 1
+
+    This function returns True if this compiler bug is present, and we need to
+    turn off optimization for the function
+    """
+    cmd._check_compiler()
+    has_optimize = cmd.try_compile(textwrap.dedent("""\
+        __attribute__((optimize("O3"))) void right_shift() {}
+        """), None, None)
+    if not has_optimize:
+        return False
+
+    no_err = cmd.try_compile(textwrap.dedent("""\
+        typedef long the_type;  /* fails also for unsigned and long long */
+        __attribute__((optimize("O3"))) void right_shift(the_type in2, the_type *ip1, int n) {
+            for (int i = 0; i < n; i++) {
+                if (in2 < (the_type)sizeof(the_type) * 8) {
+                    ip1[i] = ip1[i] >> in2;
+                }
+            }
+        }
+        """), None, None)
+    return not no_err
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 3d0b41318c5b..0c5465761a2f 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -828,7 +828,11 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
 #undef INT_left_shift_needs_clear_floatstatus
 #undef UINT_left_shift_needs_clear_floatstatus
 
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
+NPY_NO_EXPORT
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+NPY_GCC_OPT_3
+#endif
+void
 @TYPE@_right_shift@isa@(char **args, npy_intp *dimensions, npy_intp *steps,
                    void *NPY_UNUSED(func))
 {