From 1964b6a4c5ee377e6489923cbc6b2c61b013a6d5 Mon Sep 17 00:00:00 2001
From: Julian Taylor <jtaylor.debian@googlemail.com>
Date: Mon, 27 Mar 2017 14:55:35 +0200
Subject: [PATCH] MAINT: restore auto-vectorization of inplace operations

GCC 6/7 lost the ability to vectorize in-place operations with our
current hinting. This makes in-place operations slower than
out-of-place operations, which is bad, especially as we now
automatically avoid temporaries.
This issue has been filed as GCC PR80198.

Luckily, GCC also has a no-loop-dependence pragma (`#pragma GCC ivdep`)
that we can use to enforce vectorization in the in-place code path.
In the in-place scalar path, an extra code hint (deriving the output
pointer from the input pointer) is sufficient.
---
 numpy/core/src/umath/loops.c.src | 39 ++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 107d525fcacb..24364afbde8b 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -138,6 +138,29 @@
         tout * out = (tout *)op1; \
         op; \
     }
+/*
+ * gcc 6/7 regressed (PR80198) and needs extra hints to vectorize in-place ops.
+ * Only use after checking op1 == ip1 or op1 == ip2. NOTE(review): ivdep
+ * assumes iterations are independent; confirm no input partially overlaps op1.
+ * TODO: using ivdep might allow other compilers to vectorize too
+ */
+#if __GNUC__ >= 6
+#define IVDEP_LOOP _Pragma("GCC ivdep")
+#else
+#define IVDEP_LOOP
+#endif
+#define BASE_BINARY_LOOP_INP(tin, tout, op) \
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
+    npy_intp n = dimensions[0];\
+    npy_intp i;\
+    IVDEP_LOOP /* the ivdep pragma when __GNUC__ >= 6, otherwise empty */ \
+    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
+        const tin in1 = *(tin *)ip1; \
+        const tin in2 = *(tin *)ip2; \
+        tout * out = (tout *)op1; \
+        op; /* caller-supplied statement; in1, in2 and out are in scope */ \
+    }
 #define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
     const tin cin = *(tin *)cinp; \
     BINARY_LOOP { \
@@ -145,15 +168,23 @@
         tout * out = (tout *)op1; \
         op; \
     }
+/* PR80198 again: the in-place scalar loop works without the ivdep pragma */
+#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
+    const tin cin = *(tin *)cinp; /* scalar operand, read before the loop */ \
+    BINARY_LOOP { \
+        const tin vin = *(tin *)vinp; \
+        tout * out = (tout *)vinp; /* in-place: write back through vinp */ \
+        op; \
+    }
 #define BINARY_LOOP_FAST(tin, tout, op) \
     do { \
     /* condition allows compiler to optimize the generic macro */ \
     if (IS_BINARY_CONT(tin, tout)) { \
         if (args[2] == args[0]) { \
-            BASE_BINARY_LOOP(tin, tout, op) \
+            BASE_BINARY_LOOP_INP(tin, tout, op) \
         } \
         else if (args[2] == args[1]) { \
-            BASE_BINARY_LOOP(tin, tout, op) \
+            BASE_BINARY_LOOP_INP(tin, tout, op) \
         } \
         else { \
             BASE_BINARY_LOOP(tin, tout, op) \
@@ -161,7 +192,7 @@
     } \
     else if (IS_BINARY_CONT_S1(tin, tout)) { \
         if (args[1] == args[2]) { \
-            BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+            BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
         } \
         else { \
             BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
@@ -169,7 +200,7 @@
     } \
     else if (IS_BINARY_CONT_S2(tin, tout)) { \
         if (args[0] == args[2]) { \
-            BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+            BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
         } \
         else { \
             BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \