8000 ENH: Add Neon SIMD implementations for add, sub, mul, and div by DumbMice · Pull Request #16969 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Add Neon SIMD implementations for add, sub, mul, and div #16969

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 31, 2020
114 changes: 113 additions & 1 deletion numpy/core/src/umath/simd.inc.src
56B4
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#undef __AVX512F__
#endif
#endif
#include "simd/simd.h"
#include <assert.h>
#include <stdlib.h>
#include <float.h>
Expand Down Expand Up @@ -505,6 +506,7 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #vector = 1, 1, 0#
* #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
*/

/**begin repeat1
Expand Down Expand Up @@ -553,6 +555,18 @@ static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);

#elif @VECTOR@

static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);

#endif

static NPY_INLINE int
Expand Down Expand Up @@ -584,6 +598,25 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
#elif @VECTOR@
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
@type@ * op = (@type@ *)args[2];
npy_intp n = dimensions[0];
/* argument one scalar */
if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
/* argument two scalar */
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
#endif
return 0;
}
Expand Down Expand Up @@ -3694,7 +3727,86 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/**end repeat**/

#undef VECTOR_SIZE_BYTES
#else /* NPY_HAVE_SSE2_INTRINSICS */

#endif /* NPY_HAVE_SSE2_INTRINSICS */
/**begin repeat
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
* #sfx = f32, f64#
* #CHK = , _F64#
*/

#if NPY_SIMD@CHK@

/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/

/*
 * Element-wise binary @kind@ (op[i] = ip1[i] @OP@ ip2[i]) over n contiguous
 * elements, vectorized with NumPy's universal SIMD intrinsics (npyv_*).
 * Structure: scalar prologue until `op` is aligned, SIMD main loop, scalar tail.
 * NOTE(review): `i` is declared by the LOOP_* macros; all three loops share it.
 */
static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    /* scalar prologue — presumably peels elements until op is
     * NPY_SIMD_WIDTH-aligned (per LOOP_BLOCK_ALIGN_VAR) — confirm macro */
LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
op[i] = ip1[i] @OP@ ip2[i];
}
/* lots of specializations, to squeeze out max performance */
    /* both inputs alias: load once and apply the op to the vector with itself */
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
npyv_store_@sfx@(&op[i], c);
}
}
else {
    /* general case: two unaligned loads, one aligned store per vector */
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
npyv_store_@sfx@(&op[i], c);
}
}
    /* scalar tail: elements left over after the last full vector block */
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[i];
}
}

/*
 * Binary @kind@ with the FIRST operand a scalar: op[i] = ip1[0] @OP@ ip2[i].
 * The scalar is broadcast into a vector register once; loop layout mirrors
 * simd_binary_@kind@_@TYPE@ (align prologue, SIMD body, scalar tail).
 */
static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    /* broadcast ip1[0] to all lanes, hoisted out of the loop */
const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
    /* scalar prologue — presumably runs until op is vector-aligned; see
     * LOOP_BLOCK_ALIGN_VAR */
LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
op[i] = ip1[0] @OP@ ip2[i];
}
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
npyv_store_@sfx@(&op[i], v3);
}
    /* scalar tail for the remainder */
LOOP_BLOCKED_END {
op[i] = ip1[0] @OP@ ip2[i];
}
}

/*
 * Binary @kind@ with the SECOND operand a scalar: op[i] = ip1[i] @OP@ ip2[0].
 * Mirror image of simd_binary_scalar1_@kind@_@TYPE@; operand order is kept
 * (v1 @OP@ v2) so non-commutative ops (subtract, divide) stay correct.
 */
static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    /* broadcast ip2[0] to all lanes, hoisted out of the loop */
const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
    /* scalar prologue — presumably runs until op is vector-aligned; see
     * LOOP_BLOCK_ALIGN_VAR */
LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
op[i] = ip1[i] @OP@ ip2[0];
}
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
npyv_store_@sfx@(&op[i], v3);
}
    /* scalar tail for the remainder */
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[0];
}
}
/**end repeat1**/
#endif /* NPY_SIMD@CHK@ */
/**end repeat**/
#endif
#endif
0