From ae244d74e9e9380a95a701ba06eb407c79b3ebf0 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Tue, 28 Jul 2020 18:03:08 +0800 Subject: [PATCH 01/11] ENH: Add Neon implmentation for add, sub, mul, div --- numpy/core/src/umath/simd.inc.src | 178 ++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 7866f8143c6a..4a02ed047bc6 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -28,6 +28,9 @@ #undef __AVX512F__ #endif #endif +#ifdef NPY_HAVE_NEON +#include +#endif #include #include #include @@ -553,6 +556,18 @@ static void sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); +#elif @vector@ && defined NPY_HAVE_NEON + +static void +neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); + #endif static NPY_INLINE int @@ -584,6 +599,30 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } +#elif @vector@ && defined NPY_HAVE_NEON + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; +#if defined __aarch64__ + const npy_uintp vector_size_bytes = 16; +#else + const npy_uintp vector_size_bytes = 8; +#endif + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { + neon_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { + neon_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { + neon_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } #endif return 0; } @@ -3697,4 +3736,143 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #endif /* NPY_HAVE_SSE2_INTRINSICS */ +#ifdef NPY_HAVE_NEON +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #scalarf = npy_sqrtf, npy_sqrt# + * #Nvtype128 = float32x4_t, float64x2_t# + * #Nvtype64 = float32x2_t, float64x1_t# + * #Nvpre = v, v# + * #Nvsuf = f32, f64# + * #double = 0, 1# + */ + + +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ + +static void +neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const npy_intp vector_size_bytes = 16; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, a); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); + @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + } + +#elif defined __aarch32__ + const npy_intp vector_size_bytes = 8; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of 
specializations, to squeeze out max performance */ + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, a); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); + @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); + @Nvpre@st1_@Nvsuf@(&op[i], c); + } + } + +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + + +static void +neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const npy_intp vector_size_bytes = 16; + const @Nvtype128@ a = @Nvpre@ld1q_dup_@Nvsuf@(ip1); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + +#elif defined __aarch32__ + const npy_intp vector_size_bytes = 8; + const @Nvtype64@ a = @Nvpre@ld1_dup_@Nvsuf@(ip1); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); + @Nvpre@st1_@Nvsuf@(&op[i], c); + } + +#endif + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + + +static void +neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const npy_intp vector_size_bytes = 16; + const @Nvtype128@ b = @Nvpre@ld1q_dup_@Nvsuf@(ip2); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + +#elif defined __aarch32__ + const npy_intp vector_size_bytes = 16; + const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); + @Nvpre@st1_@Nvsuf@(&op[i], c); + } + +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} + +/**end repeat1**/ +/**end repeat**/ + +#endif /* NPY_HAVENEON */ #endif From ad20babe8d46b01033e2fafd23e0a67d10bb5b05 Mon Sep 17 00:00:00 2001 From: DumbMice Date: Tue, 28 Jul 2020 22:22:53 +0800 Subject: [PATCH 02/11] Update simd.inc.src --- numpy/core/src/umath/simd.inc.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 4a02ed047bc6..f5a841944883 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3855,7 +3855,7 @@ neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i } #elif defined __aarch32__ - const npy_intp vector_size_bytes = 16; + const npy_intp vector_size_bytes = 8; const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[0]; From 6c32cd020a21285ab3699f0f12b0b31b3785f9e7 Mon Sep 17 00:00:00 2001 From: DumbMice Date: Wed, 29 Jul 2020 10:15:31 +0800 Subject: [PATCH 03/11] delete the unused --- numpy/core/src/umath/simd.inc.src | 1 - 1 file changed, 1 deletion(-) diff --git 
a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index f5a841944883..8fb88ba6e37d 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3745,7 +3745,6 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) * #Nvtype64 = float32x2_t, float64x1_t# * #Nvpre = v, v# * #Nvsuf = f32, f64# - * #double = 0, 1# */ From 380029d3f06146d6309123dc66d64d2561eb1ee6 Mon Sep 17 00:00:00 2001 From: DumbMice Date: Wed, 29 Jul 2020 10:33:05 +0800 Subject: [PATCH 04/11] Update simd.inc.src --- numpy/core/src/umath/simd.inc.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 8fb88ba6e37d..04dccc347899 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3788,7 +3788,7 @@ neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) LOOP_BLOCKED(@type@, vector_size_bytes) { @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, a); - @Nvpre@st1q_@Nvsuf@(&op[i], c); + @Nvpre@st1_@Nvsuf@(&op[i], c); } } else { From df7b19929801a49724d3b3c8a7c1a6245e9d39bd Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 14:56:40 +0800 Subject: [PATCH 05/11] update; extract from scalar1&2 --- numpy/core/src/._.DS_Store | Bin 0 -> 4096 bytes .../src/umath/_umath_tests.dispatch.asimdhp.c | 12 ++++ numpy/core/src/umath/simd.inc.src | 62 ++++++++++++++---- 3 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 numpy/core/src/._.DS_Store create mode 100644 numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c diff --git a/numpy/core/src/._.DS_Store b/numpy/core/src/._.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..41837bc8eb701c2c8b198f9c5595938388d2c675 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIhYCu0iY;W;207T z1eEv-(ZNuGRne&6Xb6mkz-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeDP&ot`f#!iQ z7|4ZWWELwFr55Lx7A2=Dq~#Z7D`e)Cq~?`m=I15m4jn|NjF3PEQ@v literal 0 HcmV?d00001 diff --git a/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c b/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c new file mode 100644 index 000000000000..97f646f2867e --- /dev/null +++ b/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c @@ -0,0 +1,12 @@ +/** + * AUTOGENERATED DON'T EDIT + * Please make changes to the code generator (distutils/ccompiler_opt.py) + */ +#define NPY__CPU_TARGET_MODE +#define NPY__CPU_TARGET_CURRENT ASIMDHP +#define NPY__CPU_TARGET_NEON +#define NPY__CPU_TARGET_NEON_FP16 +#define NPY__CPU_TARGET_NEON_VFPV4 +#define NPY__CPU_TARGET_ASIMD +#define NPY__CPU_TARGET_ASIMDHP +#include "/home/bangcheng/project/proj-2021348/numpy/core/src/umath/_umath_tests.dispatch.c" diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 04dccc347899..a8d6de3843e5 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -36,7 +36,14 @@ #include #include /* for memcpy */ +#ifdef NPY_HAVE_NEON +#ifdef __aarch64__ +#define vector_size_bytes 16 +#else +#define vector_size_bytes 8 +#endif #define VECTOR_SIZE_BYTES 16 +#endif /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. 
@@ -604,11 +611,6 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp @type@ * ip2 = (@type@ *)args[1]; @type@ * op = (@type@ *)args[2]; npy_intp n = dimensions[0]; -#if defined __aarch64__ - const npy_uintp vector_size_bytes = 16; -#else - const npy_uintp vector_size_bytes = 8; -#endif /* argument one scalar */ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { neon_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); @@ -3759,7 +3761,6 @@ static void neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { #ifdef __aarch64__ - const npy_intp vector_size_bytes = 16; LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ @@ -3779,8 +3780,7 @@ neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } } -#elif defined __aarch32__ - const npy_intp vector_size_bytes = 8; +#else LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ @@ -3805,13 +3805,49 @@ neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) op[i] = ip1[i] @OP@ ip2[i]; } } +/**begin repeat2 + * scalar1 & scalar2 + * + * # scalar_loc = 1, 2# + * # vector_loc = 2, 1# + * # ip1ind = 0, i# + * # ip2ind = i, 0# + */ +static void +neon_binary_scalar@scalar_loc@_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const @Nvtype128@ v@scalar_loc@ = @Nvpre@ld1q_dup_@Nvsuf@(ip@scalar_loc@); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ v@vector_loc@ = @Nvpre@ld1q_@Nvsuf@(&ip@vector_loc@[i]); + @Nvtype128@ v3 = @Nvpre@@VOP@q_@Nvsuf@(v1, v2); + @Nvpre@st1q_@Nvsuf@(&op[i], v3); + } + +#else + const @Nvtype64@ v@scalar_loc@ = @Nvpre@ld1_dup_@Nvsuf@(ip@scalar_loc@); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ v@vector_loc@ = @Nvpre@ld1_@Nvsuf@(&ip@vector_loc@[i]); + @Nvtype64@ v3 = @Nvpre@@VOP@_@Nvsuf@(v1, v2); + @Nvpre@st1_@Nvsuf@(&op[i], v3); + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + } +} +/**end repeat2**/ +/* static void neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { #ifdef __aarch64__ - const npy_intp vector_size_bytes = 16; const @Nvtype128@ a = @Nvpre@ld1q_dup_@Nvsuf@(ip1); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[0] @OP@ ip2[i]; @@ -3821,8 +3857,7 @@ neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i @Nvpre@st1q_@Nvsuf@(&op[i], c); } -#elif defined __aarch32__ - const npy_intp vector_size_bytes = 8; +#elif __aarch32__ const @Nvtype64@ a = @Nvpre@ld1_dup_@Nvsuf@(ip1); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[0] @OP@ ip2[i]; @@ -3843,7 +3878,6 @@ static void neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { #ifdef __aarch64__ - const npy_intp vector_size_bytes = 16; const @Nvtype128@ b = @Nvpre@ld1q_dup_@Nvsuf@(ip2); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[0]; @@ -3853,8 +3887,7 @@ neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i @Nvpre@st1q_@Nvsuf@(&op[i], c); } -#elif defined __aarch32__ - const npy_intp vector_size_bytes = 8; +#elif 
__aarch32__ const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[0]; @@ -3869,6 +3902,7 @@ neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i op[i] = ip1[i] @OP@ ip2[0]; } } +*/ /**end repeat1**/ /**end repeat**/ From 867b2b9176e722fdd4e4661f26c6f41fd964aad9 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 15:37:14 +0800 Subject: [PATCH 06/11] update --- numpy/core/src/._.DS_Store | Bin 4096 -> 0 bytes .../src/umath/_umath_tests.dispatch.asimdhp.c | 12 ------------ 2 files changed, 12 deletions(-) delete mode 100644 numpy/core/src/._.DS_Store delete mode 100644 numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c diff --git a/numpy/core/src/._.DS_Store b/numpy/core/src/._.DS_Store deleted file mode 100644 index 41837bc8eb701c2c8b198f9c5595938388d2c675..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIhYCu0iY;W;207T z1eEv-(ZNuGRne&6Xb6mkz-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeDP&ot`f#!iQ z7|4ZWWELwFr55Lx7A2=Dq~#Z7D`e)Cq~?`m=I15m4jn|NjF3PEQ@v diff --git a/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c b/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c deleted file mode 100644 index 97f646f2867e..000000000000 --- a/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c +++ /dev/null @@ -1,12 +0,0 @@ -/** - * AUTOGENERATED DON'T EDIT - * Please make changes to the code generator (distutils/ccompiler_opt.py) - */ -#define NPY__CPU_TARGET_MODE -#define NPY__CPU_TARGET_CURRENT ASIMDHP -#define NPY__CPU_TARGET_NEON -#define NPY__CPU_TARGET_NEON_FP16 -#define NPY__CPU_TARGET_NEON_VFPV4 -#define NPY__CPU_TARGET_ASIMD -#define NPY__CPU_TARGET_ASIMDHP -#include "/home/bangcheng/project/proj-2021348/numpy/core/src/umath/_umath_tests.dispatch.c" From 8d1d95c7402ed3020eb0cba54c142fb1313edf0d Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 15:53:45 +0800 Subject: [PATCH 07/11] update --- numpy/core/src/umath/simd.inc.src | 63 +------------------------------ 1 file changed, 1 insertion(+), 62 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index a8d6de3843e5..fa2c39b3ae46 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3843,69 +3843,8 @@ neon_binary_scalar@scalar_loc@_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * } } /**end repeat2**/ -/* -static void -neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __aarch64__ - const @Nvtype128@ a = @Nvpre@ld1q_dup_@Nvsuf@(ip1); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - -#elif __aarch32__ - const @Nvtype64@ a = @Nvpre@ld1_dup_@Nvsuf@(ip1); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); - @Nvpre@st1_@Nvsuf@(&op[i], c); - } - -#endif - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} - - -static void -neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __aarch64__ - const @Nvtype128@ b = @Nvpre@ld1q_dup_@Nvsuf@(ip2); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 
vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - -#elif __aarch32__ - const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); - @Nvpre@st1_@Nvsuf@(&op[i], c); - } - -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } -} -*/ - /**end repeat1**/ /**end repeat**/ -#endif /* NPY_HAVENEON */ +#endif /* NPY_HAVE_NEON */ #endif From 708bf277793444676483ff26ff27d29b3236b361 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 16:19:37 +0800 Subject: [PATCH 08/11] fix macros --- numpy/core/src/umath/simd.inc.src | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index fa2c39b3ae46..1d27fe4864b2 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -42,9 +42,10 @@ #else #define vector_size_bytes 8 #endif -#define VECTOR_SIZE_BYTES 16 #endif +#define VECTOR_SIZE_BYTES 16 + /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. * Very large step size can be as slow as processing it using scalar. The From 96b6b13c27e6ee2423ad5cc79d4761727c74a4c3 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Thu, 30 Jul 2020 01:09:12 +0800 Subject: [PATCH 09/11] transfer neon into universal intrinsics --- numpy/core/src/umath/simd.inc.src | 144 +++++++++++------------------- 1 file changed, 51 insertions(+), 93 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 1d27fe4864b2..d4073fd15ae3 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -28,22 +28,12 @@ #undef __AVX512F__ #endif #endif -#ifdef NPY_HAVE_NEON -#include -#endif +#include "simd/simd.h" #include #include #include #include /* for memcpy */ -#ifdef NPY_HAVE_NEON -#ifdef __aarch64__ -#define vector_size_bytes 16 -#else -#define vector_size_bytes 8 -#endif -#endif - #define VECTOR_SIZE_BYTES 16 /* @@ -516,6 +506,7 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #vector = 1, 1, 0# + * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # */ /**begin repeat1 @@ -564,16 +555,16 @@ static void sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); -#elif @vector@ && defined NPY_HAVE_NEON +#elif @VECTOR@ static void -neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); static void -neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, +simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); static void -neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); #endif @@ -607,23 +598,23 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } -#elif @vector@ && defined NPY_HAVE_NEON +#elif @VECTOR@ @type@ * ip1 = (@type@ *)args[0]; @type@ * ip2 
= (@type@ *)args[1]; @type@ * op = (@type@ *)args[2]; npy_intp n = dimensions[0]; /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { - neon_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { - neon_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { - neon_binary_@kind@_@TYPE@(op, ip1, ip2, n); + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } #endif @@ -3739,17 +3730,13 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #endif /* NPY_HAVE_SSE2_INTRINSICS */ -#ifdef NPY_HAVE_NEON /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# - * #scalarf = npy_sqrtf, npy_sqrt# - * #Nvtype128 = float32x4_t, float64x2_t# - * #Nvtype64 = float32x2_t, float64x1_t# - * #Nvpre = v, v# - * #Nvsuf = f32, f64# + * #sfx = f32, f64# + * #CHK = , _F64# */ - +#if NPY_SIMD@CHK@ /**begin repeat1 * Arithmetic @@ -3759,93 +3746,64 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) */ static void -neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { -#ifdef __aarch64__ - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, a); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); - @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - } - -#else - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, a); - @Nvpre@st1_@Nvsuf@(&op[i], c); + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); + npyv_store_@sfx@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); - @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); - @Nvpre@st1_@Nvsuf@(&op[i], c); + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); + npyv_store_@sfx@(&op[i], c); } } - -#endif LOOP_BLOCKED_END { op[i] = ip1[i] @OP@ ip2[i]; } } -/**begin repeat2 - * scalar1 & scalar2 - * - * # scalar_loc = 1, 2# - * # vector_loc = 2, 1# - * # ip1ind = 0, i# - * # ip2ind = i, 0# - */ static void -neon_binary_scalar@scalar_loc@_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) 
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { -#ifdef __aarch64__ - const @Nvtype128@ v@scalar_loc@ = @Nvpre@ld1q_dup_@Nvsuf@(ip@scalar_loc@); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ v@vector_loc@ = @Nvpre@ld1q_@Nvsuf@(&ip@vector_loc@[i]); - @Nvtype128@ v3 = @Nvpre@@VOP@q_@Nvsuf@(v1, v2); - @Nvpre@st1q_@Nvsuf@(&op[i], v3); + const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) + op[i] = ip1[0] @OP@ ip2[i]; + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); } - -#else - const @Nvtype64@ v@scalar_loc@ = @Nvpre@ld1_dup_@Nvsuf@(ip@scalar_loc@); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ v@vector_loc@ = @Nvpre@ld1_@Nvsuf@(&ip@vector_loc@[i]); - @Nvtype64@ v3 = @Nvpre@@VOP@_@Nvsuf@(v1, v2); - @Nvpre@st1_@Nvsuf@(&op[i], v3); + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; } +} -#endif +static void +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) + op[i] = ip1[i] @OP@ ip2[0]; + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } LOOP_BLOCKED_END { - op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + op[i] = ip1[i] @OP@ ip2[0]; } } -/**end repeat2**/ /**end repeat1**/ +#endif /* NPY_SIMD@CHK@ */ /**end repeat**/ -#endif /* NPY_HAVE_NEON */ #endif From 1286dc45d74cc33369a2dcc849545807908459e9 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Thu, 30 Jul 2020 02:38:14 +0800 Subject: [PATCH 10/11] avoid defining simd_binary functions for sse2-enabled machines --- numpy/core/src/umath/simd.inc.src | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index d4073fd15ae3..49452c77b2e9 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3728,8 +3728,7 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #undef VECTOR_SIZE_BYTES -#endif /* NPY_HAVE_SSE2_INTRINSICS */ - +#else /* NPY_HAVE_SSE2_INTRINSICS */ /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# @@ -3805,5 +3804,5 @@ simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i /**end repeat1**/ #endif /* NPY_SIMD@CHK@ */ /**end repeat**/ - +#endif #endif From a36153f72e51833af4a2116a51b4c5ca30d77bbd Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Thu, 30 Jul 2020 10:25:53 +0800 Subject: [PATCH 11/11] ENH: Add Neon SIMD implmentation for add, sub, mul, div --- numpy/core/src/umath/simd.inc.src | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 49452c77b2e9..40bb76914fab 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3727,14 +3727,15 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) /**end repeat**/ #undef VECTOR_SIZE_BYTES - #else /* NPY_HAVE_SSE2_INTRINSICS */ + /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, 
DOUBLE#
 * #sfx = f32, f64#
 * #CHK = , _F64#
 */
+
#if NPY_SIMD@CHK@
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/
static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] @OP@ ip2[i];
+    }
    /* lots of specializations, to squeeze out max performance */
    if (ip1 == ip2) {
        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
@@ -3774,8 +3776,9 @@ static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH)
-        op[i] = ip1[0] @OP@ ip2[i];
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+        op[i] = ip1[0] @OP@ ip2[i];
+    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
@@ -3790,8 +3793,9 @@ static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH)
-        op[i] = ip1[i] @OP@ ip2[0];
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+        op[i] = ip1[i] @OP@ ip2[0];
+    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);