add explanation

numpy · Qiyu8 · Aug 11, 2020 · Aug 11, 2020 · Aug 11, 2020 · Aug 11, 2020
commit f18ade4b08cf0db63d8897b86025ebaa516a2ed6
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
@@ -205,11 +205,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
      * This test is faster than a direct modulo.
      * Note alignment value of 0 is allowed and returns False.
      */
-#ifdef NPY_HAVE_NEON
-    return 0;
-#else
     return ((npy_uintp)(p) & ((alignment) - 1)) == 0;
-#endif
 }
 
 /* Get equivalent "uint" alignment given an itemsize, for use in copy code */

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -16,6 +16,25 @@
 #include "simd/simd.h"
 #include "common.h"
 
+// ARM/Neon don't have instructions for aligned memory access
+#ifdef NPY_HAVE_NEON
+    #define EINSUM_IS_ALIGNED(x) 0
+#else
+    #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
+#endif
+
+/**
+ * Unroll by four/eight scalars in case of:
+ *  - The main SIMD loop is unrolled by NPY_SIMD_WIDTH*(2|4), where (2|4) is
+ *    the number of times the loop is unrolled, e.g. 2 SIMD iterations for
+ *    float32 and 4 for float64. This may leave the remaining scalars (a byte
+ *    range <= NPY_SIMD_WIDTH*(2|4)) un-vectorized. If the SIMD width is
+ *    higher than 128 bits, the performance loss on the remaining elements
+ *    is non-negligible, so we rely on the compiler's auto-vectorization.
+ *  - To give the compiler the chance to
+ *    auto-vectorize in case NPYV isn't available.
+ */
+
 #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
 /**
  * Unroll by four/eight scalars in case of: