@@ -589,164 +589,68 @@ finish_after_unrolled_loop:
        goto finish_after_unrolled_loop;
}

- static void
+ static NPY_GCC_OPT_3 void
@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                               npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    @type@ *data0 = (@type@ *)dataptr[0];
    @type@ *data1 = (@type@ *)dataptr[1];
    @temptype@ accum = 0;

- #if EINSUM_USE_SSE1 && @float32@
-     __m128 a, accum_sse = _mm_setzero_ps();
- #elif EINSUM_USE_SSE2 && @float64@
-     __m128d a, accum_sse = _mm_setzero_pd();
- #endif
-
    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
                            (int)count);
-
-     /* This is placed before the main loop to make small counts faster */
- finish_after_unrolled_loop:
-     switch (count) {
- /**begin repeat2
-  * #i = 6, 5, 4, 3, 2, 1, 0#
-  */
-         case @i@+1:
-             accum += @from@(data0[@i@]) * @from@(data1[@i@]);
- /**end repeat2**/
-         case 0:
-             *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
-             return;
-     }
-
- #if EINSUM_USE_SSE1 && @float32@
+ #if @NPYV_CHK@ // NPYV check for @type@
    /* Use aligned instructions if possible */
-     if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
-         /* Unroll the loop by 8 */
-         while (count >= 8) {
-             count -= 8;
-
-             _mm_prefetch(data0 + 512, _MM_HINT_T0);
-             _mm_prefetch(data1 + 512, _MM_HINT_T0);
+     const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+     const int vstep = npyv_nlanes_@sfx@;
+     npyv_@sfx@ vaccum = npyv_zero_@sfx@();

- /**begin repeat2
-  * #i = 0, 4#
-  */
-             /*
-              * NOTE: This accumulation changes the order, so will likely
-              *       produce slightly different results.
+     /**begin repeat2
+      * #cond = if(is_aligned), else#
+      * #ld = loada, load#
+      * #st = storea, store#
+      */
+     @cond@ {
+         const npy_intp vstepx4 = vstep * 4;
+         for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+             /**begin repeat3
+              * #i = 0, 1, 2, 3#
              */
-             a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
-             accum_sse = _mm_add_ps(accum_sse, a);
- /**end repeat2**/
-             data0 += 8;
-             data1 += 8;
+             npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+             npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+             /**end repeat3**/
+             npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum);
+             npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3);
+             npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2);
+             vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
        }
-
-         /* Add the four SSE values and put in accum */
-         a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-         accum_sse = _mm_add_ps(a, accum_sse);
-         a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-         accum_sse = _mm_add_ps(a, accum_sse);
-         _mm_store_ss(&accum, accum_sse);
-
-         /* Finish off the loop */
-         goto finish_after_unrolled_loop;
    }
- #elif EINSUM_USE_SSE2 && @float64@
-     /* Use aligned instructions if possible */
-     if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
-         /* Unroll the loop by 8 */
-         while (count >= 8) {
-             count -= 8;
-
-             _mm_prefetch(data0 + 512, _MM_HINT_T0);
-             _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
- /**begin repeat2
-  * #i = 0, 2, 4, 6#
-  */
-             /*
-              * NOTE: This accumulation changes the order, so will likely
-              *       produce slightly different results.
-              */
-             a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
-             accum_sse = _mm_add_pd(accum_sse, a);
- /**end repeat2**/
-             data0 += 8;
-             data1 += 8;
-         }
-
-         /* Add the two SSE2 values and put in accum */
-         a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-         accum_sse = _mm_add_pd(a, accum_sse);
-         _mm_store_sd(&accum, accum_sse);
-
-         /* Finish off the loop */
-         goto finish_after_unrolled_loop;
+     /**end repeat2**/
+     for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+         npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count);
+         npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count);
+         vaccum = npyv_muladd_@sfx@(a, b, vaccum);
    }
- #endif
-
-     /* Unroll the loop by 8 */
-     while (count >= 8) {
-         count -= 8;
-
- #if EINSUM_USE_SSE1 && @float32@
-         _mm_prefetch(data0 + 512, _MM_HINT_T0);
-         _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
- /**begin repeat2
-  * #i = 0, 4#
-  */
-         /*
-          * NOTE: This accumulation changes the order, so will likely
-          *       produce slightly different results.
-          */
-         a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
-         accum_sse = _mm_add_ps(accum_sse, a);
- /**end repeat2**/
- #elif EINSUM_USE_SSE2 && @float64@
-         _mm_prefetch(data0 + 512, _MM_HINT_T0);
-         _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
- /**begin repeat2
-  * #i = 0, 2, 4, 6#
-  */
-         /*
-          * NOTE: This accumulation changes the order, so will likely
-          *       produce slightly different results.
-          */
-         a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
-         accum_sse = _mm_add_pd(accum_sse, a);
- /**end repeat2**/
+     accum = npyv_sum_@sfx@(vaccum);
+     npyv_cleanup();
#else
- /**begin repeat2
-  * #i = 0, 1, 2, 3, 4, 5, 6, 7#
-  */
-         accum += @from@(data0[@i@]) * @from@(data1[@i@]);
- /**end repeat2**/
- #endif
-         data0 += 8;
-         data1 += 8;
+ #ifndef NPY_DISABLE_OPTIMIZATION
+     for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+         /**begin repeat2
+          * #i = 0, 1, 2, 3#
+          */
+         const @type@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
+         /**end repeat2**/
+         accum += ab0 + ab1 + ab2 + ab3;
    }
-
- #if EINSUM_USE_SSE1 && @float32@
-     /* Add the four SSE values and put in accum */
-     a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-     accum_sse = _mm_add_ps(a, accum_sse);
-     a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-     accum_sse = _mm_add_ps(a, accum_sse);
-     _mm_store_ss(&accum, accum_sse);
- #elif EINSUM_USE_SSE2 && @float64@
-     /* Add the two SSE2 values and put in accum */
-     a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-     accum_sse = _mm_add_pd(a, accum_sse);
-     _mm_store_sd(&accum, accum_sse);
- #endif
-
-     /* Finish off the loop */
-     goto finish_after_unrolled_loop;
+ #endif // !NPY_DISABLE_OPTIMIZATION
+     for (; count > 0; --count, ++data0, ++data1) {
+         const @type@ a = @from@(*data0);
+         const @type@ b = @from@(*data1);
+         accum += a * b;
+     }
+ #endif // NPYV check for @type@
+     *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
}


static void
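
Reviewer note: both the new NPYV main loop and the `#ifndef NPY_DISABLE_OPTIMIZATION` scalar fallback accumulate four products per step before folding them into `accum`. As the deleted SSE comments warned, this reassociates the floating-point additions, so results can differ in the last bits from a plain scalar loop. Below is a minimal stand-alone C analogue of the fallback path; the name `dot_unrolled4` and the `main()` driver are illustrative only, not part of NumPy.

```c
#include <stdio.h>
#include <stddef.h>

/* Unroll by 4: form the four products independently, fold them into the
 * accumulator, then finish the remainder one element at a time. The
 * additions happen in a different order than a straight
 * `for (i) accum += a[i] * b[i];` loop, so rounding can differ slightly. */
static float dot_unrolled4(const float *a, const float *b, size_t n)
{
    float accum = 0.0f;
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        const float ab0 = a[i + 0] * b[i + 0];
        const float ab1 = a[i + 1] * b[i + 1];
        const float ab2 = a[i + 2] * b[i + 2];
        const float ab3 = a[i + 3] * b[i + 3];
        accum += ab0 + ab1 + ab2 + ab3;
    }
    for (; i < n; ++i) {   /* scalar tail, like the final `count > 0` loop */
        accum += a[i] * b[i];
    }
    return accum;
}

int main(void)
{
    const float a[] = {1, 2, 3, 4, 5, 6, 7};
    const float b[] = {7, 6, 5, 4, 3, 2, 1};
    printf("%g\n", dot_unrolled4(a, b, 7));   /* prints 84 */
    return 0;
}
```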
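The vector tail loop leans on `npyv_load_tillz_@sfx@`, which loads at most `count` lanes and zero-fills the rest, so the same full-width `muladd` handles the remainder (the extra lanes contribute `0 * 0`). Note that `count` is a signed `npy_intp`: the last `count -= vstep` may drive it negative, and the `count > 0` condition then exits. Here is a scalar emulation of the idea with a fixed 4-lane "vector"; `NLANES` and `load_tillz` are illustrative stand-ins, not the NPYV API.

```c
#include <stdio.h>
#include <stddef.h>

#define NLANES 4   /* stand-in for npyv_nlanes_@sfx@; real width is per-target */

/* Emulates the partial load: copy up to `count` lanes, zero the rest,
 * so a full-width lane-wise multiply-add is safe past the valid data. */
static void load_tillz(float dst[NLANES], const float *src, ptrdiff_t count)
{
    for (int i = 0; i < NLANES; i++) {
        dst[i] = (i < count) ? src[i] : 0.0f;
    }
}

int main(void)
{
    const float a[] = {1, 2, 3, 4, 5, 6};
    const float b[] = {6, 5, 4, 3, 2, 1};
    float vaccum[NLANES] = {0};
    /* Same shape as the patch's tail loop; count goes negative after
     * the final partial vector, which ends the loop. */
    for (ptrdiff_t count = 6, i = 0; count > 0; count -= NLANES, i += NLANES) {
        float va[NLANES], vb[NLANES];
        load_tillz(va, a + i, count);
        load_tillz(vb, b + i, count);
        for (int l = 0; l < NLANES; l++) {
            vaccum[l] += va[l] * vb[l];   /* lane-wise muladd */
        }
    }
    /* Horizontal reduction, like npyv_sum_@sfx@(vaccum). */
    const float accum = (vaccum[0] + vaccum[1]) + (vaccum[2] + vaccum[3]);
    printf("%g\n", accum);   /* prints 56 */
    return 0;
}
```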