numpy · Qiyu8 · Aug 11, 2020 · Aug 11, 2020 · Aug 11, 2020 · Aug 11, 2020
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
@@ -112,11 +112,14 @@ class Einsum(Benchmark):
     def setup(self, dtype):
         self.a = np.arange(2900, dtype=dtype)
         self.b = np.arange(3000, dtype=dtype)
+        self.b1 = np.arange(240000, dtype=dtype).reshape(400, 600)
         self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40)
         self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40)
+        self.c2 = np.arange(480000, dtype=dtype)
+        self.c3 = np.arange(600, dtype=dtype)
         self.d = np.arange(10000, dtype=dtype).reshape(10,100,10)
 
-    #outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
+    # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
     def time_einsum_outer(self, dtype):
         np.einsum("i,j", self.a, self.b, optimize=True)
 
@@ -130,4 +133,16 @@ def time_einsum_sum_mul(self, dtype):
 
     # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two
     def time_einsum_sum_mul2(self, dtype):
-            np.einsum("i...,->", self.d, 300, optimize=True)
      
        
      
    
+        np.einsum("i...,->", self.d, 300, optimize=True)
+
+    # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two
+    def time_einsum_mul(self, dtype):
+        np.einsum("i,->i", self.c2, 300, optimize=True)
+
+    # trigger sum_of_products_contig_contig_outstride0_two
+    def time_einsum_contig_contig(self, dtype):
+        np.einsum("ji,i->", self.b1, self.c3, optimize=True)
+
+    # trigger sum_of_products_contig_outstride0_one
+    def time_einsum_contig_outstride0(self, dtype):
+        np.einsum("i->", self.c2, optimize=True)
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
@@ -853,6 +853,7 @@ def get_mathlib_info(*args):
             join('src', 'multiarray', 'dragon4.c'),
             join('src', 'multiarray', 'dtype_transfer.c'),
             join('src', 'multiarray', 'einsum.c.src'),
+            join('src', 'multiarray', 'einsum.dispatch.c.src'),
             join('src', 'multiarray', 'flagsobject.c'),
             join('src', 'multiarray', 'getset.c'),
             join('src', 'multiarray', 'hashdescr.c'),

diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -62,6 +62,13 @@
 #define npyv_mul_f32 _mm256_mul_ps
 #define npyv_mul_f64 _mm256_mul_pd
 
+#ifdef NPY_HAVE_FMA3
+    #define npyv_muladd_f32 _mm256_fmadd_ps
+    #define npyv_muladd_f64 _mm256_fmadd_pd
+#else
+    #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c)
+    #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c)
+#endif
 // saturated
 // TODO: after implment Packs intrins
 
@@ -72,4 +79,22 @@
 #define npyv_div_f32 _mm256_div_ps
 #define npyv_div_f64 _mm256_div_pd
 
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m256 a)
+{
+    __m128 t1 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a,1)); // low 128 bits + high 128 bits
+    __m128 t2 = _mm_movehdup_ps(t1);   // duplicate odd lanes: {t1[1], t1[1], t1[3], t1[3]}
+    __m128 t3 = _mm_add_ps(t1, t2);    // lane 0 = t1[0]+t1[1], lane 2 = t1[2]+t1[3]
+    __m128 t4 = _mm_movehl_ps(t3, t3); // move lane 2 of t3 down to lane 0
+    __m128 t5 = _mm_add_ss(t3, t4);    // lane 0 = sum of all eight input elements
+    return _mm_cvtss_f32(t5);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m256d a)
+{
+    __m128d t1 = _mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a,1)); // low 128 bits + high 128 bits
+    __m128d t2 = _mm_unpackhi_pd(t1, t1); // {t1[1], t1[1]}
+    __m128d t3 = _mm_add_sd(t2, t1);      // lane 0 = t1[1] + t1[0]
+    return _mm_cvtsd_f64(t3);             // extract lane 0 as the scalar sum
+}
 #endif // _NPY_SIMD_AVX2_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -103,6 +103,9 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
 #define npyv_mul_f32 _mm512_mul_ps
 #define npyv_mul_f64 _mm512_mul_pd
 
+#define npyv_muladd_f32 _mm512_fmadd_ps
+#define npyv_muladd_f64 _mm512_fmadd_pd
+
 // saturated
 // TODO: after implment Packs intrins
 
@@ -112,5 +115,27 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
 // TODO: emulate integer division
 #define npyv_div_f32 _mm512_div_ps
 #define npyv_div_f64 _mm512_div_pd
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+{
+    __m512 h64   = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); // 128-bit lanes {2, 3, 2, 3}
+    __m512 sum32 = _mm512_add_ps(a, h64); // lanes 0,1 = lane0+lane2, lane1+lane3 (upper 256 bits folded down)
+    __m512 h32   = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent 128-bit lanes so lane 0 receives lane 1
+    __m512 sum16 = _mm512_add_ps(sum32, h32); // lane 0 = sum of all four 128-bit lanes
+    __m512 h16   = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); // swap 64-bit halves inside each 128-bit lane
+    __m512 sum8  = _mm512_add_ps(sum16, h16);
+    __m512 h4    = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent floats inside each 128-bit lane
+    __m512 sum4  = _mm512_add_ps(sum8, h4); // element 0 = sum of all sixteen input elements
+    return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
+}
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+{
+    __m512d h64   = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); // 128-bit lanes {2, 3, 2, 3}
+    __m512d sum32 = _mm512_add_pd(a, h64); // lanes 0,1 = lane0+lane2, lane1+lane3
+    __m512d h32   = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); // per 256-bit half: swap its two 128-bit lanes
+    __m512d sum16 = _mm512_add_pd(sum32, h32); // 128-bit lane 0 = sum of all four 128-bit lanes
+    __m512d h16   = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); // swaps the two doubles of the lowest 128-bit lane
+    __m512d sum8  = _mm512_add_pd(sum16, h16); // element 0 = sum of all eight input elements
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
+}
 
 #endif // _NPY_SIMD_AVX512_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -60,6 +60,12 @@
 #define npyv_mul_f32 vmulq_f32
 #define npyv_mul_f64 vmulq_f64
 
+#ifdef NPY_HAVE_NEON_VFPV4
+    #define npyv_muladd_f32(A, B, C) vfmaq_f32(C, A, B)
+#else
+    #define npyv_muladd_f32(A, B, C) vmlaq_f32(C, A, B)
+#endif
+#define npyv_muladd_f64(A, B, C) vfmaq_f64(C, A, B)
 /***************************
  * Division
  ***************************/
@@ -75,4 +81,17 @@
 #endif
 #define npyv_div_f64 vdivq_f64
 
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(float32x4_t a)
+{
+    float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); // r = {a[2]+a[0], a[3]+a[1]}
+    return vget_lane_f32(vpadd_f32(r, r), 0);                    // pairwise add: lane 0 = r[0]+r[1]
+}
+#ifdef __aarch64__
+    NPY_FINLINE double npyv_sum_f64(float64x2_t a)
+    {
+        return vaddvq_f64(a); // a[0] + a[1] via the AArch64 across-vector add; avoids the non-portable `+` on vector types
+    }
+#endif
+
 #endif // _NPY_SIMD_NEON_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -82,6 +82,13 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
 #define npyv_mul_f32 _mm_mul_ps
 #define npyv_mul_f64 _mm_mul_pd
 
+#ifdef NPY_HAVE_FMA3
+    #define npyv_muladd_f32 _mm_fmadd_ps
+    #define npyv_muladd_f64 _mm_fmadd_pd
+#else
+    #define npyv_muladd_f32(a, b, c) npyv_add_f32(npyv_mul_f32(a, b), c)
+    #define npyv_muladd_f64(a, b, c) npyv_add_f64(npyv_mul_f64(a, b), c)
+#endif
 // saturated
 // TODO: after implment Packs intrins
 
@@ -92,4 +99,21 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
 #define npyv_div_f32 _mm_div_ps
 #define npyv_div_f64 _mm_div_pd
 
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m128 a)
+{
+    __m128 t1 = _mm_movehl_ps(a, a);       // {a[2], a[3], a[2], a[3]}
+    __m128 t2 = _mm_add_ps(a, t1);         // lane 0 = a[0]+a[2], lane 1 = a[1]+a[3]
+    __m128 t3 = _mm_shuffle_ps(t2, t2, 1); // lane 0 = t2[1]
+    __m128 t4 = _mm_add_ss(t2, t3);        // lane 0 = sum of all four input elements
+    return _mm_cvtss_f32(t4);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m128d a)
+{
+    __m128  t0 = _mm_castpd_ps(a);                    // reinterpret as floats so _mm_movehl_ps can be used
+    __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0)); // upper double of a moved into lane 0
+    __m128d t2 = _mm_add_sd(a,t1);                    // lane 0 = a[0] + a[1]
+    return _mm_cvtsd_f64(t2);
+}
 #endif // _NPY_SIMD_SSE_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -94,10 +94,25 @@
 #define npyv_mul_f32 vec_mul
 #define npyv_mul_f64 vec_mul
 
+#define npyv_muladd_f32 vec_madd
+#define npyv_muladd_f64 vec_madd
+
 /***************************
  * Division
  ***************************/
 #define npyv_div_f32 vec_div
 #define npyv_div_f64 vec_div
 
+// TODO: Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+{
+    return vec_extract(a, 0) + vec_extract(a, 1) + // scalar adds over the four extracted lanes
+    vec_extract(a, 2) + vec_extract(a, 3);
+}
+
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+{
+    return vec_extract(a, 0) + vec_extract(a, 1); // scalar add of the two extracted lanes
+}
+
 #endif // _NPY_SIMD_VSX_ARITHMETIC_H