8000 ENH: Improve the performance of einsum by using universal simd by Qiyu8 · Pull Request #17049 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Improve the performance of einsum by using universal simd #17049

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 40 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
e26dcf7
new npyv intrinsics
Qiyu8 Aug 11, 2020
47118fb
einsum dispatch and usimd process
Qiyu8 Aug 11, 2020
ad0b3b4
update
Qiyu8 Aug 11, 2020
55200fc
add float32 benchmark case
Qiyu8 Aug 11, 2020
94cff77
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Aug 12, 2020
4d7d94d
update
Qiyu8 Aug 12, 2020
ae53e35
fix typos
Qiyu8 Aug 12, 2020
2e713b0
add avx512 reduce sum comments
Qiyu8 Aug 13, 2020
5e7cbd1
add non_contigous arrays ,improve reduce the sum
Qiyu8 Aug 20, 2020
80c0ed4
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Aug 24, 2020
9060231
rebase after split for a better review
Qiyu8 Aug 24, 2020
b0375dc
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Aug 25, 2020
1990c13
headers reconstruct
Qiyu8 Aug 25, 2020
7b756af
use for loop replace begin repeat for readability
Qiyu8 Aug 25, 2020
4877e40
add ivdeps and handle header dependency
Qiyu8 Aug 26, 2020
168c6c9
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Aug 26, 2020
954e642
revert to faster simd code
Qiyu8 Aug 27, 2020
50c6b7e
changed to baseline solution
Qiyu8 Aug 28, 2020
23e28c0
remove redundant typedef
Qiyu8 Aug 31, 2020
21f1c0b
update
Qiyu8 Sep 1, 2020
a07455a
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Sep 10, 2020
d298c8e
remove redundant intrinsics
Qiyu8 Sep 10, 2020
6dac52e
add blank lines
Qiyu8 Sep 11, 2020
985e5b2
add format
Qiyu8 Sep 14, 2020
88c2747
Update numpy/core/src/common/simd/avx512/arithmetic.h
Qiyu8 Sep 14, 2020
90026f9
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Sep 15, 2020
54943e0
modify the int to npy_intp
Qiyu8 Sep 15, 2020
e993af2
split benchmark and define common macro
Qiyu8 Sep 18, 2020
38f7382
avx2 test
Qiyu8 Sep 18, 2020
f351665
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Sep 18, 2020
c6c1e30
explain for auto-vectorize part
Qiyu8 Sep 18, 2020
f18ade4
add explantion
Qiyu8 Sep 18, 2020
33b7d2a
remove duplicated message
Qiyu8 Sep 19, 2020
5a692ed
Update benchmarks/benchmarks/bench_linalg.py
Qiyu8 Sep 29, 2020
20d5cda
Update numpy/core/src/multiarray/einsum_sumprod.c.src
Qiyu8 Sep 30, 2020
83734bf
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Oct 9, 2020
f8f7482
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Oct 9, 2020
1889738
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Oct 12, 2020
7ff7324
fix typos
Qiyu8 Oct 12, 2020
73f61c3
remove extra test
Qiyu8 Oct 13, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 59 additions & 12 deletions benchmarks/benchmarks/bench_linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,26 +108,73 @@ def time_numpy_linalg_lstsq_a__b_float64(self):

class Einsum(Benchmark):
    """Benchmarks for np.einsum.

    Each timing method targets one specialized sum-of-products inner
    loop in einsum_sumprod.c (named in the comment above the method),
    parameterized over float32 and float64.
    """
    param_names = ['dtype']
    params = [[np.float32, np.float64]]

    def setup(self, dtype):
        # Contiguous operands of assorted sizes and ranks.
        self.one_dim_small = np.arange(600, dtype=dtype)
        self.one_dim = np.arange(3000, dtype=dtype)
        self.one_dim_big = np.arange(480000, dtype=dtype)
        self.two_dim_small = np.arange(1200, dtype=dtype).reshape(30, 40)
        self.two_dim = np.arange(240000, dtype=dtype).reshape(400, 600)
        self.three_dim_small = np.arange(10000, dtype=dtype).reshape(10, 100, 10)
        self.three_dim = np.arange(24000, dtype=dtype).reshape(20, 30, 40)
        # non_contigous arrays
        # NOTE(review): np.arange with a step still returns a C-contiguous
        # array; exercising true non-contiguous paths would need a sliced
        # view such as np.arange(...)[::2] -- confirm intent.
        self.non_contigous_dim1_small = np.arange(1, 80, 2, dtype=dtype)
        self.non_contigous_dim1 = np.arange(1, 4000, 2, dtype=dtype)
        self.non_contigous_dim2 = np.arange(1, 2400, 2, dtype=dtype).reshape(30, 40)
        self.non_contigous_dim3 = np.arange(1, 48000, 2, dtype=dtype).reshape(20, 30, 40)

    # outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two
    def time_einsum_outer(self, dtype):
        np.einsum("i,j", self.one_dim, self.one_dim, optimize=True)

    # multiply(a, b):trigger sum_of_products_contig_two
    def time_einsum_multiply(self, dtype):
        np.einsum("..., ...", self.two_dim_small, self.three_dim, optimize=True)

    # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two
    def time_einsum_sum_mul(self, dtype):
        np.einsum(",i...->", 300, self.three_dim_small, optimize=True)

    # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two
    def time_einsum_sum_mul2(self, dtype):
        np.einsum("i...,->", self.three_dim_small, 300, optimize=True)

    # scalar mul: trigger sum_of_products_stride0_contig_outcontig_two
    def time_einsum_mul(self, dtype):
        np.einsum("i,->i", self.one_dim_big, 300, optimize=True)

    # trigger contig_contig_outstride0_two
    def time_einsum_contig_contig(self, dtype):
        np.einsum("ji,i->", self.two_dim, self.one_dim_small, optimize=True)

    # trigger sum_of_products_contig_outstride0_one
    def time_einsum_contig_outstride0(self, dtype):
        np.einsum("i->", self.one_dim_big, optimize=True)

    # outer(a,b): non_contigous arrays
    def time_einsum_noncon_outer(self, dtype):
        np.einsum("i,j", self.non_contigous_dim1, self.non_contigous_dim1, optimize=True)

    # multiply(a, b):non_contigous arrays
    def time_einsum_noncon_multiply(self, dtype):
        np.einsum("..., ...", self.non_contigous_dim2, self.non_contigous_dim3, optimize=True)

    # sum and multiply:non_contigous arrays
    def time_einsum_noncon_sum_mul(self, dtype):
        np.einsum(",i...->", 300, self.non_contigous_dim3, optimize=True)

    # sum and multiply:non_contigous arrays
    def time_einsum_noncon_sum_mul2(self, dtype):
        np.einsum("i...,->", self.non_contigous_dim3, 300, optimize=True)

    # scalar mul: non_contigous arrays
    def time_einsum_noncon_mul(self, dtype):
        np.einsum("i,->i", self.non_contigous_dim1, 300, optimize=True)

    # contig_contig_outstride0_two: non_contigous arrays
    def time_einsum_noncon_contig_contig(self, dtype):
        np.einsum("ji,i->", self.non_contigous_dim2, self.non_contigous_dim1_small, optimize=True)

    # sum_of_products_contig_outstride0_one:non_contigous arrays
    def time_einsum_noncon_contig_outstride0(self, dtype):
        np.einsum("i->", self.non_contigous_dim1, optimize=True)
12 changes: 6 additions & 6 deletions numpy/core/src/common/npy_cpu_dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* NumPy module's attributes `__cpu_baseline__` and `__cpu_dispaٍtch__`.
*/
/**
* Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION',
* Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION',
* due to the nature of the command argument '--disable-optimization',
* which is explicitly disabling the module ccompiler_opt.
*/
Expand All @@ -29,7 +29,7 @@
* It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead,
* since c99 supports bool variables which may lead to ambiguous errors.
*/
// backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token.
// backup 'bool' before including '_cpu_dispatch.h', since it may not be defined as a compiler token.
#define NPY__DISPATCH_DEFBOOL
typedef bool npy__dispatch_bkbool;
#endif
Expand Down Expand Up @@ -134,10 +134,10 @@
* NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*))
* NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE])
*
* By assuming the provided config header drived from a dispatch-able source,
* By assuming the provided config header derived from a dispatch-able source,
* that configured with "@targets baseline sse41 vsx3 asimdhp",
* they supported by the compiler and enabled via '--cpu-dspatch',
* then the prototype declrations at the above example will equlivent to the follows:
* then the prototype declarations in the above example will be equivalent to the following:
*
* - x86:
* void dispatch_me(const int*, int*); // baseline
Expand Down Expand Up @@ -179,7 +179,7 @@
/**
* Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...)
*
* Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even
* Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declaration even
* if it was provided within the configuration statements.
*/
#define NPY_CPU_DISPATCH_DECLARE_XB(...) \
Expand All @@ -206,7 +206,7 @@
* In order to call or to assign the pointer of it from outside the dispatch-able source,
* you have to use this Macro as follows:
*
* // bring the genreated config header of the dispatch-abel source
* // bring the generated config header of the dispatch-able source
* #ifndef NPY_DISABLE_OPTIMIZATION
* #include "dispatchable_source_name.dispatch.h"
* #endif
Expand Down
20 changes: 20 additions & 0 deletions numpy/core/src/common/simd/avx2/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,26 @@
#define npyv_div_f32 _mm256_div_ps
#define npyv_div_f64 _mm256_div_pd

// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(__m256 a)
{
// Two hadd passes: each 128-bit lane now holds its own 4-element sum
// replicated across all positions.
__m256 sum_halves = _mm256_hadd_ps(a, a);
sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
// Add the two per-lane sums together and extract the scalar total.
__m128 lo = _mm256_castps256_ps128(sum_halves);
__m128 hi = _mm256_extractf128_ps(sum_halves, 1);
__m128 sum = _mm_add_ps(lo, hi);
return _mm_cvtss_f32(sum);
}

// Horizontal add: Calculates the sum of the two doubles in each 128-bit lane,
// then adds the lanes together and extracts the scalar total.
NPY_FINLINE double npyv_sum_f64(__m256d a)
{
// hadd produces the pair-sum within each 128-bit lane.
__m256d sum_halves = _mm256_hadd_pd(a, a);
__m128d lo = _mm256_castpd256_pd128(sum_halves);
__m128d hi = _mm256_extractf128_pd(sum_halves, 1);
__m128d sum = _mm_add_pd(lo, hi);
return _mm_cvtsd_f64(sum);
}

/***************************
* FUSED
***************************/
Expand Down
43 changes: 43 additions & 0 deletions numpy/core/src/common/simd/avx512/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,49 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_div_f32 _mm512_div_ps
#define npyv_div_f64 _mm512_div_pd

/***************************
* Reduce Sum
* there are three ways to implement reduce sum for AVX512:
* 1- split(256) /add /split(128) /add /hadd /hadd /extract
* 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
* 3- _mm512_reduce_add_ps/pd
* The first one has been widely used by many projects
*
* the second one is used by Intel Compiler, maybe because the
* latency of hadd increased by (2-3) starting from Skylake-X which makes two
* extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more info.
*
* The third one is almost the same as the second one but only works for
* intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
***************************/
#ifdef NPY_HAVE_AVX512F_REDUCE
#define npyv_sum_f32 _mm512_reduce_add_ps
#define npyv_sum_f64 _mm512_reduce_add_pd
#else
// Shuffle/add log2 reduction (option 2 from the comment above): each step
// adds a cross- or in-lane shuffled copy onto the running sum, halving the
// number of distinct partial sums, until element 0 holds the total of all
// 16 floats.
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
// fold the upper 256 bits onto the lower 256 bits
__m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
__m512 sum32 = _mm512_add_ps(a, h64);
// fold across the remaining 128-bit lanes
__m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2));
__m512 sum16 = _mm512_add_ps(sum32, h32);
// in-lane folds: 64-bit halves, then adjacent elements
__m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));
__m512 sum8 = _mm512_add_ps(sum16, h16);
__m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));
__m512 sum4 = _mm512_add_ps(sum8, h4);
// element 0 now holds the full sum
return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
}
// Same shuffle/add reduction as npyv_sum_f32 above, for 8 doubles:
// three folding steps leave the total in element 0.
NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
{
// fold the upper 256 bits onto the lower 256 bits
__m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
__m512d sum32 = _mm512_add_pd(a, h64);
// fold across the remaining 128-bit lanes
__m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2));
__m512d sum16 = _mm512_add_pd(sum32, h32);
// fold the last adjacent pair
__m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1));
__m512d sum8 = _mm512_add_pd(sum16, h16);
return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
}
#endif

/***************************
* FUSED
***************************/
Expand Down
13 changes: 13 additions & 0 deletions numpy/core/src/common/simd/neon/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,19 @@
#endif
#define npyv_div_f64 vdivq_f64

// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(float32x4_t a)
{
// Add the high and low 64-bit halves, then a pairwise add collapses the
// remaining two lanes; lane 0 holds the total.
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
return vget_lane_f32(vpadd_f32(r, r), 0);
}
#ifdef __aarch64__
// AArch64 only (guarded by the surrounding #ifdef __aarch64__): add the two
// lanes directly via the compiler vector-extension '+' and extract lane 0.
NPY_FINLINE double npyv_sum_f64(float64x2_t a)
{
return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0);
}
#endif

/***************************
* FUSED F32
***************************/
Expand Down
25 changes: 25 additions & 0 deletions numpy/core/src/common/simd/sse/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,31 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// TODO: emulate integer division
#define npyv_div_f32 _mm_div_ps
#define npyv_div_f64 _mm_div_pd

// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(__m128 a)
{
#ifdef NPY_HAVE_SSE3
// SSE3: two horizontal adds collapse the four elements into element 0.
__m128 sum_halves = _mm_hadd_ps(a, a);
return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves));
#else
// SSE2 fallback: fold the high pair onto the low pair, then add the two
// remaining lanes with a shuffle + scalar add.
__m128 t1 = _mm_movehl_ps(a, a);
__m128 t2 = _mm_add_ps(a, t1);
__m128 t3 = _mm_shuffle_ps(t2, t2, 1);
__m128 t4 = _mm_add_ss(t2, t3);
return _mm_cvtss_f32(t4);
#endif
}

// Horizontal add: Calculates the sum of the two doubles in the vector.
NPY_FINLINE double npyv_sum_f64(__m128d a)
{
#ifdef NPY_HAVE_SSE3
// SSE3: one horizontal add yields the pair sum in element 0.
return _mm_cvtsd_f64(_mm_hadd_pd(a, a));
#else
// SSE2 fallback: broadcast the high lane down and add.
return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
#endif
}

/***************************
* FUSED
***************************/
Expand Down
12 changes: 12 additions & 0 deletions numpy/core/src/common/simd/vsx/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,18 @@
#define npyv_div_f32 vec_div
#define npyv_div_f64 vec_div

// Horizontal add: Calculates the sum of all vector elements.
// Horizontal add: accumulate the four lanes left to right into a scalar.
// (Same left-to-right addition order as a single chained expression, so the
// floating-point result is bit-identical.)
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
    float total = vec_extract(a, 0);
    total += vec_extract(a, 1);
    total += vec_extract(a, 2);
    total += vec_extract(a, 3);
    return total;
}

// Horizontal add: sum the two lanes of the double vector into a scalar.
NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
{
    double total = vec_extract(a, 0);
    total += vec_extract(a, 1);
    return total;
}

/***************************
* FUSED
***************************/
Expand Down
Loading
0