ENH: Improve the performance of einsum by using universal simd by Qiyu8 · Pull Request #17049 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Improve the performance of einsum by using universal simd #17049

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 40 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
e26dcf7
new npyv intrinsics
Qiyu8 Aug 11, 2020
47118fb
einsum dispatch and usimd process
Qiyu8 Aug 11, 2020
ad0b3b4
update
Qiyu8 Aug 11, 2020
55200fc
add float32 benchmark case
Qiyu8 Aug 11, 2020
94cff77
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Aug 12, 2020
4d7d94d
update
Qiyu8 Aug 12, 2020
ae53e35
fix typos
Qiyu8 Aug 12, 2020
2e713b0
add avx512 reduce sum comments
Qiyu8 Aug 13, 2020
5e7cbd1
add non-contiguous arrays, improve reduce the sum
Qiyu8 Aug 20, 2020
80c0ed4
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Aug 24, 2020
9060231
rebase after split for a better review
Qiyu8 Aug 24, 2020
b0375dc
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Aug 25, 2020
1990c13
headers reconstruct
Qiyu8 Aug 25, 2020
7b756af
use for loop replace begin repeat for readability
Qiyu8 Aug 25, 2020
4877e40
add ivdeps and handle header dependency
Qiyu8 Aug 26, 2020
168c6c9
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Aug 26, 2020
954e642
revert to faster simd code
Qiyu8 Aug 27, 2020
50c6b7e
changed to baseline solution
Qiyu8 Aug 28, 2020
23e28c0
remove redundant typedef
Qiyu8 Aug 31, 2020
21f1c0b
update
Qiyu8 Sep 1, 2020
a07455a
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Sep 10, 2020
d298c8e
remove redundant intrinsics
Qiyu8 Sep 10, 2020
6dac52e
add blank lines
Qiyu8 Sep 11, 2020
985e5b2
add format
Qiyu8 Sep 14, 2020
88c2747
Update numpy/core/src/common/simd/avx512/arithmetic.h
Qiyu8 Sep 14, 2020
90026f9
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Sep 15, 2020
54943e0
modify the int to npy_intp
Qiyu8 Sep 15, 2020
e993af2
split benchmark and define common macro
Qiyu8 Sep 18, 2020
38f7382
avx2 test
Qiyu8 Sep 18, 2020
f351665
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Sep 18, 2020
c6c1e30
explain for auto-vectorize part
Qiyu8 Sep 18, 2020
f18ade4
add explanation
Qiyu8 Sep 18, 2020
33b7d2a
remove duplicated message
Qiyu8 Sep 19, 2020
5a692ed
Update benchmarks/benchmarks/bench_linalg.py
Qiyu8 Sep 29, 2020
20d5cda
Update numpy/core/src/multiarray/einsum_sumprod.c.src
Qiyu8 Sep 30, 2020
83734bf
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Oct 9, 2020
f8f7482
Merge branch 'einsum-usimd' of github.com:Qiyu8/numpy into einsum-usimd
Qiyu8 Oct 9, 2020
1889738
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
Qiyu8 Oct 12, 2020
7ff7324
fix typos
Qiyu8 Oct 12, 2020
73f61c3
remove extra test
Qiyu8 Oct 13, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Merge branch 'master' of github.com:numpy/numpy into einsum-usimd
  • Loading branch information
Qiyu8 committed Sep 10, 2020
commit a07455abb94e1b0497ac80c6785ce758192c2d03
45 changes: 45 additions & 0 deletions numpy/core/src/common/simd/avx2/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,49 @@ NPY_FINLINE double npyv_sum_f64(__m256d a)
__m128d sum = _mm_add_pd(lo, hi);
return _mm_cvtsd_f64(sum);
}

/***************************
 * FUSED
 *
 * Four multiply-accumulate flavours, each taking (a, b, c):
 *   muladd  :   a*b  + c
 *   mulsub  :   a*b  - c
 *   nmuladd : -(a*b) + c
 *   nmulsub : -(a*b) - c
 ***************************/
#ifdef NPY_HAVE_FMA3
    // NPY_HAVE_FMA3 is defined: every operation is a plain alias of the
    // matching 256-bit FMA intrinsic.
    // a*b + c
    #define npyv_muladd_f32 _mm256_fmadd_ps
    #define npyv_muladd_f64 _mm256_fmadd_pd
    // a*b - c
    #define npyv_mulsub_f32 _mm256_fmsub_ps
    #define npyv_mulsub_f64 _mm256_fmsub_pd
    // -(a*b) + c
    #define npyv_nmuladd_f32 _mm256_fnmadd_ps
    #define npyv_nmuladd_f64 _mm256_fnmadd_pd
    // -(a*b) - c
    #define npyv_nmulsub_f32 _mm256_fnmsub_ps
    #define npyv_nmulsub_f64 _mm256_fnmsub_pd
#else
    // NPY_HAVE_FMA3 is not defined: build each operation from a separate
    // multiply followed by an add or subtract.

    // a*b + c
    NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 product = npyv_mul_f32(a, b);
        return npyv_add_f32(product, c);
    }
    NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 product = npyv_mul_f64(a, b);
        return npyv_add_f64(product, c);
    }

    // a*b - c
    NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 product = npyv_mul_f32(a, b);
        return npyv_sub_f32(product, c);
    }
    NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 product = npyv_mul_f64(a, b);
        return npyv_sub_f64(product, c);
    }

    // -(a*b) + c, written as c - a*b
    NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 product = npyv_mul_f32(a, b);
        return npyv_sub_f32(c, product);
    }
    NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 product = npyv_mul_f64(a, b);
        return npyv_sub_f64(c, product);
    }

    // -(a*b) - c, written as (-a)*b - c; `a` is negated by XOR-ing it
    // with a vector of -0.0
    NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 sign_mask = npyv_setall_f32(-0.0f);
        npyv_f32 minus_a   = npyv_xor_f32(a, sign_mask);
        npyv_f32 product   = npyv_mul_f32(minus_a, b);
        return npyv_sub_f32(product, c);
    }
    NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 sign_mask = npyv_setall_f64(-0.0);
        npyv_f64 minus_a   = npyv_xor_f64(a, sign_mask);
        npyv_f64 product   = npyv_mul_f64(minus_a, b);
        return npyv_sub_f64(product, c);
    }
#endif // !NPY_HAVE_FMA3
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
16 changes: 16 additions & 0 deletions numpy/core/src/common/simd/avx512/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,20 @@ NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
#endif
}

/***************************
* FUSED
*
* Multiply-accumulate operations taking three vector arguments (a, b, c).
* In this section every name is defined unconditionally as an alias of the
* corresponding _mm512_* fused intrinsic; there is no emulated fallback here.
***************************/
// multiply and add, a*b + c
#define npyv_muladd_f32 _mm512_fmadd_ps
#define npyv_muladd_f64 _mm512_fmadd_pd
// multiply and subtract, a*b - c
#define npyv_mulsub_f32 _mm512_fmsub_ps
#define npyv_mulsub_f64 _mm512_fmsub_pd
// negate multiply and add, -(a*b) + c
#define npyv_nmuladd_f32 _mm512_fnmadd_ps
#define npyv_nmuladd_f64 _mm512_fnmadd_pd
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm512_fnmsub_ps
#define npyv_nmulsub_f64 _mm512_fnmsub_pd

#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
43 changes: 43 additions & 0 deletions numpy/core/src/common/simd/neon/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,47 @@ NPY_FINLINE float npyv_sum_f32(float32x4_t a)
}
#endif

/***************************
 * FUSED F32
 *
 * All four helpers take (a, b, c). The NEON intrinsics used below take the
 * accumulator as their FIRST argument, which is why `c` (or its negation)
 * is passed ahead of `a` and `b`.
 ***************************/
#ifdef NPY_HAVE_NEON_VFPV4 // FMA
    // a*b + c
    NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        return vfmaq_f32(c, a, b);
    }
    // a*b - c, i.e. (-c) + a*b
    NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 minus_c = vnegq_f32(c);
        return vfmaq_f32(minus_c, a, b);
    }
    // -(a*b) + c, i.e. c - a*b
    NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        return vfmsq_f32(c, a, b);
    }
    // -(a*b) - c, i.e. (-c) - a*b
    NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 minus_c = vnegq_f32(c);
        return vfmsq_f32(minus_c, a, b);
    }
#else
    // NPY_HAVE_NEON_VFPV4 is not defined: same formulas, expressed with the
    // vmlaq/vmlsq multiply-accumulate intrinsics instead of vfmaq/vfmsq.

    // a*b + c
    NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        return vmlaq_f32(c, a, b);
    }
    // a*b - c, i.e. (-c) + a*b
    NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 minus_c = vnegq_f32(c);
        return vmlaq_f32(minus_c, a, b);
    }
    // -(a*b) + c, i.e. c - a*b
    NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        return vmlsq_f32(c, a, b);
    }
    // -(a*b) - c, i.e. (-c) - a*b
    NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 minus_c = vnegq_f32(c);
        return vmlsq_f32(minus_c, a, b);
    }
#endif
/***************************
 * FUSED F64
 *
 * Double-precision counterparts of the F32 helpers; compiled only when
 * NPY_SIMD_F64 is non-zero. As with the F32 intrinsics, the accumulator
 * is passed first.
 ***************************/
#if NPY_SIMD_F64
    // a*b + c
    NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        return vfmaq_f64(c, a, b);
    }
    // a*b - c, i.e. (-c) + a*b
    NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 minus_c = vnegq_f64(c);
        return vfmaq_f64(minus_c, a, b);
    }
    // -(a*b) + c, i.e. c - a*b
    NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        return vfmsq_f64(c, a, b);
    }
    // -(a*b) - c, i.e. (-c) - a*b
    NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 minus_c = vnegq_f64(c);
        return vfmsq_f64(minus_c, a, b);
    }
#endif // NPY_SIMD_F64
#endif // _NPY_SIMD_NEON_ARITHMETIC_H
57 changes: 57 additions & 0 deletions numpy/core/src/common/simd/sse/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,61 @@ NPY_FINLINE double npyv_sum_f64(__m128d a)
return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
#endif
}

/***************************
 * FUSED
 *
 * Four multiply-accumulate flavours, each taking (a, b, c):
 *   muladd  :   a*b  + c
 *   mulsub  :   a*b  - c
 *   nmuladd : -(a*b) + c
 *   nmulsub : -(a*b) - c
 ***************************/
#ifdef NPY_HAVE_FMA3
    // FMA3: all four operations alias the 128-bit FMA intrinsics.
    // a*b + c
    #define npyv_muladd_f32 _mm_fmadd_ps
    #define npyv_muladd_f64 _mm_fmadd_pd
    // a*b - c
    #define npyv_mulsub_f32 _mm_fmsub_ps
    #define npyv_mulsub_f64 _mm_fmsub_pd
    // -(a*b) + c
    #define npyv_nmuladd_f32 _mm_fnmadd_ps
    #define npyv_nmuladd_f64 _mm_fnmadd_pd
    // -(a*b) - c
    #define npyv_nmulsub_f32 _mm_fnmsub_ps
    #define npyv_nmulsub_f64 _mm_fnmsub_pd
#elif defined(NPY_HAVE_FMA4)
    // FMA4: only the first three operations are aliased here; nmulsub is
    // supplied by the shared definition further down.
    // a*b + c
    #define npyv_muladd_f32 _mm_macc_ps
    #define npyv_muladd_f64 _mm_macc_pd
    // a*b - c
    #define npyv_mulsub_f32 _mm_msub_ps
    #define npyv_mulsub_f64 _mm_msub_pd
    // -(a*b) + c
    #define npyv_nmuladd_f32 _mm_nmacc_ps
    #define npyv_nmuladd_f64 _mm_nmacc_pd
#else
    // Neither FMA3 nor FMA4: build each operation from a separate multiply
    // followed by an add or subtract.

    // a*b + c
    NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 product = npyv_mul_f32(a, b);
        return npyv_add_f32(product, c);
    }
    NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 product = npyv_mul_f64(a, b);
        return npyv_add_f64(product, c);
    }

    // a*b - c
    NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 product = npyv_mul_f32(a, b);
        return npyv_sub_f32(product, c);
    }
    NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 product = npyv_mul_f64(a, b);
        return npyv_sub_f64(product, c);
    }

    // -(a*b) + c, written as c - a*b
    NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 product = npyv_mul_f32(a, b);
        return npyv_sub_f32(c, product);
    }
    NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 product = npyv_mul_f64(a, b);
        return npyv_sub_f64(c, product);
    }
#endif // NPY_HAVE_FMA3
#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
    // -(a*b) - c, written as (-a)*b - c; `a` is negated by XOR-ing it
    // with a vector of -0.0. Shared by the FMA4 and the no-FMA builds.
    NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
    {
        npyv_f32 sign_mask = npyv_setall_f32(-0.0f);
        npyv_f32 minus_a   = npyv_xor_f32(a, sign_mask);
        npyv_f32 product   = npyv_mul_f32(minus_a, b);
        return npyv_sub_f32(product, c);
    }
    NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
    {
        npyv_f64 sign_mask = npyv_setall_f64(-0.0);
        npyv_f64 minus_a   = npyv_xor_f64(a, sign_mask);
        npyv_f64 product   = npyv_mul_f64(minus_a, b);
        return npyv_sub_f64(product, c);
    }
#endif // !NPY_HAVE_FMA3
#endif // _NPY_SIMD_SSE_ARITHMETIC_H
16 changes: 16 additions & 0 deletions numpy/core/src/common/simd/vsx/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,20 @@ NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
return vec_extract(a, 0) + vec_extract(a, 1);
}

/***************************
* FUSED
*
* Multiply-accumulate operations taking three vector arguments (a, b, c).
* Each name is a plain alias of a vec_* operation; the same vec_* name is
* used for both the f32 and the f64 variant (presumably the VSX operations
* are type-generic -- confirm against the compiler documentation).
***************************/
// multiply and add, a*b + c
#define npyv_muladd_f32 vec_madd
#define npyv_muladd_f64 vec_madd
// multiply and subtract, a*b - c
#define npyv_mulsub_f32 vec_msub
#define npyv_mulsub_f64 vec_msub
// negate multiply and add, -(a*b) + c
// NOTE: the "nm" names are crossed on purpose: vec_nmsub negates the whole
// expression (a*b - c), which yields -(a*b) + c.
#define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c)
#define npyv_nmuladd_f64 vec_nmsub
// negate multiply and subtract, -(a*b) - c
// Likewise vec_nmadd negates (a*b + c), which yields -(a*b) - c.
#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
#define npyv_nmulsub_f64 vec_nmadd

#endif // _NPY_SIMD_VSX_ARITHMETIC_H
You are viewing a condensed version of this merge commit. You can view the full changes here.
0