SIMD: Add sum intrinsics for float/double. by Qiyu8 · Pull Request #17681 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

SIMD: Add sum intrinsics for float/double. #17681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 3, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
improve intrinsics and add sum intrinsic test
  • Loading branch information
Qiyu8 committed Nov 3, 2020
commit 1f0298d62853e5233b0b829b08a11c160f0b6597
10 changes: 10 additions & 0 deletions numpy/core/src/_simd/_simd.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
Expand Down Expand Up @@ -351,6 +352,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif // fused_sup

#if @sum_sup@
SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
#endif // sum_sup

#endif // simd_sup
/**end repeat**/
/***************************
Expand All @@ -370,6 +375,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
Expand Down Expand Up @@ -484,6 +490,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // fused_sup

#if @sum_sup@
SIMD_INTRIN_DEF(sum_@sfx@)
#endif // sum_sup

#endif // simd_sup
/**end repeat**/

Expand Down
5 changes: 4 additions & 1 deletion numpy/core/src/common/simd/avx2/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
#endif // !NPY_HAVE_FMA3
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H

// Horizontal add: Calculates the sum of all vector elements.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please move the intrinsics inside the header guard _NPY_SIMD_AVX2_ARITHMETIC_H,
and do the same for the other SIMD extensions.

NPY_FINLINE float npyv_sum_f32(__m256 a)
Expand All @@ -137,3 +136,7 @@ NPY_FINLINE double npyv_sum_f64(__m256d a)
__m128d sum = _mm_add_pd(lo, hi);
return _mm_cvtsd_f64(sum);
}

#endif // _NPY_SIMD_AVX2_ARITHMETIC_H


4 changes: 2 additions & 2 deletions numpy/core/src/common/simd/avx512/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,6 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_nmulsub_f32 _mm512_fnmsub_ps
#define npyv_nmulsub_f64 _mm512_fnmsub_pd

#endif // _NPY_SIMD_AVX512_ARITHMETIC_H

/***************************
* Reduce Sum
* there are three ways to implement reduce sum for AVX512:
Expand Down Expand Up @@ -173,3 +171,5 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
}
#endif

#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
18 changes: 9 additions & 9 deletions numpy/core/src/common/simd/neon/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,17 +118,17 @@
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vfmsq_f64(vnegq_f64(c), a, b); }
#endif // NPY_SIMD_F64
#endif // _NPY_SIMD_NEON_ARITHMETIC_H

// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(float32x4_t a)
{
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
return vget_lane_f32(vpadd_f32(r, r), 0);
}
#ifdef __aarch64__
NPY_FINLINE double npyv_sum_f64(float64x2_t a)
#if NPY_SIMD_F64
#define npyv_sum_f32 vaddvq_f32
#define npyv_sum_f64 vaddvq_f64
#else
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0);
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
return vget_lane_f32(vpadd_f32(r, r), 0);
}
#endif

#endif // _NPY_SIMD_NEON_ARITHMETIC_H
5 changes: 4 additions & 1 deletion numpy/core/src/common/simd/sse/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
#endif // !NPY_HAVE_FMA3
#endif // _NPY_SIMD_SSE_ARITHMETIC_H

// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(__m128 a)
Expand All @@ -172,3 +171,7 @@ NPY_FINLINE double npyv_sum_f64(__m128d a)
return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
#endif
}

#endif // _NPY_SIMD_SSE_ARITHMETIC_H


8 changes: 4 additions & 4 deletions numpy/core/src/common/simd/vsx/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,16 @@
#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
#define npyv_nmulsub_f64 vec_nmadd

#endif // _NPY_SIMD_VSX_ARITHMETIC_H

// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
return vec_extract(a, 0) + vec_extract(a, 1) +
vec_extract(a, 2) + vec_extract(a, 3);
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
return vec_extract(sum, 0) + vec_extract(sum, 1);
}

NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
{
return vec_extract(a, 0) + vec_extract(a, 1);
}

#endif // _NPY_SIMD_VSX_ARITHMETIC_H
10 changes: 10 additions & 0 deletions numpy/core/tests/test_simd.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,16 @@ def test_arithmetic_div(self):
div = self.div(vdata_a, vdata_b)
assert div == data_div

# Check that the `sum` intrinsic applied to a loaded vector returns the same
# value as Python's built-in sum() over the source data.
# The check is skipped (early return) when self._is_fp() is false; the
# `sum_sup` rows elsewhere in this diff enable the intrinsic only for the
# last two type suffixes (presumably f32/f64 - confirm against the repeat list).
def test_arithmetic_reduce_sum(self):
if not self._is_fp():
return
# reduce sum
data = self._data()
vdata = self.load(data)

# Reference value computed in pure Python from the un-vectorized data.
data_sum = sum(data)
vsum = self.sum(vdata)
# NOTE(review): exact equality assumes the test data sums without rounding
# in the vector's element precision - confirm against _data().
assert vsum == data_sum
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!


int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
fp_sfx = ("f32", "f64")
Expand Down
0