-
-
Notifications
You must be signed in to change notification settings - Fork 10.9k
ENH:Umath Replace raw SIMD of unary float point(32-64) with NPYV - g0 #16247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ade6638
34d2672
cc564c0
c811166
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
#ifndef _NPY_CPU_FEATURES_H_ | ||
#define _NPY_CPU_FEATURES_H_ | ||
|
||
#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN | ||
#include <Python.h> // for PyObject | ||
#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am curious why the shift in order is needed, usually There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to suppress warning 'declaration of 'struct timespec*', this compiler warnning raised when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have some more info about this you can link to? |
||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#ifndef NPY_SIMD | ||
#error "Not a standalone header" | ||
#endif | ||
|
||
#ifndef _NPY_SIMD_AVX2_MATH_H | ||
#define _NPY_SIMD_AVX2_MATH_H | ||
/*************************** | ||
* Elementary | ||
***************************/ | ||
// Square root | ||
#define npyv_sqrt_f32 _mm256_sqrt_ps | ||
#define npyv_sqrt_f64 _mm256_sqrt_pd | ||
|
||
// Reciprocal | ||
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) | ||
{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); } | ||
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) | ||
{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); } | ||
|
||
// Absolute | ||
NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) | ||
{ | ||
return _mm256_and_ps( | ||
a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)) | ||
); | ||
} | ||
NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) | ||
{ | ||
return _mm256_and_pd( | ||
a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL)) | ||
); | ||
} | ||
|
||
// Square | ||
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) | ||
{ return _mm256_mul_ps(a, a); } | ||
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) | ||
{ return _mm256_mul_pd(a, a); } | ||
|
||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do the new intrinsics need to be added to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#ifndef NPY_SIMD | ||
#error "Not a standalone header" | ||
#endif | ||
|
||
#ifndef _NPY_SIMD_AVX512_MATH_H | ||
#define _NPY_SIMD_AVX512_MATH_H | ||
|
||
/*************************** | ||
* Elementary | ||
***************************/ | ||
// Square root | ||
#define npyv_sqrt_f32 _mm512_sqrt_ps | ||
#define npyv_sqrt_f64 _mm512_sqrt_pd | ||
|
||
// Reciprocal | ||
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) | ||
{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); } | ||
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) | ||
{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); } | ||
|
||
// Absolute | ||
NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) | ||
{ | ||
#if 0 // def NPY_HAVE_AVX512DQ | ||
return _mm512_range_ps(a, a, 8); | ||
#else | ||
return npyv_and_f32( | ||
a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)) | ||
); | ||
#endif | ||
} | ||
NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) | ||
{ | ||
#if 0 // def NPY_HAVE_AVX512DQ | ||
return _mm512_range_pd(a, a, 8); | ||
#else | ||
return npyv_and_f64( | ||
a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL)) | ||
); | ||
#endif | ||
} | ||
|
||
// Square | ||
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) | ||
{ return _mm512_mul_ps(a, a); } | ||
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) | ||
{ return _mm512_mul_pd(a, a); } | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#ifndef NPY_SIMD | ||
#error "Not a standalone header" | ||
#endif | ||
|
||
#ifndef _NPY_SIMD_NEON_MATH_H | ||
#define _NPY_SIMD_NEON_MATH_H | ||
|
||
/*************************** | ||
* Elementary | ||
***************************/ | ||
// Absolute | ||
#define npyv_abs_f32 vabsq_f32 | ||
#define npyv_abs_f64 vabsq_f64 | ||
|
||
// Square | ||
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) | ||
{ return vmulq_f32(a, a); } | ||
#if NPY_SIMD_F64 | ||
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) | ||
{ return vmulq_f64(a, a); } | ||
#endif | ||
|
||
// Square root | ||
#if NPY_SIMD_F64 | ||
#define npyv_sqrt_f32 vsqrtq_f32 | ||
#define npyv_sqrt_f64 vsqrtq_f64 | ||
#else | ||
// Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI | ||
NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a) | ||
{ | ||
const npyv_f32 zero = vdupq_n_f32(0.0f); | ||
const npyv_u32 pinf = vdupq_n_u32(0x7f800000); | ||
npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf); | ||
// guard agianst floating-point division-by-zero error | ||
npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a); | ||
// estimate to (1/√a) | ||
npyv_f32 rsqrte = vrsqrteq_f32(guard_byz); | ||
/** | ||
* Newton-Raphson iteration: | ||
* x[n+1] = x[n] * (3-d * (x[n]*x[n]) )/2) | ||
* converges to (1/√d)if x0 is the result of VRSQRTE applied to d. | ||
* | ||
* NOTE: at least 3 iterations is needed to improve precision | ||
*/ | ||
rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte); | ||
rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte); | ||
rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte); | ||
// a * (1/√a) | ||
npyv_f32 sqrt = vmulq_f32(a, rsqrte); | ||
// return zero if the a is zero | ||
// - return zero if a is zero. | ||
// - return positive infinity if a is positive infinity | ||
return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt); | ||
} | ||
#endif // NPY_SIMD_F64 | ||
|
||
// Reciprocal | ||
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) | ||
{ | ||
#if NPY_SIMD_F64 | ||
const npyv_f32 one = vdupq_n_f32(1.0f); | ||
return npyv_div_f32(one, a); | ||
#else | ||
npyv_f32 recipe = vrecpeq_f32(a); | ||
/** | ||
* Newton-Raphson iteration: | ||
* x[n+1] = x[n] * (2-d * x[n]) | ||
* converges to (1/d) if x0 is the result of VRECPE applied to d. | ||
* | ||
* NOTE: at least 3 iterations is needed to improve precision | ||
*/ | ||
recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe); | ||
recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe); | ||
recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe); | ||
return recipe; | ||
#endif | ||
} | ||
#if NPY_SIMD_F64 | ||
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) | ||
{ | ||
const npyv_f64 one = vdupq_n_f64(1.0); | ||
return npyv_div_f64(one, a); | ||
} | ||
#endif // NPY_SIMD_F64 | ||
|
||
#endif // _NPY_SIMD_SSE_MATH_H |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This will fix the failing 32-bit wheel build?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change activates the new dispatcher.
32-bit wheel build fails due to aggressive optimization gcc made that doesn't respect zero division,
this issue was exist also on 64-bit when AVX2 and AVX512F aren't enabled.
the fix mainly here:
https://github.com/numpy/numpy/blob/1f872658984b2f8b0fda7022e72ad333a62864f3/numpy/core/src/umath/loops_unary_fp.dispatch.c.src#L129-L133
where partial load intrinsic
npyv_load_till_*
andnpyv_loadn_till_*
guarantee adding "one" to the tail of the vector.I also made a slight change in the last push to generate using NPYV version for overlapped arrays, also to guarantee the same precsion on armhf.
https://github.com/numpy/numpy/blob/1f872658984b2f8b0fda7022e72ad333a62864f3/numpy/core/src/umath/loops_unary_fp.dispatch.c.src#L202-L206