8000 ENH:Umath Replace raw SIMD of unary float point(32-64) with NPYV - g0 by seiko2plus · Pull Request #16247 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH:Umath Replace raw SIMD of unary float point(32-64) with NPYV - g0 #16247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 10, 2020
8000
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
8000
Diff view
8 changes: 4 additions & 4 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,14 +352,14 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will fix the failing 32-bit wheel build?

Copy link
Member Author
@seiko2plus seiko2plus Oct 31, 2020
6293

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change activates the new dispatcher.
32-bit wheel build fails due to aggressive optimization gcc made that doesn't respect zero division,
this issue was exist also on 64-bit when AVX2 and AVX512F aren't enabled.

the fix mainly here:
https://github.com/numpy/numpy/blob/1f872658984b2f8b0fda7022e72ad333a62864f3/numpy/core/src/umath/loops_unary_fp.dispatch.c.src#L129-L133

where partial load intrinsic npyv_load_till_* and npyv_loadn_till_* guarantee adding "one" to the tail of the vector.

I also made a slight change in the last push to generate using NPYV version for overlapped arrays, also to guarantee the same precsion on armhf.

https://github.com/numpy/numpy/blob/1f872658984b2f8b0fda7022e72ad333a62864f3/numpy/core/src/umath/loops_unary_fp.dispatch.c.src#L202-L206

TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
Expand Down Expand Up @@ -389,7 +389,7 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
Expand Down Expand Up @@ -767,7 +767,7 @@ def english_upper(s):
docstrings.get('numpy.core.umath.sqrt'),
None,
TD('e', f='sqrt', astype={'e':'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
),
Expand Down
6 changes: 3 additions & 3 deletions numpy/core/include/numpy/npy_common.h
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#ifndef _NPY_COMMON_H_
#define _NPY_COMMON_H_

/* need Python.h for npy_intp, npy_uintp */
#include <Python.h>

/* numpconfig.h is auto-generated */
#include "numpyconfig.h"
#ifdef HAVE_NPY_CONFIG_H
#include <npy_config.h>
#endif

/* need Python.h for npy_intp, npy_uintp */
#include <Python.h>

/*
* using static inline modifiers when defining npy_math functions
* allows the compiler to make optimizations when possible
Expand Down
6 changes: 2 additions & 4 deletions numpy/core/include/numpy/npy_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
extern "C" {
#endif

#include <numpy/npy_common.h>

#include <math.h>
#ifdef __SUNPRO_CC
#include <sunmath.h>
#endif
#ifdef HAVE_NPY_CONFIG_H
#include <npy_config.h>
#endif
#include <numpy/npy_common.h>

/* By adding static inline specifiers to npy_math function definitions when
appropriate, compiler is given the opportunity to optimize */
Expand Down
1 change: 1 addition & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,7 @@ def generate_umath_c(ext, build_dir):
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
Expand Down
24 changes: 24 additions & 0 deletions numpy/core/src/_simd/_simd.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
Expand Down Expand Up @@ -356,6 +357,17 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
#endif // sum_sup

/***************************
* Math
***************************/
#if @fp_only@
/**begin repeat1
* #intrin = sqrt, recip, abs, square#
*/
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif

#endif // simd_sup
/**end repeat**/
/***************************
Expand All @@ -371,6 +383,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
Expand Down Expand Up @@ -494,6 +507,17 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(sum_@sfx@)
#endif // sum_sup

/***************************
* Math
***************************/
#if @fp_only@
/**begin repeat1
* #intrin = sqrt, recip, abs, square#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif

#endif // simd_sup
/**end repeat**/

Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/common/npy_cpu_features.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#ifndef _NPY_CPU_FEATURES_H_
#define _NPY_CPU_FEATURES_H_

#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
#include <Python.h> // for PyObject
#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am curious why the shift in order is needed, usually Python.h comes first

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to suppress warning 'declaration of 'struct timespec*', this compiler warnning raised when math.h get included before
Python.h.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have some more info about this you can link to?


#ifdef __cplusplus
extern "C" {
Expand Down
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx2/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "math.h"
40 changes: 40 additions & 0 deletions numpy/core/src/common/simd/avx2/math.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_AVX2_MATH_H
#define _NPY_SIMD_AVX2_MATH_H
/***************************
* Elementary
***************************/
// Square root
#define npyv_sqrt_f32 _mm256_sqrt_ps
#define npyv_sqrt_f64 _mm256_sqrt_pd

// Reciprocal
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); }
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); }

// Absolute
NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
{
return _mm256_and_ps(
a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))
);
}
NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
{
return _mm256_and_pd(
a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL))
);
}

// Square
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
{ return _mm256_mul_ps(a, a); }
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
{ return _mm256_mul_pd(a, a); }

#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do the new intrinsics need to be added to the _simd module explicitly or is it automatic?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx512/avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "math.h"
49 changes: 49 additions & 0 deletions numpy/core/src/common/simd/avx512/math.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_AVX512_MATH_H
#define _NPY_SIMD_AVX512_MATH_H

/***************************
* Elementary
***************************/
// Square root
#define npyv_sqrt_f32 _mm512_sqrt_ps
#define npyv_sqrt_f64 _mm512_sqrt_pd

// Reciprocal
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); }
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); }

// Absolute
NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
{
#if 0 // def NPY_HAVE_AVX512DQ
return _mm512_range_ps(a, a, 8);
#else
return npyv_and_f32(
a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))
);
#endif
}
NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
{
#if 0 // def NPY_HAVE_AVX512DQ
return _mm512_range_pd(a, a, 8);
#else
return npyv_and_f64(
a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
);
#endif
}

// Square
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
{ return _mm512_mul_ps(a, a); }
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
{ return _mm512_mul_pd(a, a); }

#endif
22 changes: 17 additions & 5 deletions numpy/core/src/common/simd/neon/arithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,26 @@
/***************************
* Division
***************************/
#ifdef __aarch64__
#if NPY_SIMD_F64
#define npyv_div_f32 vdivq_f32
#else
NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b)
{
float32x4_t recip = vrecpeq_f32(b);
recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
return vmulq_f32(a, recip);
// Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
// estimate to 1/b
npyv_f32 recipe = vrecpeq_f32(b);
/**
* Newton-Raphson iteration:
* x[n+1] = x[n] * (2-d * x[n])
* converges to (1/d) if x0 is the result of VRECPE applied to d.
*
* NOTE: at least 3 iterations is needed to improve precision
*/
recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
// a/b = a*recip(b)
return vmulq_f32(a, recipe);
}
#endif
#define npyv_div_f64 vdivq_f64
Expand Down
86 changes: 86 additions & 0 deletions numpy/core/src/common/simd/neon/math.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_MATH_H
#define _NPY_SIMD_NEON_MATH_H

/***************************
* Elementary
***************************/
// Absolute
#define npyv_abs_f32 vabsq_f32
#define npyv_abs_f64 vabsq_f64

// Square
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
{ return vmulq_f32(a, a); }
#if NPY_SIMD_F64
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
{ return vmulq_f64(a, a); }
#endif

// Square root
#if NPY_SIMD_F64
#define npyv_sqrt_f32 vsqrtq_f32
#define npyv_sqrt_f64 vsqrtq_f64
#else
// Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a)
{
const npyv_f32 zero = vdupq_n_f32(0.0f);
const npyv_u32 pinf = vdupq_n_u32(0x7f800000);
npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf);
// guard agianst floating-point division-by-zero error
npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a);
// estimate to (1/√a)
npyv_f32 rsqrte = vrsqrteq_f32(guard_byz);
/**
* Newton-Raphson iteration:
* x[n+1] = x[n] * (3-d * (x[n]*x[n]) )/2)
* converges to (1/√d)if x0 is the result of VRSQRTE applied to d.
*
* NOTE: at least 3 iterations is needed to improve precision
*/
rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
// a * (1/√a)
npyv_f32 sqrt = vmulq_f32(a, rsqrte);
// return zero if the a is zero
// - return zero if a is zero.
// - return positive infinity if a is positive infinity
return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt);
}
#endif // NPY_SIMD_F64

// Reciprocal
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
{
#if NPY_SIMD_F64
const npyv_f32 one = vdupq_n_f32(1.0f);
return npyv_div_f32(one, a);
#else
npyv_f32 recipe = vrecpeq_f32(a);
/**
* Newton-Raphson iteration:
* x[n+1] = x[n] * (2-d * x[n])
* converges to (1/d) if x0 is the result of VRECPE applied to d.
*
* NOTE: at least 3 iterations is needed to improve precision
*/
recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
return recipe;
#endif
}
#if NPY_SIMD_F64
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
{
const npyv_f64 one = vdupq_n_f64(1.0);
return npyv_div_f64(one, a);
}
#endif // NPY_SIMD_F64

#endif // _NPY_SIMD_SSE_MATH_H
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/neon/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "math.h"
Loading
0