8000 ENH: Use AVX for float32 implementation of np.sin & np.cos by r-devulap · Pull Request #13368 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Use AVX for float32 implementation of np.sin & np.cos #13368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Aug 18, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
BUG: AVX2 impl of sin/cos requires an FMA
Without an FMA, the output of AVX2 and AVX512 version differ. This
changes ensures the output across implementations remains exactly the
same.
  • Loading branch information
r-devulap committed Aug 3, 2019
commit cd9f1a87d6f200b3ccb4be7dc646b4832b321ba9
8 changes: 4 additions & 4 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ def english_upper(s):
docstrings.get('numpy.core.umath.cos'),
None,
TD('e', f='cos', astype={'e':'f'}),
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='cos', astype={'e':'f'}),
TD(P, f='cos'),
),
Expand All @@ -672,7 +672,7 @@ def english_upper(s):
docstrings.get('numpy.core.umath.sin'),
None,
TD('e', f='sin', astype={'e':'f'}),
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='sin', astype={'e':'f'}),
TD(P, f='sin'),
),
Expand Down Expand Up @@ -709,7 +709,7 @@ def english_upper(s):
docstrings.get('numpy.core.umath.exp'),
None,
TD('e', f='exp', astype={'e':'f'}),
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='exp', astype={'e':'f'}),
TD(P, f='exp'),
),
Expand All @@ -732,7 +732,7 @@ def english_upper(s):
docstrings.get('numpy.core.umath.log'),
None,
TD('e', f='log', astype={'e':'f'}),
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
TD(inexact, f='log', astype={'e':'f'}),
TD(P, f='log'),
),
Expand Down
8 changes: 6 additions & 2 deletions numpy/core/include/numpy/npy_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,14 @@
#else
#define NPY_GCC_TARGET_AVX
#endif

#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
#define HAVE_ATTRIBUTE_TARGET_FMA
#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
#endif

#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
#else
#define NPY_GCC_TARGET_AVX2
#endif
Expand Down
5 changes: 3 additions & 2 deletions numpy/core/setup_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,10 @@ def check_api_version(apiversion, codegen_dir):
# gcc 4.8.4 support attributes but not with intrisics
# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
# function name will be converted to HAVE_<upper-case-name> preprocessor macro
OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
'attribute_target_avx2_with_intrinsics',
'__m256 temp = _mm256_set1_ps(1.0)',
'__m256 temp = _mm256_set1_ps(1.0); temp = \
_mm256_fmadd_ps(temp, temp, temp)',
'immintrin.h'),
('__attribute__((target("avx512f")))',
'attribute_target_avx512f_with_intrinsics',
Expand Down
22 changes: 22 additions & 0 deletions numpy/core/src/umath/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,25 @@ int os_avx512_support(void)
#endif
}

static NPY_INLINE
int cpu_supports_fma(void)
{
unsigned int feature = 0x01;
unsigned int a, b, c, d;
#ifdef __x86_64__
__asm__ volatile (
"cpuid" "\n\t"
: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
: "a" (feature));
/*
* FMA is the 12th bit of ECX
*/
return (c >> 12) & 1;
#else
return 0;
#endif
}

/*
* Primitive cpu feature detect function
* Currently only supports checking for avx on gcc compatible compilers.
Expand All @@ -63,6 +82,9 @@ npy_cpu_supports(const char * feature)
return 0;
#endif
}
else if (strcmp(feature, "fma") == 0) {
return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
}
else if (strcmp(feature, "avx2") == 0) {
return __builtin_cpu_supports("avx2") && os_avx_support();
}
Expand Down
4 changes: 2 additions & 2 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -1610,8 +1610,8 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
/**end repeat**/

/**begin repeat
* #isa = avx512f, avx2#
* #ISA = AVX512F, AVX2#
* #isa = avx512f, fma#
* #ISA = AVX512F, FMA#
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
*/

Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/umath/loops.h.src
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ NPY_NO_EXPORT void
FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));

/**begin repeat1
* #isa = avx512f, avx2#
* #isa = avx512f, fma#
*/

NPY_NO_EXPORT void
Expand Down
79 changes: 38 additions & 41 deletions numpy/core/src/umath/simd.inc.src
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,9 @@ abs_ptrdiff(char *a, char *b)
*/

/**begin repeat
* #ISA = AVX2, AVX512F#
* #isa = avx2, avx512f#
* #ISA = FMA, AVX512F#
* #isa = fma, avx512f#
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
* #REGISTER_SIZE = 32, 64#
*/

Expand All @@ -141,15 +142,15 @@ abs_ptrdiff(char *a, char *b)
* #func = exp, log#
*/

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
@ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
return 1;
Expand All @@ -162,15 +163,15 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)

/**end repeat1**/

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
#endif

static NPY_INLINE int
run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
{
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
@ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
return 1;
Expand Down Expand Up @@ -1142,69 +1143,63 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
/* bunch of helper functions used in ISA_exp/log_FLOAT*/

#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_fmadd(__m256 a, __m256 b, __m256 c)
{
return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_get_full_load_mask(void)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_full_load_mask(void)
{
return _mm256_set1_ps(-1.0);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float* addr = maskint + total_elem - num_lanes;
return _mm256_loadu_ps(addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_masked_gather(__m256 src,
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_gather(__m256 src,
npy_float* addr,
__m256i vindex,
__m256 mask)
{
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_masked_load(__m256 mask, npy_float* addr)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_load(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_blend(__m256 x, __m256 y, __m256 ymask)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
return _mm256_blendv_ps(x, y, ymask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
{
return _mm256_cvtepi32_ps(
_mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_should_negate(__m256i k, __m256i andop, __m256i cmp)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
{
return avx2_should_calculate_sine(k, andop, cmp);
return fma_should_calculate_sine(k, andop, cmp);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_get_exponent(__m256 x)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_exponent(__m256 x)
{
/*
* Special handling of denormals:
Expand All @@ -1230,8 +1225,8 @@ avx2_get_exponent(__m256 x)
return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_get_mantissa(__m256 x)
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_mantissa(__m256 x)
{
/*
* Special handling of denormals:
Expand Down Expand Up @@ -1369,17 +1364,18 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
#endif

/**begin repeat
* #ISA = AVX2, AVX512F#
* #isa = avx2, avx512#
* #ISA = FMA, AVX512F#
* #isa = fma, avx512#
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #or = or_ps, kor#
* #vsub = , _mask#
* #mask = __m256, __mmask16#
* #fmadd = avx2_fmadd,_mm512_fmadd_ps#
* #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
**/

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
#if defined @CHK@

/*
* Vectorized Cody-Waite range reduction technique
Expand Down Expand Up @@ -1446,8 +1442,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
/**end repeat**/

/**begin repeat
* #ISA = AVX2, AVX512F#
* #isa = avx2, avx512#
* #ISA = FMA, AVX512F#
* #isa = fma, avx512#
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #BYTES = 32, 64#
Expand All @@ -1456,11 +1452,12 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* #or_masks =_mm256_or_ps, _mm512_kor#
* #and_masks =_mm256_and_ps, _mm512_kand#
* #xor_masks =_mm256_xor_ps, _mm512_kxor#
* #fmadd = avx2_fmadd,_mm512_fmadd_ps#
* #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
* #mask_to_int = _mm256_movemask_ps, #
* #full_mask= 0xFF, 0xFFFF#
* #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
* #cvtps_epi32 = _mm256_cvtps_epi32, #
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
*/


Expand All @@ -1485,7 +1482,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* performance.
*/

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
#if defined @CHK@
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_sincos_FLOAT(npy_float * op,
npy_float * ip,
Expand Down
0