From bd2c82bf141852b4737da297c081e5f621604317 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 20 Mar 2019 15:12:50 -0700
Subject: [PATCH 01/11] ENH: Use AVX for float32 implementation of np.sin &
 np.cos

This commit implements vectorized single precision sine and cosine using
AVX2 and AVX512. Both sine and cosine are computed using a
polynomial approximation which are accurate for values between
[-PI/4,PI/4]. The original input is reduced to this range using a 3-step
Cody-Waite's range reduction method. This method is only accurate for
values between [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
117435.992f] for sine. The algorithm identifies elements outside this
range and calls glibc in a scalar loop to compute their output.

The algorithm is a vectorized version of the methods presented
here: https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751

Accuracy: maximum ULP error = 1.49

Performance: The speed-up this implementation provides is dependent on
the values of the input array. It performs best when all the input
values are within the range specified above. Details of the performance
boost are provided below. Its worst performance is when all the array
elements are outside the range leading to about 1-2% reduction in
performance.

Three different benchmarking data are provided, each of which was benchmarked
using timeit package in python. Each function is executed 1000 times and
this is repeated 100 times. The standard deviation for all the runs was
less than 2% of their mean value and hence not included in the data.

(1) Micro-bencharking:
Array size = 10000, Command = "%timeit np.cos([myarr])"
|---------------+------------+--------+---------+----------+----------|
| Function Name | NumPy 1.16 | AVX2   | AVX512  | AVX2     | AVX512   |
|               |            |        |         | speed up | speed up |
|---------------+------------+--------+---------+----------+----------|
| np.cos        | 1.5174     | 0.1553 | 0.06634 | 9.77     | 22.87    |
| np.sin        | 1.4738     | 0.1517 | 0.06406 | 9.71     | 23.00    |
|---------------+------------+--------+---------+----------+----------|

(2) Package ai.cs provides an API to transform spherical coordinates to
cartesean system:
Array size = 10000, Command = "%timeit ai.cs.sp2cart(r,theta,phi)"
|---------------+------------+--------+--------+----------+----------|
| Function Name | NumPy 1.16 | AVX2   | AVX512 | AVX2     | AVX512   |
|               |            |        |        | speed up | speed up |
|---------------+------------+--------+--------+----------+----------|
| ai.cs.sp2cart | 0.6371     | 0.1066 | 0.0605 | 5.97     | 10.53    |
|---------------+------------+--------+--------+----------+----------|

(3) Package photutils provides an API to find the best fit of first and
second harmonic functions to a set of (angle, intensity) pairs:
Array size = 1000, Command = "%timeit fit_first_and_second_harmonics(E, data)"
|--------------------------------+------------+--------+--------+----------+----------|
| Function Name                  | NumPy 1.16 | AVX2   | AVX512 | AVX2     | AVX512   |
|                                |            |        |        | speed up | speed up |
|--------------------------------+------------+--------+--------+----------+----------|
| fit_first_and_second_harmonics | 1.598      | 0.8709 | 0.7761 | 1.83     | 2.05     |
|--------------------------------+------------+--------+--------+----------+----------|
---
 numpy/core/code_generators/generate_umath.py |   2 +
 numpy/core/include/numpy/npy_math.h          |  17 +-
 numpy/core/src/umath/loops.c.src             |  34 ++-
 numpy/core/src/umath/loops.h.src             |   2 +-
 numpy/core/src/umath/simd.inc.src            | 216 ++++++++++++++++++-
 5 files changed, 266 insertions(+), 5 deletions(-)

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1747272820..896e03278c12 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -662,6 +662,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cos'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='cos', astype={'e':'f'}),
           TD(P, f='cos'),
           ),
@@ -669,6 +670,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sin'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='sin', astype={'e':'f'}),
           TD(P, f='sin'),
           ),
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 6a78ff3c2b54..7831dd3d7190 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -144,7 +144,22 @@ NPY_INLINE static float __npy_nzerof(void)
 #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
 #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
 #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-
+/*
+ * Constants used in vector implementation of sinf/cosf(x)
+ */
+#define NPY_TWO_O_PIf 0x1.45f306p-1f
+#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
+#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
+#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
+#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
+#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
+#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
+#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
+#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
+#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
+#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
+#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
+#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
 /*
  * Integer functions.
  */
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 1a488513330c..bc7e075cb524 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1594,8 +1594,8 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
- *  #func = exp, log#
- *  #scalarf = npy_expf, npy_logf#
+ *  #func = sin, cos, exp, log#
+ *  #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
  */
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -1642,6 +1642,36 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 
 /**end repeat1**/
+
+/**begin repeat1
+ *  #func = cos, sin#
+ *  #scalarf = npy_cosf, npy_sinf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    char str[] = "@func@";
+    @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], str);
+#else
+    /*
+     * This is the path it would take if ISA was runtime detected, but not
+     * compiled for. It fixes the error on clang6.0 which fails to compile
+     * AVX512F version. Not sure if I like this idea, if during runtime it
+     * detects AXV512F, it will end up running the scalar version instead
+     * of AVX2.
+     */
+    UNARY_LOOP {
+	const npy_float in1 = *(npy_float *)ip1;
+	*(npy_float *)op1 = @scalarf@(in1);
+    }
+#endif
+}
+
+/**end repeat1**/
+
+
 /**end repeat**/
 
 /**begin repeat
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 7f05a693a0c8..accd0eab6e74 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -178,7 +178,7 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
- *  #func = exp, log#
+ *  #func = sin, cos, exp, log#
  */
 NPY_NO_EXPORT void
 FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 9816a1da446a..58ec48447769 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -162,6 +162,11 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 /**end repeat1**/
 
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_int n, char*);
+#endif
+
 /**end repeat**/
 
 
@@ -1171,6 +1176,19 @@ avx2_blend(__m256 x, __m256 y, __m256 ymask)
     return _mm256_blendv_ps(x, y, ymask);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
+{
+   return _mm256_cvtepi32_ps(
+                _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_should_negate(__m256i k, __m256i andop, __m256i cmp)
+{
+    return avx2_should_calculate_sine(k, andop, cmp);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
 avx2_get_exponent(__m256 x)
 {
@@ -1305,6 +1323,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
     return _mm512_mask_mov_ps(x, ymask, y);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
+{
+    return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
+{
+    return avx512_should_calculate_sine(k, andop, cmp);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_get_exponent(__m512 x)
 {
@@ -1336,6 +1366,16 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
  **/
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
+ */
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 @isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
 {
@@ -1344,6 +1384,50 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
     reduced_x = @fmadd@(y, c3, reduced_x);
     return reduced_x;
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
+@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin)
+{
+    @mask@ m1 = _mm@vsize@_cmp_ps@vsub@(
+                                x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ);
+    @mask@ m2 = _mm@vsize@_cmp_ps@vsub@(
+                                x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ);
+    return _mm@vsize@_@or@(m1,m2);
+}
+
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
+                                                @vtype@ invf2, @vtype@ invf0)
+{
+    @vtype@ cos = @fmadd@(invf8, x2, invf6);
+    cos = @fmadd@(cos, x2, invf4);
+    cos = @fmadd@(cos, x2, invf2);
+    cos = @fmadd@(cos, x2, invf0);
+    return cos;
+}
+
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
+                                          @vtype@ invf5, @vtype@ invf3,
+                                          @vtype@ zero)
+{
+    @vtype@ sin = @fmadd@(invf9, x2, invf7);
+    sin = @fmadd@(sin, x2, invf5);
+    sin = @fmadd@(sin, x2, invf3);
+    sin = @fmadd@(sin, x2, zero);
+    sin = @fmadd@(sin, x, x);
+    return sin;
+}
 #endif
 /**end repeat**/
 
@@ -1366,6 +1450,137 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  */
 
 
+/*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in ZMM/YMM registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction peforms poorly
+ * leading to catastrophic cancellation. We compute cosine by calling glibc in
+ * a scalar fashion.
+ * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
+ * 5-7x faster than scalar implementations when magnitude of all elements in
+ * the array < 71476.0625f (117435.992f for sine). Worst case performance is
+ * when all the elements are large leading to about 1-2% reduction in
+ * performance.
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size,
+                                                   char* operation)
+{
+    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    npy_int compute_cos = 1;
+    npy_float large_number = 71476.0625f;
+    if (*operation == 's') {
+        compute_cos = 0;
+        large_number = 117435.992f;
+    }
+
+    /* Load up frequently used constants */
+    @vtype@i zeros = _mm@vsize@_set1_epi32(0);
+    @vtype@i ones = _mm@vsize@_set1_epi32(1);
+    @vtype@i twos = _mm@vsize@_set1_epi32(2);
+    @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
+    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
+    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
+    @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
+    @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
+    @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
+    @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
+    @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
+    @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
+    @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
+    @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
+    @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
+    @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
+    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
+    @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
+    @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
+    @vtype@i iquadrant;
+    @mask@ glibc_mask, sine_mask, negate_mask;
+    @mask@ load_mask = @isa@_get_full_load_mask();
+    npy_int num_remaining_elements = array_size;
+
+    while (num_remaining_elements > 0) {
+
+        if (num_remaining_elements < num_lanes)
+            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+                                                         num_lanes);
+        @vtype@ x  = @isa@_masked_load(load_mask, ip);
+
+        /*
+         * For elements outside of this range, Cody-Waite's range reduction
+         * becomes inaccurate and we will call glibc to compute cosine for
+         * these numbers
+         */
+
+        glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
+        glibc_mask = @and_masks@(load_mask, glibc_mask);
+        x = @isa@_set_masked_lanes(x, zero_f, glibc_mask);
+        npy_int iglibc_mask = @mask_to_int@(glibc_mask);
+
+        if (iglibc_mask != @full_mask@) {
+            quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
+
+            /* round to nearest */
+            quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
+            quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
+
+            /* Cody-Waite's range reduction algorithm */
+            reduced_x = @isa@_range_reduction(x, quadrant,
+                                                   codyw_c1, codyw_c2, codyw_c3);
+            reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
+
+            /* compute cosine and sine */
+            cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
+                                                           cos_invf2, cos_invf0);
+            sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
+                                             sin_invf5, sin_invf3, zero_f);
+
+            iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
+            if (compute_cos)
+                iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
+
+            /* blend sin and cos based on the quadrant */
+            sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
+            cos = @isa@_blend(cos, sin, sine_mask);
+
+            /* multiply by -1 for appropriate elements */
+            negate_mask = @isa@_should_negate(iquadrant, twos, twos);
+            cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
+
+            @masked_store@(op, @cvtps_epi32@(load_mask), cos);
+        }
+
+        /* process elements using glibc for large elements */
+        if (compute_cos) {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01)
+                    op[ii] = npy_cosf(ip[ii]);
+                iglibc_mask  = iglibc_mask >> 1;
+            }
+        }
+        else {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01)
+                    op[ii] = npy_sinf(ip[ii]);
+                iglibc_mask  = iglibc_mask >> 1;
+            }
+        }
+        ip += num_lanes;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
+
 /*
  * Vectorized implementation of exp using AVX2 and AVX512:
  * 1) if x >= xmax; return INF (overflow)
@@ -1381,7 +1596,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * same x = 0xc2781e37)
  */
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 @ISA@_exp_FLOAT(npy_float * op,
                 npy_float * ip,

From e50e72513212764f673027328b9f33574cc3d254 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 1 May 2019 08:43:21 -0700
Subject: [PATCH 02/11] BUG: fixing NAN handling and adding tests for sin/cos

---
 numpy/core/src/umath/simd.inc.src | 10 ++++++----
 numpy/core/tests/test_umath.py    | 17 +++++++++++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 58ec48447769..7bd5aa2b52e3 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -1473,7 +1473,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size,
+@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size,
                                                    char* operation)
 {
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
@@ -1505,9 +1505,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
     @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
     @vtype@i iquadrant;
-    @mask@ glibc_mask, sine_mask, negate_mask;
+    @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
     @mask@ load_mask = @isa@_get_full_load_mask();
-    npy_int num_remaining_elements = array_size;
+    npy_intp num_remaining_elements = array_size;
 
     while (num_remaining_elements > 0) {
 
@@ -1524,7 +1524,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 
         glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
         glibc_mask = @and_masks@(load_mask, glibc_mask);
-        x = @isa@_set_masked_lanes(x, zero_f, glibc_mask);
+        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
+        x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
         npy_int iglibc_mask = @mask_to_int@(glibc_mask);
 
         if (iglibc_mask != @full_mask@) {
@@ -1556,6 +1557,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
             /* multiply by -1 for appropriate elements */
             negate_mask = @isa@_should_negate(iquadrant, twos, twos);
             cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
+            cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
 
             @masked_store@(op, @cvtps_epi32@(load_mask), cos);
         }
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 91d403d9868a..0948cbfe6a10 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -678,6 +678,23 @@ def test_log_values(self):
             assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
             assert_raises(FloatingPointError, np.log, np.float32(-1.0))
 
+    def test_sincos_values(self):
+        with np.errstate(all='ignore'):
+            x = [np.nan,  np.nan, np.nan, np.nan]
+            y = [np.nan, -np.nan, np.inf, -np.inf]
+            for dt in ['f', 'd', 'g']:
+                xf = np.array(x, dtype=dt)
+                yf = np.array(y, dtype=dt)
+                assert_equal(np.sin(yf), xf)
+                assert_equal(np.cos(yf), xf)
+
+        with np.errstate(invalid='raise'):
+            assert_raises(FloatingPointError, np.sin, np.float32(-np.inf))
+            assert_raises(FloatingPointError, np.sin, np.float32(np.inf))
+            assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
+            assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
+
+
 class TestExpLogFloat32(object):
     def test_exp_float32(self):
         np.random.seed(42)

From d56fbfda94a689e855ce8679c5828f7a729d777a Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Tue, 7 May 2019 13:49:52 -0700
Subject: [PATCH 03/11] BUG: sin and cos cast float16 to float32

---
 numpy/core/code_generators/generate_umath.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 896e03278c12..bf31c0a8851d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -662,6 +662,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cos'),
           None,
+          TD('e', f='cos', astype={'e':'f'}),
           TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='cos', astype={'e':'f'}),
           TD(P, f='cos'),
@@ -670,6 +671,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sin'),
           None,
+          TD('e', f='sin', astype={'e':'f'}),
           TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='sin', astype={'e':'f'}),
           TD(P, f='sin'),

From bca96282530fc15dc86c78b640a7b87581885513 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Mon, 20 May 2019 10:13:08 -0700
Subject: [PATCH 04/11] TEST: adding tests to validate AVX sin/cos
 implementation

---
 numpy/core/src/umath/simd.inc.src |  2 +-
 numpy/core/tests/test_umath.py    | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 7bd5aa2b52e3..07a3c19a4f06 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -164,7 +164,7 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
-@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_int n, char*);
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, char*);
 #endif
 
 /**end repeat**/
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 0948cbfe6a10..747d893c2dd3 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -695,7 +695,7 @@ def test_sincos_values(self):
             assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
 
 
-class TestExpLogFloat32(object):
+class TestSIMDFloat32(object):
     def test_exp_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
@@ -708,7 +708,14 @@ def test_log_float32(self):
         x_f64 = np.float64(x_f32)
         assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
 
-    def test_strided_exp_log_float32(self):
+    def test_sincos_float32(self):
+        np.random.seed(42)
+        x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=1000000))
+        x_f64 = np.float64(x_f32)
+        assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=1.5)
+        assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=1.5)
+
+    def test_strided_float32(self):
         np.random.seed(42)
         strides = np.random.randint(low=-100, high=100, size=100)
         sizes = np.random.randint(low=1, high=2000, size=100)
@@ -716,9 +723,13 @@ def test_strided_exp_log_float32(self):
             x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
             exp_true = np.exp(x_f32)
             log_true = np.log(x_f32)
+            sin_true = np.sin(x_f32)
+            cos_true = np.cos(x_f32)
             for jj in strides:
                 assert_array_almost_equal_nulp(np.exp(x_f32[::jj]), exp_true[::jj], nulp=2)
                 assert_array_almost_equal_nulp(np.log(x_f32[::jj]), log_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.sin(x_f32[::jj]), sin_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.cos(x_f32[::jj]), cos_true[::jj], nulp=2)
 
 class TestLogAddExp(_FilterInvalids):
     def test_logaddexp_values(self):

From 9cae09cfa46dcb8d4eed07f7df841a36da942b07 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 22 May 2019 11:17:44 -0700
Subject: [PATCH 05/11] BUG: use strides and process strided arrays using AVX

---
 numpy/core/src/umath/loops.c.src  | 21 +++++--------
 numpy/core/src/umath/simd.inc.src | 49 ++++++++++++++++++++++++++-----
 2 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index bc7e075cb524..f29b15477397 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1651,22 +1651,17 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
     char str[] = "@func@";
-    @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], str);
+    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, str)) {
+        UNARY_LOOP {
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], str);
 #else
-    /*
-     * This is the path it would take if ISA was runtime detected, but not
-     * compiled for. It fixes the error on clang6.0 which fails to compile
-     * AVX512F version. Not sure if I like this idea, if during runtime it
-     * detects AXV512F, it will end up running the scalar version instead
-     * of AVX2.
-     */
-    UNARY_LOOP {
-	const npy_float in1 = *(npy_float *)ip1;
-	*(npy_float *)op1 = @scalarf@(in1);
-    }
+	        const npy_float in1 = *(npy_float *)ip1;
+	        *(npy_float *)op1 = @scalarf@(in1);
 #endif
+        }
+    }
 }
 
 /**end repeat1**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 07a3c19a4f06..6da75d724d0d 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -164,9 +164,23 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
-@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, char*);
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
 #endif
 
+static NPY_INLINE int
+run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
 /**end repeat**/
 
 
@@ -1473,9 +1487,13 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size,
-                                                   char* operation)
+@ISA@_sincos_FLOAT(npy_float * op,
+                   npy_float * ip,
+                   const npy_intp array_size,
+                   const npy_intp steps,
+                   char* operation)
 {
+    const npy_intp stride = steps/sizeof(npy_float);
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
     npy_int compute_cos = 1;
     npy_float large_number = 71476.0625f;
@@ -1508,13 +1526,26 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
     @mask@ load_mask = @isa@_get_full_load_mask();
     npy_intp num_remaining_elements = array_size;
+    npy_int indexarr[16];
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
 
     while (num_remaining_elements > 0) {
 
-        if (num_remaining_elements < num_lanes)
+        if (num_remaining_elements < num_lanes) {
             load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
                                                          num_lanes);
-        @vtype@ x  = @isa@_masked_load(load_mask, ip);
+        }
+
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load(load_mask, ip);
+        }
+        else {
+            x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+        }
 
         /*
          * For elements outside of this range, Cody-Waite's range reduction
@@ -1565,19 +1596,21 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         /* process elements using glibc for large elements */
         if (compute_cos) {
             for (int ii = 0; iglibc_mask != 0; ii++) {
-                if (iglibc_mask & 0x01)
+                if (iglibc_mask & 0x01) {
                     op[ii] = npy_cosf(ip[ii]);
+                }
                 iglibc_mask  = iglibc_mask >> 1;
             }
         }
         else {
             for (int ii = 0; iglibc_mask != 0; ii++) {
-                if (iglibc_mask & 0x01)
+                if (iglibc_mask & 0x01) {
                     op[ii] = npy_sinf(ip[ii]);
+                }
                 iglibc_mask  = iglibc_mask >> 1;
             }
         }
-        ip += num_lanes;
+        ip += num_lanes*stride;
         op += num_lanes;
         num_remaining_elements -= num_lanes;
     }

From cd9f1a87d6f200b3ccb4be7dc646b4832b321ba9 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 22 May 2019 16:27:47 -0700
Subject: [PATCH 06/11] BUG: AVX2 impl of sin/cos requires an FMA

Without an FMA, the output of AVX2 and AVX512 version differ. This
changes ensures the output across implementations remains exactly the
same.
---
 numpy/core/code_generators/generate_umath.py |  8 +-
 numpy/core/include/numpy/npy_common.h        |  8 +-
 numpy/core/setup_common.py                   |  5 +-
 numpy/core/src/umath/cpuid.c                 | 22 ++++++
 numpy/core/src/umath/loops.c.src             |  4 +-
 numpy/core/src/umath/loops.h.src             |  2 +-
 numpy/core/src/umath/simd.inc.src            | 79 ++++++++++----------
 7 files changed, 76 insertions(+), 52 deletions(-)

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf31c0a8851d..ae871ea6fb8a 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -663,7 +663,7 @@ def english_upper(s):
           docstrings.get('numpy.core.umath.cos'),
           None,
           TD('e', f='cos', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='cos', astype={'e':'f'}),
           TD(P, f='cos'),
           ),
@@ -672,7 +672,7 @@ def english_upper(s):
           docstrings.get('numpy.core.umath.sin'),
           None,
           TD('e', f='sin', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='sin', astype={'e':'f'}),
           TD(P, f='sin'),
           ),
@@ -709,7 +709,7 @@ def english_upper(s):
           docstrings.get('numpy.core.umath.exp'),
           None,
           TD('e', f='exp', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='exp', astype={'e':'f'}),
           TD(P, f='exp'),
           ),
@@ -732,7 +732,7 @@ def english_upper(s):
           docstrings.get('numpy.core.umath.log'),
           None,
           TD('e', f='log', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='log', astype={'e':'f'}),
           TD(P, f='log'),
           ),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 108c0a2027c8..27b83f7b5749 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -44,10 +44,14 @@
 #else
 #define NPY_GCC_TARGET_AVX
 #endif
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+#define HAVE_ATTRIBUTE_TARGET_FMA
+#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
+#endif
+
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
 #define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
 #else
 #define NPY_GCC_TARGET_AVX2
 #endif
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 6e3109ab5786..a3f7acd6df68 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -178,9 +178,10 @@ def check_api_version(apiversion, codegen_dir):
 # gcc 4.8.4 support attributes but not with intrisics
 # tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
 # function name will be converted to HAVE_<upper-case-name> preprocessor macro
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
                                 'attribute_target_avx2_with_intrinsics',
-                                '__m256 temp = _mm256_set1_ps(1.0)',
+                                '__m256 temp = _mm256_set1_ps(1.0); temp = \
+                                _mm256_fmadd_ps(temp, temp, temp)',
                                 'immintrin.h'),
                                 ('__attribute__((target("avx512f")))',
                                 'attribute_target_avx512f_with_intrinsics',
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 8673f1736e01..9d4ba13d8914 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -48,6 +48,25 @@ int os_avx512_support(void)
 #endif
 }
 
+static NPY_INLINE
+int cpu_supports_fma(void)
+{
+    unsigned int feature = 0x01;
+    unsigned int a, b, c, d;
+#ifdef __x86_64__
+    __asm__ volatile (
+        "cpuid"				"\n\t"
+	: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
+	: "a" (feature));
+    /*
+     * FMA is the 12th bit of ECX
+     */
+    return (c >> 12) & 1;
+#else
+    return 0;
+#endif
+}
+
 /*
  * Primitive cpu feature detect function
  * Currently only supports checking for avx on gcc compatible compilers.
@@ -63,6 +82,9 @@ npy_cpu_supports(const char * feature)
         return 0;
 #endif
     }
+    else if (strcmp(feature, "fma") == 0) {
+        return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
+    }
     else if (strcmp(feature, "avx2") == 0) {
         return __builtin_cpu_supports("avx2") && os_avx_support();
     }
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index f29b15477397..80bd8787571e 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1610,8 +1610,8 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
 /**end repeat**/
 
 /**begin repeat
- * #isa = avx512f, avx2#
- * #ISA = AVX512F, AVX2#
+ * #isa = avx512f, fma#
+ * #ISA = AVX512F, FMA#
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
  */
 
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index accd0eab6e74..5070ab38b918 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -184,7 +184,7 @@ NPY_NO_EXPORT void
 FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
 /**begin repeat1
- * #isa = avx512f, avx2#
+ * #isa = avx512f, fma#
  */
 
 NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 6da75d724d0d..43a0ddaade7a 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,8 +130,9 @@ abs_ptrdiff(char *a, char *b)
  */
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512f#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512f#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  * #REGISTER_SIZE = 32, 64#
  */
 
@@ -141,7 +142,7 @@ abs_ptrdiff(char *a, char *b)
  * #func = exp, log#
  */
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
 @ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
 #endif
@@ -149,7 +150,7 @@ static NPY_INLINE void
 static NPY_INLINE int
 run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 {
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
     if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
         @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
         return 1;
@@ -162,7 +163,7 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 /**end repeat1**/
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
 @ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
 #endif
@@ -170,7 +171,7 @@ static NPY_INLINE void
 static NPY_INLINE int
 run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
 {
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
     if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
         @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
         return 1;
@@ -1142,20 +1143,14 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_fmadd(__m256 a, __m256 b, __m256 c)
-{
-    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_full_load_mask(void)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_full_load_mask(void)
 {
     return _mm256_set1_ps(-1.0);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                             1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
@@ -1163,8 +1158,8 @@ avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
     return _mm256_loadu_ps(addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_gather(__m256 src,
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_gather(__m256 src,
                    npy_float* addr,
                    __m256i vindex,
                    __m256 mask)
@@ -1172,39 +1167,39 @@ avx2_masked_gather(__m256 src,
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_load(__m256 mask, npy_float* addr)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_load(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_blend(__m256 x, __m256 y, __m256 ymask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
     return _mm256_blendv_ps(x, y, ymask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
 {
    return _mm256_cvtepi32_ps(
                 _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_should_negate(__m256i k, __m256i andop, __m256i cmp)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
 {
-    return avx2_should_calculate_sine(k, andop, cmp);
+    return fma_should_calculate_sine(k, andop, cmp);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_exponent(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_exponent(__m256 x)
 {
     /*
      * Special handling of denormals:
@@ -1230,8 +1225,8 @@ avx2_get_exponent(__m256 x)
     return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_mantissa(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_mantissa(__m256 x)
 {
     /*
      * Special handling of denormals:
@@ -1369,17 +1364,18 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
 #endif
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
  * #vtype = __m256, __m512#
  * #vsize = 256, 512#
  * #or = or_ps, kor#
  * #vsub = , _mask#
  * #mask = __m256, __mmask16#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  **/
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+#if defined @CHK@
 
 /*
  * Vectorized Cody-Waite range reduction technique
@@ -1446,8 +1442,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 /**end repeat**/
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
  * #vtype = __m256, __m512#
  * #vsize = 256, 512#
  * #BYTES = 32, 64#
@@ -1456,11 +1452,12 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * #or_masks =_mm256_or_ps, _mm512_kor#
  * #and_masks =_mm256_and_ps, _mm512_kand#
  * #xor_masks =_mm256_xor_ps, _mm512_kxor#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
  * #mask_to_int = _mm256_movemask_ps, #
  * #full_mask= 0xFF, 0xFFFF#
  * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
  * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  */
 
 
@@ -1485,7 +1482,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * performance.
  */
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+#if defined @CHK@
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 @ISA@_sincos_FLOAT(npy_float * op,
                    npy_float * ip,

From 8589d4839dab4158abe1b241390c3d3838445721 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 23 May 2019 09:38:59 -0700
Subject: [PATCH 07/11] TEST: Rounding max tolerable ulp error to an int

The assert_array_max_ulp returns only an int since it compares ULP
difference between two float32 numbers.
---
 numpy/core/tests/test_umath.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 747d893c2dd3..95078b52b35d 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -700,20 +700,20 @@ def test_exp_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
         x_f64 = np.float64(x_f32)
-        assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)
+        assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=3)
 
     def test_log_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
         x_f64 = np.float64(x_f32)
-        assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
+        assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=4)
 
     def test_sincos_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=1000000))
         x_f64 = np.float64(x_f32)
-        assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=1.5)
-        assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=1.5)
+        assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
+        assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)
 
     def test_strided_float32(self):
         np.random.seed(42)

From 0f78154c046a8f25fed6266fe4dbf4c903b6c4ce Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 23 May 2019 11:31:51 -0700
Subject: [PATCH 08/11] BUG: eliminate unsed variables warning in cpuid

---
 numpy/core/src/umath/cpuid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 9d4ba13d8914..72c6493e8e13 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -51,9 +51,9 @@ int os_avx512_support(void)
 static NPY_INLINE
 int cpu_supports_fma(void)
 {
+#ifdef __x86_64__
     unsigned int feature = 0x01;
     unsigned int a, b, c, d;
-#ifdef __x86_64__
     __asm__ volatile (
         "cpuid"				"\n\t"
 	: "=a" (a), "=b" (b), "=c" (c), "=d" (d)

From 44d14ab5bdee2d9a1ec7e3b9c5d41593ac855c54 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 29 May 2019 11:52:21 -0700
Subject: [PATCH 09/11] MAINT: using an enum to switch between sin/cos

---
 numpy/core/include/numpy/npy_math.h |  9 +++++++++
 numpy/core/src/umath/loops.c.src    |  6 +++---
 numpy/core/src/umath/simd.inc.src   | 17 ++++++++---------
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 7831dd3d7190..126b861bf30f 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -177,6 +177,15 @@ NPY_INPLACE npy_long npy_lcml(npy_long a, npy_long b);
 NPY_INPLACE npy_longlong npy_gcdll(npy_longlong a, npy_longlong b);
 NPY_INPLACE npy_longlong npy_lcmll(npy_longlong a, npy_longlong b);
 
+/*
+ * avx function has a common API for both sin & cos. This enum is used to
+ * distinguish between the two
+ */
+typedef enum {
+    npy_compute_sin,
+    npy_compute_cos
+} NPY_TRIG_OP;
+
 /*
  * C99 double math funcs
  */
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 80bd8787571e..2028a5712c44 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1645,17 +1645,17 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 
 /**begin repeat1
  *  #func = cos, sin#
+ *  #enum = npy_compute_cos, npy_compute_sin#
  *  #scalarf = npy_cosf, npy_sinf#
  */
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
-    char str[] = "@func@";
-    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, str)) {
+    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) {
         UNARY_LOOP {
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], str);
+            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@);
 #else
 	        const npy_float in1 = *(npy_float *)ip1;
 	        *(npy_float *)op1 = @scalarf@(in1);
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 43a0ddaade7a..4073ad47a305 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -165,15 +165,15 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
-@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
 #endif
 
 static NPY_INLINE int
-run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
+run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, NPY_TRIG_OP my_trig_op)
 {
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
     if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
-        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
+        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
         return 1;
     }
     else
@@ -1488,14 +1488,12 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                    npy_float * ip,
                    const npy_intp array_size,
                    const npy_intp steps,
-                   char* operation)
+                   NPY_TRIG_OP my_trig_op)
 {
     const npy_intp stride = steps/sizeof(npy_float);
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
-    npy_int compute_cos = 1;
     npy_float large_number = 71476.0625f;
-    if (*operation == 's') {
-        compute_cos = 0;
+    if (my_trig_op == npy_compute_sin) {
         large_number = 117435.992f;
     }
 
@@ -1575,8 +1573,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                                              sin_invf5, sin_invf3, zero_f);
 
             iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
-            if (compute_cos)
+            if (my_trig_op == npy_compute_cos) {
                 iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
+            }
 
             /* blend sin and cos based on the quadrant */
             sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
@@ -1591,7 +1590,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         }
 
         /* process elements using glibc for large elements */
-        if (compute_cos) {
+        if (my_trig_op == npy_compute_cos) {
             for (int ii = 0; iglibc_mask != 0; ii++) {
                 if (iglibc_mask & 0x01) {
                     op[ii] = npy_cosf(ip[ii]);

From 6f34f5faa0c4e50e822c6aee01e074785fc86f33 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 29 May 2019 20:55:35 -0700
Subject: [PATCH 10/11] TEST: improving test coverage for sin/cos for input >
 117435.992f

---
 numpy/core/tests/test_umath.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 95078b52b35d..ef48fed05593 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -710,7 +710,12 @@ def test_log_float32(self):
 
     def test_sincos_float32(self):
         np.random.seed(42)
-        x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=1000000))
+        N = 1000000
+        M = np.int(N/20)
+        index = np.random.randint(low=0, high=N, size=M)
+        x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=N))
+        # test coverage for elements > 117435.992f for which glibc is used
+        x_f32[index] = np.float32(10E+10*np.random.rand(M))
         x_f64 = np.float64(x_f32)
         assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
         assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)

From 457b3e6979d84677b860cf1a7c22d1190ba9d895 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Sat, 3 Aug 2019 10:58:19 -0700
Subject: [PATCH 11/11] BUG: rename avx2_scalef_ps to fma_scalef_ps

---
 numpy/core/src/umath/simd.inc.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 4073ad47a305..7aec1ff495b3 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -1252,7 +1252,7 @@ fma_get_mantissa(__m256 x)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_scalef_ps(__m256 poly, __m256 quadrant)
+fma_scalef_ps(__m256 poly, __m256 quadrant)
 {
     /*
      * Handle denormals (which occur when quadrant <= -125):