ENH: Use AVX for float32 implementation of np.sin & np.cos by r-devulap · Pull Request #13368 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Use AVX for float32 implementation of np.sin & np.cos #13368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Aug 18, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
BUG: use strides and process strided arrays using AVX
  • Loading branch information
r-devulap committed Aug 3, 2019
commit 9cae09cfa46dcb8d4eed07f7df841a36da942b07
21 changes: 8 additions & 13 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -1651,22 +1651,17 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
char str[] = "@func@";
@ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], str);
if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, str)) {
UNARY_LOOP {
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], str);
#else
/*
* This is the path it would take if ISA was runtime detected, but not
* compiled for. It fixes the error on clang6.0 which fails to compile
* AVX512F version. Not sure if I like this idea, if during runtime it
* detects AVX512F, it will end up running the scalar version instead
* of AVX2.
*/
UNARY_LOOP {
const npy_float in1 = *(npy_float *)ip1;
*(npy_float *)op1 = @scalarf@(in1);
}
const npy_float in1 = *(npy_float *)ip1;
*(npy_float *)op1 = @scalarf@(in1);
#endif
}
}
}

/**end repeat1**/
Expand Down
49 changes: 41 additions & 8 deletions numpy/core/src/umath/simd.inc.src
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,23 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, char*);
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
#endif

/*
 * Try to run the float32 sin/cos loop through the @ISA@ kernel.
 *
 * args, dimensions, steps: the ufunc inner-loop arguments; args[0] is passed
 *     to the kernel as the input pointer, args[1] as the output pointer,
 *     dimensions[0] as the element count and steps[0] as the input step.
 * mychar: forwarded unchanged to the kernel as its `operation` string.
 *
 * Returns 1 when the kernel was invoked, 0 otherwise. 0 is returned when the
 * @ISA@ intrinsics are not compiled in, or when
 * IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@) does not hold.
 * NOTE(review): IS_OUTPUT_BLOCKABLE_UNARY is defined outside this view and
 * presumably inspects `args`/`steps` implicitly -- confirm before renaming
 * those parameters.
 */
static NPY_INLINE int
run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
{
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
        npy_float *dst = (npy_float*)args[1];
        npy_float *src = (npy_float*)args[0];
        const npy_intp count = dimensions[0];
        const npy_intp src_step = steps[0];

        @ISA@_sincos_FLOAT(dst, src, count, src_step, mychar);
        return 1;
    }
#endif
    /* Either the kernel is not compiled in or the layout check failed. */
    return 0;
}

/**end repeat**/


Expand Down Expand Up @@ -1473,9 +1487,13 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size,
char* operation)
@ISA@_sincos_FLOAT(npy_float * op,
npy_float * ip,
const npy_intp array_size,
const npy_intp steps,
char* operation)
{
const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
npy_int compute_cos = 1;
npy_float large_number = 71476.0625f;
Expand Down Expand Up @@ -1508,13 +1526,26 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
@mask@ load_mask = @isa@_get_full_load_mask();
npy_intp num_remaining_elements = array_size;
npy_int indexarr[16];
for (npy_int ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;
}
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);

while (num_remaining_elements > 0) {

if (num_remaining_elements < num_lanes)
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
num_lanes);
@vtype@ x = @isa@_masked_load(load_mask, ip);
}

@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load(load_mask, ip);
}
else {
x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
}

/*
* For elements outside of this range, Cody-Waite's range reduction
Expand Down Expand Up @@ -1565,19 +1596,21 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
/* process elements using glibc for large elements */
if (compute_cos) {
for (int ii = 0; iglibc_mask != 0; ii++) {
if (iglibc_mask & 0x01)
if (iglibc_mask & 0x01) {
op[ii] = npy_cosf(ip[ii]);
}
iglibc_mask = iglibc_mask >> 1;
}
}
else {
for (int ii = 0; iglibc_mask != 0; ii++) {
if (iglibc_mask & 0x01)
if (iglibc_mask & 0x01) {
op[ii] = npy_sinf(ip[ii]);
}
iglibc_mask = iglibc_mask >> 1;
}
}
ip += num_lanes;
ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}
Expand Down
0