8000 ENH: Add Neon SIMD implementations for add, sub, mul, and div by DumbMice · Pull Request #16969 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Add Neon SIMD implementations for add, sub, mul, and div #16969

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 31, 2020
114 changes: 113 additions & 1 deletion numpy/core/src/umath/simd.inc.src
56B4
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#undef __AVX512F__
#endif
#endif
#include "simd/simd.h"
#include <assert.h>
#include <stdlib.h>
#include <float.h>
Expand Down Expand Up @@ -505,6 +506,7 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #vector = 1, 1, 0#
* #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
*/

/**begin repeat1
Expand Down Expand Up @@ -553,6 +555,18 @@ static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);

#elif @VECTOR@

static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);

#endif

static NPY_INLINE int
Expand Down Expand Up @@ -584,6 +598,25 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
#elif @VECTOR@
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
@type@ * op = (@type@ *)args[2];
npy_intp n = dimensions[0];
/* argument one scalar */
if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
/* argument two scalar */
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
#endif
return 0;
}
Expand Down Expand Up @@ -3694,7 +3727,86 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/**end repeat**/

#undef VECTOR_SIZE_BYTES
#else /* NPY_HAVE_SSE2_INTRINSICS */

#endif /* NPY_HAVE_SSE2_INTRINSICS */
/**begin repeat
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
* #sfx = f32, f64#
* #CHK = , _F64#
*/

#if NPY_SIMD@CHK@

/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/

/*
 * Element-wise binary @kind@ (op[i] = ip1[i] @OP@ ip2[i]) over n contiguous
 * elements, vectorized with NumPy's universal SIMD intrinsics (npyv_*).
 * Structure: scalar prologue until `op` is aligned, SIMD main loop, scalar tail.
 * NOTE(review): `i` is declared by the LOOP_* macros; all three loops share it.
 */
static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    /* scalar prologue — presumably peels elements until op is
     * NPY_SIMD_WIDTH-aligned (per LOOP_BLOCK_ALIGN_VAR) — confirm macro */
LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
op[i] = ip1[i] @OP@ ip2[i];
}
/* lots of specializations, to squeeze out max performance */
    /* both inputs alias: load once and apply the op to the vector with itself */
if (ip1 == ip2) {
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
npyv_store_@sfx@(&op[i], c);
}
}
else {
    /* general case: two unaligned loads, one aligned store per vector */
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
npyv_store_@sfx@(&op[i], c);
}
}
    /* scalar tail: elements left over after the last full vector block */
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[i];
}
}

/*
 * Binary @kind@ with the FIRST operand a scalar: op[i] = ip1[0] @OP@ ip2[i].
 * The scalar is broadcast into a vector register once; loop layout mirrors
 * simd_binary_@kind@_@TYPE@ (align prologue, SIMD body, scalar tail).
 */
static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    /* broadcast ip1[0] to all lanes, hoisted out of the loop */
const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
    /* scalar prologue — presumably runs until op is vector-aligned; see
     * LOOP_BLOCK_ALIGN_VAR */
LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
op[i] = ip1[0] @OP@ ip2[i];
}
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
npyv_store_@sfx@(&op[i], v3);
}
    /* scalar tail for the remainder */
LOOP_BLOCKED_END {
op[i] = ip1[0] @OP@ ip2[i];
}
}

/*
 * Binary @kind@ with the SECOND operand a scalar: op[i] = ip1[i] @OP@ ip2[0].
 * Mirror image of simd_binary_scalar1_@kind@_@TYPE@; operand order is kept
 * (v1 @OP@ v2) so non-commutative ops (subtract, divide) stay correct.
 */
static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    /* broadcast ip2[0] to all lanes, hoisted out of the loop */
const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
    /* scalar prologue — presumably runs until op is vector-aligned; see
     * LOOP_BLOCK_ALIGN_VAR */
LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
op[i] = ip1[i] @OP@ ip2[0];
}
LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
npyv_store_@sfx@(&op[i], v3);
}
    /* scalar tail for the remainder */
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[0];
}
}
/**end repeat1**/
#endif /* NPY_SIMD@CHK@ */
/**end repeat**/
#endif
#endif
0