ENH: Add Neon SIMD implementations for add, sub, mul, and div by DumbMice · Pull Request #16969 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Add Neon SIMD implementations for add, sub, mul, and div #16969

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 31, 2020
Previous commit
Next commit
update
  • Loading branch information
DumbMice committed Jul 29, 2020
commit 8d1d95c7402ed3020eb0cba54c142fb1313edf0d
63 changes: 1 addition & 62 deletions numpy/core/src/umath/simd.inc.src
Original file line number Diff line number Diff line change
Expand Up @@ -3843,69 +3843,8 @@ neon_binary_scalar@scalar_loc@_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ *
}
}
/**end repeat2**/
/*
static void
neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __aarch64__
const @Nvtype128@ a = @Nvpre@ld1q_dup_@Nvsuf@(ip1);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
op[i] = ip1[0] @OP@ ip2[i];
LOOP_BLOCKED(@type@, vector_size_bytes) {
@Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]);
@Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b);
@Nvpre@st1q_@Nvsuf@(&op[i], c);
}

#elif __aarch32__
const @Nvtype64@ a = @Nvpre@ld1_dup_@Nvsuf@(ip1);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
op[i] = ip1[0] @OP@ ip2[i];
LOOP_BLOCKED(@type@, vector_size_bytes) {
@Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]);
@Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b);
@Nvpre@st1_@Nvsuf@(&op[i], c);
}

#endif
LOOP_BLOCKED_END {
op[i] = ip1[0] @OP@ ip2[i];
}
}


static void
neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __aarch64__
const @Nvtype128@ b = @Nvpre@ld1q_dup_@Nvsuf@(ip2);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
op[i] = ip1[i] @OP@ ip2[0];
LOOP_BLOCKED(@type@, vector_size_bytes) {
@Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]);
@Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b);
@Nvpre@st1q_@Nvsuf@(&op[i], c);
}

#elif __aarch32__
const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
op[i] = ip1[i] @OP@ ip2[0];
LOOP_BLOCKED(@type@, vector_size_bytes) {
@Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]);
@Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b);
@Nvpre@st1_@Nvsuf@(&op[i], c);
}

#endif
LOOP_BLOCKED_END {
op[i] = ip1[i] @OP@ ip2[0];
}
}
*/

/**end repeat1**/
/**end repeat**/

#endif /* NPY_HAVENEON */
#endif /* NPY_HAVE_NEON */
#endif
0