From ae244d74e9e9380a95a701ba06eb407c79b3ebf0 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Tue, 28 Jul 2020 18:03:08 +0800 Subject: [PATCH 01/11] ENH: Add Neon implmentation for add, sub, mul, div --- numpy/core/src/umath/simd.inc.src | 178 ++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 7866f8143c6a..4a02ed047bc6 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -28,6 +28,9 @@ #undef __AVX512F__ #endif #endif +#ifdef NPY_HAVE_NEON +#include +#endif #include #include #include @@ -553,6 +556,18 @@ static void sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); +#elif @vector@ && defined NPY_HAVE_NEON + +static void +neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); + #endif static NPY_INLINE int @@ -584,6 +599,30 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } +#elif @vector@ && defined NPY_HAVE_NEON + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; +#if defined __aarch64__ + const npy_uintp vector_size_bytes = 16; +#else + const npy_uintp vector_size_bytes = 8; +#endif + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { + neon_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { + neon_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { + neon_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } #endif return 0; } @@ -3697,4 +3736,143 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #endif /* NPY_HAVE_SSE2_INTRINSICS */ +#ifdef NPY_HAVE_NEON +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #scalarf = npy_sqrtf, npy_sqrt# + * #Nvtype128 = float32x4_t, float64x2_t# + * #Nvtype64 = float32x2_t, float64x1_t# + * #Nvpre = v, v# + * #Nvsuf = f32, f64# + * #double = 0, 1# + */ + + +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ + +static void +neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const npy_intp vector_size_bytes = 16; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of specializations, to squeeze out max performance */ + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, a); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); + @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + } + +#elif defined __aarch32__ + const npy_intp vector_size_bytes = 8; + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[i]; + /* lots of 
specializations, to squeeze out max performance */ + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, a); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); + @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); + @Nvpre@st1_@Nvsuf@(&op[i], c); + } + } + +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + + +static void +neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const npy_intp vector_size_bytes = 16; + const @Nvtype128@ a = @Nvpre@ld1q_dup_@Nvsuf@(ip1); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + +#elif defined __aarch32__ + const npy_intp vector_size_bytes = 8; + const @Nvtype64@ a = @Nvpre@ld1_dup_@Nvsuf@(ip1); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[0] @OP@ ip2[i]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); + @Nvpre@st1_@Nvsuf@(&op[i], c); + } + +#endif + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + + +static void +neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const npy_intp vector_size_bytes = 16; + const @Nvtype128@ b = @Nvpre@ld1q_dup_@Nvsuf@(ip2); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); + @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); + @Nvpre@st1q_@Nvsuf@(&op[i], c); + } + +#elif defined __aarch32__ + const npy_intp vector_size_bytes = 16; + const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[i] @OP@ ip2[0]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); + @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); + @Nvpre@st1_@Nvsuf@(&op[i], c); + } + +#endif + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} + +/**end repeat1**/ +/**end repeat**/ + +#endif /* NPY_HAVENEON */ #endif From ad20babe8d46b01033e2fafd23e0a67d10bb5b05 Mon Sep 17 00:00:00 2001 From: DumbMice Date: Tue, 28 Jul 2020 22:22:53 +0800 Subject: [PATCH 02/11] Update simd.inc.src --- numpy/core/src/umath/simd.inc.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 4a02ed047bc6..f5a841944883 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3855,7 +3855,7 @@ neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i } #elif defined __aarch32__ - const npy_intp vector_size_bytes = 16; + const npy_intp vector_size_bytes = 8; const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[0]; From 6c32cd020a21285ab3699f0f12b0b31b3785f9e7 Mon Sep 17 00:00:00 2001 From: DumbMice Date: Wed, 29 Jul 2020 10:15:31 +0800 Subject: [PATCH 03/11] delete the unused --- numpy/core/src/umath/simd.inc.src | 1 - 1 file changed, 1 deletion(-) diff --git 
a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index f5a841944883..8fb88ba6e37d 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3745,7 +3745,6 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) * #Nvtype64 = float32x2_t, float64x1_t# * #Nvpre = v, v# * #Nvsuf = f32, f64# - * #double = 0, 1# */ From 380029d3f06146d6309123dc66d64d2561eb1ee6 Mon Sep 17 00:00:00 2001 From: DumbMice Date: Wed, 29 Jul 2020 10:33:05 +0800 Subject: [PATCH 04/11] Update simd.inc.src --- numpy/core/src/umath/simd.inc.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 8fb88ba6e37d..04dccc347899 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3788,7 +3788,7 @@ neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) LOOP_BLOCKED(@type@, vector_size_bytes) { @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, a); - @Nvpre@st1q_@Nvsuf@(&op[i], c); + @Nvpre@st1_@Nvsuf@(&op[i], c); } } else { From df7b19929801a49724d3b3c8a7c1a6245e9d39bd Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 14:56:40 +0800 Subject: [PATCH 05/11] update; extract from scalar1&2 --- numpy/core/src/._.DS_Store | Bin 0 -> 4096 bytes .../src/umath/_umath_tests.dispatch.asimdhp.c | 12 ++++ numpy/core/src/umath/simd.inc.src | 62 ++++++++++++++---- 3 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 numpy/core/src/._.DS_Store create mode 100644 numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c diff --git a/numpy/core/src/._.DS_Store b/numpy/core/src/._.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..41837bc8eb701c2c8b198f9c5595938388d2c675 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIhYCu0iY;W;207T z1eEv-(ZNuGRne&6Xb6mkz-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeDP&ot`f#!iQ z7|4ZWWELwFr55Lx7A2=Dq~#Z7D`e)Cq~?`m=I15m4jn|NjF3PEQ@v literal 0 HcmV?d00001 diff --git a/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c b/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c new file mode 100644 index 000000000000..97f646f2867e --- /dev/null +++ b/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c @@ -0,0 +1,12 @@ +/** + * AUTOGENERATED DON'T EDIT + * Please make changes to the code generator (distutils/ccompiler_opt.py) + */ +#define NPY__CPU_TARGET_MODE +#define NPY__CPU_TARGET_CURRENT ASIMDHP +#define NPY__CPU_TARGET_NEON +#define NPY__CPU_TARGET_NEON_FP16 +#define NPY__CPU_TARGET_NEON_VFPV4 +#define NPY__CPU_TARGET_ASIMD +#define NPY__CPU_TARGET_ASIMDHP +#include "/home/bangcheng/project/proj-2021348/numpy/core/src/umath/_umath_tests.dispatch.c" diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 04dccc347899..a8d6de3843e5 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -36,7 +36,14 @@ #include #include /* for memcpy */ +#ifdef NPY_HAVE_NEON +#ifdef __aarch64__ +#define vector_size_bytes 16 +#else +#define vector_size_bytes 8 +#endif #define VECTOR_SIZE_BYTES 16 +#endif /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. 
@@ -604,11 +611,6 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp @type@ * ip2 = (@type@ *)args[1]; @type@ * op = (@type@ *)args[2]; npy_intp n = dimensions[0]; -#if defined __aarch64__ - const npy_uintp vector_size_bytes = 16; -#else - const npy_uintp vector_size_bytes = 8; -#endif /* argument one scalar */ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { neon_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); @@ -3759,7 +3761,6 @@ static void neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { #ifdef __aarch64__ - const npy_intp vector_size_bytes = 16; LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ @@ -3779,8 +3780,7 @@ neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } } -#elif defined __aarch32__ - const npy_intp vector_size_bytes = 8; +#else LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ @@ -3805,13 +3805,49 @@ neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) op[i] = ip1[i] @OP@ ip2[i]; } } +/**begin repeat2 + * scalar1 & scalar2 + * + * # scalar_loc = 1, 2# + * # vector_loc = 2, 1# + * # ip1ind = 0, i# + * # ip2ind = i, 0# + */ +static void +neon_binary_scalar@scalar_loc@_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ +#ifdef __aarch64__ + const @Nvtype128@ v@scalar_loc@ = @Nvpre@ld1q_dup_@Nvsuf@(ip@scalar_loc@); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype128@ v@vector_loc@ = @Nvpre@ld1q_@Nvsuf@(&ip@vector_loc@[i]); + @Nvtype128@ v3 = @Nvpre@@VOP@q_@Nvsuf@(v1, v2); + @Nvpre@st1q_@Nvsuf@(&op[i], v3); + } + +#else + const @Nvtype64@ v@scalar_loc@ = @Nvpre@ld1_dup_@Nvsuf@(ip@scalar_loc@); + LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + LOOP_BLOCKED(@type@, vector_size_bytes) { + @Nvtype64@ v@vector_loc@ = @Nvpre@ld1_@Nvsuf@(&ip@vector_loc@[i]); + @Nvtype64@ v3 = @Nvpre@@VOP@_@Nvsuf@(v1, v2); + @Nvpre@st1_@Nvsuf@(&op[i], v3); + } +#endif + LOOP_BLOCKED_END { + op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + } +} +/**end repeat2**/ +/* static void neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { #ifdef __aarch64__ - const npy_intp vector_size_bytes = 16; const @Nvtype128@ a = @Nvpre@ld1q_dup_@Nvsuf@(ip1); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[0] @OP@ ip2[i]; @@ -3821,8 +3857,7 @@ neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i @Nvpre@st1q_@Nvsuf@(&op[i], c); } -#elif defined __aarch32__ - const npy_intp vector_size_bytes = 8; +#elif __aarch32__ const @Nvtype64@ a = @Nvpre@ld1_dup_@Nvsuf@(ip1); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[0] @OP@ ip2[i]; @@ -3843,7 +3878,6 @@ static void neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { #ifdef __aarch64__ - const npy_intp vector_size_bytes = 16; const @Nvtype128@ b = @Nvpre@ld1q_dup_@Nvsuf@(ip2); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[0]; @@ -3853,8 +3887,7 @@ neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i @Nvpre@st1q_@Nvsuf@(&op[i], c); } -#elif defined __aarch32__ - const npy_intp vector_size_bytes = 8; +#elif 
__aarch32__ const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) op[i] = ip1[i] @OP@ ip2[0]; @@ -3869,6 +3902,7 @@ neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i op[i] = ip1[i] @OP@ ip2[0]; } } +*/ /**end repeat1**/ /**end repeat**/ From 867b2b9176e722fdd4e4661f26c6f41fd964aad9 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 15:37:14 +0800 Subject: [PATCH 06/11] update --- numpy/core/src/._.DS_Store | Bin 4096 -> 0 bytes .../src/umath/_umath_tests.dispatch.asimdhp.c | 12 ------------ 2 files changed, 12 deletions(-) delete mode 100644 numpy/core/src/._.DS_Store delete mode 100644 numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c diff --git a/numpy/core/src/._.DS_Store b/numpy/core/src/._.DS_Store deleted file mode 100644 index 41837bc8eb701c2c8b198f9c5595938388d2c675..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIhYCu0iY;W;207T z1eEv-(ZNuGRne&6Xb6mkz-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeDP&ot`f#!iQ z7|4ZWWELwFr55Lx7A2=Dq~#Z7D`e)Cq~?`m=I15m4jn|NjF3PEQ@v diff --git a/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c b/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c deleted file mode 100644 index 97f646f2867e..000000000000 --- a/numpy/core/src/umath/_umath_tests.dispatch.asimdhp.c +++ /dev/null @@ -1,12 +0,0 @@ -/** - * AUTOGENERATED DON'T EDIT - * Please make changes to the code generator (distutils/ccompiler_opt.py) - */ -#define NPY__CPU_TARGET_MODE -#define NPY__CPU_TARGET_CURRENT ASIMDHP -#define NPY__CPU_TARGET_NEON -#define NPY__CPU_TARGET_NEON_FP16 -#define NPY__CPU_TARGET_NEON_VFPV4 -#define NPY__CPU_TARGET_ASIMD -#define NPY__CPU_TARGET_ASIMDHP -#include "/home/bangcheng/project/proj-2021348/numpy/core/src/umath/_umath_tests.dispatch.c" From 8d1d95c7402ed3020eb0cba54c142fb1313edf0d Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 15:53:45 +0800 Subject: [PATCH 07/11] update --- numpy/core/src/umath/simd.inc.src | 63 +------------------------------ 1 file changed, 1 insertion(+), 62 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index a8d6de3843e5..fa2c39b3ae46 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3843,69 +3843,8 @@ neon_binary_scalar@scalar_loc@_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * } } /**end repeat2**/ -/* -static void -neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __aarch64__ - const @Nvtype128@ a = @Nvpre@ld1q_dup_@Nvsuf@(ip1); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - -#elif __aarch32__ - const @Nvtype64@ a = @Nvpre@ld1_dup_@Nvsuf@(ip1); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); - @Nvpre@st1_@Nvsuf@(&op[i], c); - } - -#endif - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} - - -static void -neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef __aarch64__ - const @Nvtype128@ b = @Nvpre@ld1q_dup_@Nvsuf@(ip2); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 
vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - -#elif __aarch32__ - const @Nvtype64@ b = @Nvpre@ld1_dup_@Nvsuf@(ip2); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); - @Nvpre@st1_@Nvsuf@(&op[i], c); - } - -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } -} -*/ - /**end repeat1**/ /**end repeat**/ -#endif /* NPY_HAVENEON */ +#endif /* NPY_HAVE_NEON */ #endif From 708bf277793444676483ff26ff27d29b3236b361 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Wed, 29 Jul 2020 16:19:37 +0800 Subject: [PATCH 08/11] fix macros --- numpy/core/src/umath/simd.inc.src | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index fa2c39b3ae46..1d27fe4864b2 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -42,9 +42,10 @@ #else #define vector_size_bytes 8 #endif -#define VECTOR_SIZE_BYTES 16 #endif +#define VECTOR_SIZE_BYTES 16 + /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. * Very large step size can be as slow as processing it using scalar. The From 96b6b13c27e6ee2423ad5cc79d4761727c74a4c3 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Thu, 30 Jul 2020 01:09:12 +0800 Subject: [PATCH 09/11] transfer neon into universal intrinsics --- numpy/core/src/umath/simd.inc.src | 144 +++++++++++------------------- 1 file changed, 51 insertions(+), 93 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 1d27fe4864b2..d4073fd15ae3 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -28,22 +28,12 @@ #undef __AVX512F__ #endif #endif -#ifdef NPY_HAVE_NEON -#include -#endif +#include "simd/simd.h" #include #include #include #include /* for memcpy */ -#ifdef NPY_HAVE_NEON -#ifdef __aarch64__ -#define vector_size_bytes 16 -#else -#define vector_size_bytes 8 -#endif -#endif - #define VECTOR_SIZE_BYTES 16 /* @@ -516,6 +506,7 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #vector = 1, 1, 0# + * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # */ /**begin repeat1 @@ -564,16 +555,16 @@ static void sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); -#elif @vector@ && defined NPY_HAVE_NEON +#elif @VECTOR@ static void -neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); static void -neon_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, +simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); static void -neon_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); #endif @@ -607,23 +598,23 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } -#elif @vector@ && defined NPY_HAVE_NEON +#elif @VECTOR@ @type@ * ip1 = (@type@ *)args[0]; @type@ * ip2 
= (@type@ *)args[1]; @type@ * op = (@type@ *)args[2]; npy_intp n = dimensions[0]; /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { - neon_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { - neon_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { - neon_binary_@kind@_@TYPE@(op, ip1, ip2, n); + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } #endif @@ -3739,17 +3730,13 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #endif /* NPY_HAVE_SSE2_INTRINSICS */ -#ifdef NPY_HAVE_NEON /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# - * #scalarf = npy_sqrtf, npy_sqrt# - * #Nvtype128 = float32x4_t, float64x2_t# - * #Nvtype64 = float32x2_t, float64x1_t# - * #Nvpre = v, v# - * #Nvsuf = f32, f64# + * #sfx = f32, f64# + * #CHK = , _F64# */ - +#if NPY_SIMD@CHK@ /**begin repeat1 * Arithmetic @@ -3759,93 +3746,64 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) */ static void -neon_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { -#ifdef __aarch64__ - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, a); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ a = @Nvpre@ld1q_@Nvsuf@(&ip1[i]); - @Nvtype128@ b = @Nvpre@ld1q_@Nvsuf@(&ip2[i]); - @Nvtype128@ c = @Nvpre@@VOP@q_@Nvsuf@(a, b); - @Nvpre@st1q_@Nvsuf@(&op[i], c); - } - } - -#else - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, a); - @Nvpre@st1_@Nvsuf@(&op[i], c); + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); + npyv_store_@sfx@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ a = @Nvpre@ld1_@Nvsuf@(&ip1[i]); - @Nvtype64@ b = @Nvpre@ld1_@Nvsuf@(&ip2[i]); - @Nvtype64@ c = @Nvpre@@VOP@_@Nvsuf@(a, b); - @Nvpre@st1_@Nvsuf@(&op[i], c); + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); + npyv_store_@sfx@(&op[i], c); } } - -#endif LOOP_BLOCKED_END { op[i] = ip1[i] @OP@ ip2[i]; } } -/**begin repeat2 - * scalar1 & scalar2 - * - * # scalar_loc = 1, 2# - * # vector_loc = 2, 1# - * # ip1ind = 0, i# - * # ip2ind = i, 0# - */ static void -neon_binary_scalar@scalar_loc@_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) 
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { -#ifdef __aarch64__ - const @Nvtype128@ v@scalar_loc@ = @Nvpre@ld1q_dup_@Nvsuf@(ip@scalar_loc@); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype128@ v@vector_loc@ = @Nvpre@ld1q_@Nvsuf@(&ip@vector_loc@[i]); - @Nvtype128@ v3 = @Nvpre@@VOP@q_@Nvsuf@(v1, v2); - @Nvpre@st1q_@Nvsuf@(&op[i], v3); + const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) + op[i] = ip1[0] @OP@ ip2[i]; + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); } - -#else - const @Nvtype64@ v@scalar_loc@ = @Nvpre@ld1_dup_@Nvsuf@(ip@scalar_loc@); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; - LOOP_BLOCKED(@type@, vector_size_bytes) { - @Nvtype64@ v@vector_loc@ = @Nvpre@ld1_@Nvsuf@(&ip@vector_loc@[i]); - @Nvtype64@ v3 = @Nvpre@@VOP@_@Nvsuf@(v1, v2); - @Nvpre@st1_@Nvsuf@(&op[i], v3); + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; } +} -#endif +static void +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) + op[i] = ip1[i] @OP@ ip2[0]; + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } LOOP_BLOCKED_END { - op[i] = ip1[@ip1ind@] @OP@ ip2[@ip2ind@]; + op[i] = ip1[i] @OP@ ip2[0]; } } -/**end repeat2**/ /**end repeat1**/ +#endif /* NPY_SIMD@CHK@ */ /**end repeat**/ -#endif /* NPY_HAVE_NEON */ #endif From 1286dc45d74cc33369a2dcc849545807908459e9 Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Thu, 30 Jul 2020 02:38:14 +0800 Subject: [PATCH 10/11] avoid defining simd_binary functions for sse2-enabled machines --- numpy/core/src/umath/simd.inc.src | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index d4073fd15ae3..49452c77b2e9 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3728,8 +3728,7 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #undef VECTOR_SIZE_BYTES -#endif /* NPY_HAVE_SSE2_INTRINSICS */ - +#else /* NPY_HAVE_SSE2_INTRINSICS */ /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# @@ -3805,5 +3804,5 @@ simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i /**end repeat1**/ #endif /* NPY_SIMD@CHK@ */ /**end repeat**/ - +#endif #endif From a36153f72e51833af4a2116a51b4c5ca30d77bbd Mon Sep 17 00:00:00 2001 From: Bangcheng Yang Date: Thu, 30 Jul 2020 10:25:53 +0800 Subject: [PATCH 11/11] ENH: Add Neon SIMD implmentation for add, sub, mul, div --- numpy/core/src/umath/simd.inc.src | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 49452c77b2e9..40bb76914fab 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -3727,14 +3727,15 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) /**end repeat**/ #undef VECTOR_SIZE_BYTES - #else /* NPY_HAVE_SSE2_INTRINSICS */ + /**begin repeat * #type = npy_float, npy_double# * #TYPE = FLOAT, 
DOUBLE#
 * #sfx = f32, f64#
 * #CHK = , _F64#
 */
+
#if NPY_SIMD@CHK@
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/
static void
simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] @OP@ ip2[i];
+    }
    /* lots of specializations, to squeeze out max performance */
    if (ip1 == ip2) {
        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
@@ -3774,8 +3776,9 @@ static void
simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH)
-        op[i] = ip1[0] @OP@ ip2[i];
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+        op[i] = ip1[0] @OP@ ip2[i];
+    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
@@ -3790,8 +3793,9 @@ static void
simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH)
-        op[i] = ip1[i] @OP@ ip2[0];
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+        op[i] = ip1[i] @OP@ ip2[0];
+    }
    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);