ENH, SIMD: several improvements for max/min · numpy/numpy@5fca2bf · GitHub


Commit 5fca2bf

ENH, SIMD: several improvements for max/min
- Avoid unrolling the vectorized max/min loops by x6/x8 when the SIMD width is > 128, to avoid a memory-bandwidth bottleneck
- Tune the max/min reduction
- Vectorize non-contiguous max/min
- Fix code style
- Call npyv_cleanup() at the end of the inner loop
1 parent 1b55815 commit 5fca2bf
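
Two of the points above are worth unpacking. Min/max form a serial dependency chain, so the rewritten reduction loads eight vectors per iteration and combines them pairwise in a tree before folding them into a single accumulator, which keeps that chain short; and the elementwise kernels unroll less aggressively on 256/512-bit builds, where memory bandwidth rather than ALU throughput is the limit. A rough plain-C analogue of the accumulator idea (illustrative only: invented names, no universal intrinsics, and the NaN handling of the real kernels is ignored):

#include <stddef.h>

// Illustrative scalar analogue of the tuned reduction: keep four independent
// partial maxima so the max() dependency chain does not serialize the loop,
// combine them once at the end, then finish the tail with a scalar loop.
// The real kernel works on npyv_@sfx@ vectors and combines eight of them
// pairwise per iteration before folding into one accumulator.
static double
max_reduce_sketch(const double *ip, size_t len, double initial)
{
    double m0 = initial, m1 = initial, m2 = initial, m3 = initial;
    size_t i = 0;
    for (; i + 4 <= len; i += 4) {
        m0 = ip[i + 0] > m0 ? ip[i + 0] : m0;
        m1 = ip[i + 1] > m1 ? ip[i + 1] : m1;
        m2 = ip[i + 2] > m2 ? ip[i + 2] : m2;
        m3 = ip[i + 3] > m3 ? ip[i + 3] : m3;
    }
    m0 = m1 > m0 ? m1 : m0;
    m2 = m3 > m2 ? m3 : m2;
    m0 = m2 > m0 ? m2 : m0;
    for (; i < len; ++i) {           // scalar tail
        m0 = ip[i] > m0 ? ip[i] : m0;
    }
    return m0;
}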

1 file changed: 121 additions, 80 deletions

numpy/core/src/umath/loops_minmax.dispatch.c.src

Lines changed: 121 additions & 80 deletions
@@ -192,7 +192,6 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
 /*******************************************************************************
  ** Defining the SIMD kernels
  ******************************************************************************/
-
 /**begin repeat
  * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
  * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64#
@@ -205,6 +204,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
  */
 #define SCALAR_OP scalar_@intrin@_@scalar_sfx@
 #if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@))
+
 #if @is_fp@ && !@fp_only@
     #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs
     #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@
@@ -217,119 +217,145 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
 static inline void
 simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len)
 {
-    // Note, 8x unroll was chosen for best results on Apple M1
-    const int vectorsPerLoop = 8;
-    const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@);
-    npy_intp elemPerLoop = vectorsPerLoop * elemPerVector;
-
-    npy_intp i = 0;
-    npyv_lanetype_@sfx@ io1 = op1[0];
-
-    // SIMD if possible
-    if((i+elemPerLoop) <= len){
-        npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]);
-        npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]);
-        npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]);
-        npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]);
-        npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]);
-        npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]);
-        npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]);
-        npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]);
-
-        i += elemPerLoop;
-        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
-            npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]);
-            npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]);
-            npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]);
-            npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]);
-            npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]);
-            npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]);
-            npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]);
-            npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]);
-
-            m0 = V_INTRIN(m0, v0);
-            m1 = V_INTRIN(m1, v1);
-            m2 = V_INTRIN(m2, v2);
-            m3 = V_INTRIN(m3, v3);
-            m4 = V_INTRIN(m4, v4);
-            m5 = V_INTRIN(m5, v5);
-            m6 = V_INTRIN(m6, v6);
-            m7 = V_INTRIN(m7, v7);
-        }
-
-        m0 = V_INTRIN(m0, m1);
-        m2 = V_INTRIN(m2, m3);
-        m4 = V_INTRIN(m4, m5);
-        m6 = V_INTRIN(m6, m7);
-
-        m0 = V_INTRIN(m0, m2);
-        m4 = V_INTRIN(m4, m6);
-
-        m0 = V_INTRIN(m0, m4);
-
-        npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0);
-
-        io1 = SCALAR_OP(io1, r);
+    if (len < 1) {
+        return;
     }
-
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep*8;
+    npyv_@sfx@ acc = npyv_setall_@sfx@(op1[0]);
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);
+    #endif
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
+        npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
+
+        npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
+        npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
+        npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
+
+        npyv_@sfx@ r01 = V_INTRIN(v0, v1);
+        npyv_@sfx@ r23 = V_INTRIN(v2, v3);
+        npyv_@sfx@ r45 = V_INTRIN(v4, v5);
+        npyv_@sfx@ r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {
+        acc = V_INTRIN(acc, npyv_load_@sfx@(ip));
+    }
+    npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(acc);
     // Scalar - finish up any remaining iterations
-    for(; i<len; ++i){
-        const npyv_lanetype_@sfx@ in2 = ip[i];
-        io1 = SCALAR_OP(io1, in2);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_@sfx@ in2 = *ip;
+        r = SCALAR_OP(r, in2);
    }
-    op1[0] = io1;
+    op1[0] = r;
 }
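
On SSE2 builds the new main loop also issues a software prefetch for the next block via NPY_PREFETCH(ip + wstep, 0, 3). As a hedged sketch of what such a read hint with maximum temporal locality amounts to on GCC or Clang (NumPy's macro abstracts over compilers, so its exact expansion may differ):

// Sketch only: rw = 0 requests a read prefetch, locality = 3 asks for
// maximum temporal locality. Both arguments must be compile-time constants.
static inline void
prefetch_read_hint(const void *addr)
{
#if defined(__GNUC__) || defined(__clang__)
    __builtin_prefetch(addr, 0, 3);
#else
    (void)addr;   // no portable fallback; do nothing
#endif
}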
 
 // contiguous inputs and output.
 static inline void
 simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2,
                                npyv_lanetype_@sfx@ *op1, npy_intp len)
 {
+#if NPY_SIMD_WIDTH == 128
     // Note, 6x unroll was chosen for best results on Apple M1
     const int vectorsPerLoop = 6;
-    const int elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@);
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_@sfx@;
     int elemPerLoop = vectorsPerLoop * elemPerVector;
 
     npy_intp i = 0;
 
-    for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
-        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
-        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
-        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
-        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
-        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
-        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
-
-        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
-        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
-        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
-        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
-        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
-        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
-
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
+#if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
+        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
+#endif
+        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
+        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
+#if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
+        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
+        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
+        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
+#endif
         npyv_@sfx@ m0 = V_INTRIN(v0, u0);
         npyv_@sfx@ m1 = V_INTRIN(v1, u1);
+#if NPY_SIMD_WIDTH == 128
         npyv_@sfx@ m2 = V_INTRIN(v2, u2);
         npyv_@sfx@ m3 = V_INTRIN(v3, u3);
         npyv_@sfx@ m4 = V_INTRIN(v4, u4);
         npyv_@sfx@ m5 = V_INTRIN(v5, u5);
-
+#endif
         npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0);
         npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1);
+#if NPY_SIMD_WIDTH == 128
         npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2);
         npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3);
         npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4);
         npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5);
+#endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip1 + i);
+        npyv_@sfx@ u0 = npyv_load_@sfx@(ip2 + i);
+        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
+        npyv_store_@sfx@(op1 + i, m0);
     }
-
     // Scalar - finish up any remaining iterations
-    for(; i<len; ++i){
+    for (; i < len; ++i) {
         const npyv_lanetype_@sfx@ in1 = ip1[i];
         const npyv_lanetype_@sfx@ in2 = ip2[i];
-
         op1[i] = SCALAR_OP(in1, in2);
     }
 }
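
The contiguous binary kernel now picks its unroll factor at compile time: six vectors per iteration on 128-bit builds (the value tuned on Apple M1), but only two on wider builds, where a pair of 256/512-bit loads per input stream already keeps the memory subsystem busy. Stripped of the template suffixes, the selection boils down to a small preprocessor switch; MY_SIMD_WIDTH below is a stand-in for NPY_SIMD_WIDTH, used only for illustration:

// Hedged sketch of the width-dependent unroll choice, outside the template.
#define MY_SIMD_WIDTH 256            // assumption for the example

#if MY_SIMD_WIDTH == 128
    // narrow vectors: unroll more to keep the ALUs busy
    #define VECTORS_PER_LOOP 6
#else
    // wide vectors: fewer iterations in flight, memory bandwidth is the limit
    #define VECTORS_PER_LOOP 2
#endif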
+// non-contiguous for float 32/64-bit memory access
+#if @is_fp@
+static inline void
+simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
+                           const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
+                           npyv_lanetype_@sfx@ *op1, npy_intp sop1,
+                           npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_@sfx@ a, b;
+        if (sip1 == 1) {
+            a = npyv_load_@sfx@(ip1);
+        } else {
+            a = npyv_loadn_@sfx@(ip1, sip1);
+        }
+        if (sip2 == 1) {
+            b = npyv_load_@sfx@(ip2);
+        } else {
+            b = npyv_loadn_@sfx@(ip2, sip2);
+        }
+        npyv_@sfx@ r = V_INTRIN(a, b);
+        if (sop1 == 1) {
+            npyv_store_@sfx@(op1, r);
+        } else {
+            npyv_storen_@sfx@(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_@sfx@ a = *ip1;
+        const npyv_lanetype_@sfx@ b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
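
In this new strided kernel, npyv_loadn/npyv_storen take a stride measured in elements, and the cheaper contiguous load/store is used whenever a stride happens to be 1. In plain scalar C, the computation it performs is simply the following (names invented for the sketch):

#include <stddef.h>

// Scalar picture of the strided binary kernel: element strides, one output
// element per step. The SIMD version above does vstep of these per iteration;
// the NaN handling of the real kernels is omitted here.
static void
strided_max_sketch(const double *ip1, ptrdiff_t sip1,
                   const double *ip2, ptrdiff_t sip2,
                   double *op1, ptrdiff_t sop1, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        const double a = ip1[i * sip1];
        const double b = ip2[i * sip2];
        op1[i * sop1] = a > b ? a : b;
    }
}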
 
 #undef V_INTRIN
 #undef V_REDUCE_INTRIN
@@ -415,6 +441,20 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
             );
             goto clear_fp;
         }
+        // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && @is_fp@
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_@intrin@)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
     }
 #endif // TO_SIMD_SFX
 #ifndef NPY_DISABLE_OPTIMIZATION
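
This dispatch addition routes strided floating-point inputs to the new non-contiguous kernel, except on NEON, where, per the comment, unrolled scalar code beats non-contiguous vector loads/stores. The ufunc inner loop hands the dispatcher byte strides (is1, is2, os1), while the SIMD kernel wants element strides, hence the divisions by sizeof(STYPE); npyv_loadable_stride/npyv_storable_stride then check, roughly, that the strided intrinsics can handle the resulting stride on the current target. A hedged sketch of the byte-to-element conversion such a path relies on (helper name invented; whether and where NumPy checks divisibility is not shown in this hunk):

#include <stddef.h>
#include <stdbool.h>

// Byte strides only convert cleanly to element strides when they are an
// exact multiple of the element size; otherwise the scalar path is used.
static bool
byte_stride_to_elems(ptrdiff_t byte_stride, size_t elem_size,
                     ptrdiff_t *elem_stride)
{
    if (byte_stride % (ptrdiff_t)elem_size != 0) {
        return false;            // fall back to the scalar inner loop
    }
    *elem_stride = byte_stride / (ptrdiff_t)elem_size;
    return true;
}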
@@ -495,7 +535,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
         *((@type@ *)op1) = SCALAR_OP(in1, in2);
     }
 #ifdef TO_SIMD_SFX
-clear_fp:;
+clear_fp:
+    npyv_cleanup();
 #endif
 #if @is_fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
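
The clear_fp label now ends the inner loop with npyv_cleanup() instead of an empty statement. On AVX-capable x86 builds this kind of cleanup typically zeroes the upper YMM/ZMM state (a vzeroupper-style operation) so that later SSE code avoids AVX-to-SSE transition penalties; on targets with nothing to clean up it compiles to a no-op. A hedged sketch of the pattern with a stand-in macro (the mapping below is an assumption for illustration, not npyv_cleanup's actual definition):

#if defined(__AVX__)
    #include <immintrin.h>
    #define MY_SIMD_CLEANUP() _mm256_zeroupper()
#else
    #define MY_SIMD_CLEANUP() ((void)0)
#endif

static void
simd_region_sketch(void)
{
    /* ... vectorized work would run here ... */
    MY_SIMD_CLEANUP();   // leave vector state clean before scalar code resumes
}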

0 commit comments
