@@ -192,7 +192,6 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
/*******************************************************************************
 ** Defining the SIMD kernels
 ******************************************************************************/
-
/**begin repeat
 * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
 * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64#
@@ -205,6 +204,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
 */
#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@))
+
#if @is_fp@ && !@fp_only@
    #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs
    #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@
@@ -217,119 +217,145 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
static inline void
simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len)
{
-    // Note, 8x unroll was chosen for best results on Apple M1
-    const int vectorsPerLoop = 8;
-    const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@);
-    npy_intp elemPerLoop = vectorsPerLoop * elemPerVector;
-
-    npy_intp i = 0;
-    npyv_lanetype_@sfx@ io1 = op1[0];
-
-    // SIMD if possible
-    if((i+elemPerLoop) <= len){
-        npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]);
-        npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]);
-        npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]);
-        npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]);
-        npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]);
-        npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]);
-        npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]);
-        npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]);
-
-        i += elemPerLoop;
-        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
-            npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]);
-            npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]);
-            npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]);
-            npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]);
-            npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]);
-            npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]);
-            npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]);
-            npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]);
-
-            m0 = V_INTRIN(m0, v0);
-            m1 = V_INTRIN(m1, v1);
-            m2 = V_INTRIN(m2, v2);
-            m3 = V_INTRIN(m3, v3);
-            m4 = V_INTRIN(m4, v4);
-            m5 = V_INTRIN(m5, v5);
-            m6 = V_INTRIN(m6, v6);
-            m7 = V_INTRIN(m7, v7);
-        }
-
-        m0 = V_INTRIN(m0, m1);
-        m2 = V_INTRIN(m2, m3);
-        m4 = V_INTRIN(m4, m5);
-        m6 = V_INTRIN(m6, m7);
-
-        m0 = V_INTRIN(m0, m2);
-        m4 = V_INTRIN(m4, m6);
-
-        m0 = V_INTRIN(m0, m4);
-
-        npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0);
-
-        io1 = SCALAR_OP(io1, r);
+    if (len < 1) {
+        return;
    }
-
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep*8;
+    npyv_@sfx@ acc = npyv_setall_@sfx@(op1[0]);
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);
+    #endif
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
+        npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
+
+        npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
+        npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
+        npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
+
+        npyv_@sfx@ r01 = V_INTRIN(v0, v1);
+        npyv_@sfx@ r23 = V_INTRIN(v2, v3);
+        npyv_@sfx@ r45 = V_INTRIN(v4, v5);
+        npyv_@sfx@ r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {
+        acc = V_INTRIN(acc, npyv_load_@sfx@(ip));
+    }
+    npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(acc);
    // Scalar - finish up any remaining iterations
-    for(; i< len; ++i) {
-        const npyv_lanetype_@sfx@ in2 = ip[i];
-        io1 = SCALAR_OP(io1, in2);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_@sfx@ in2 = *ip;
+        r = SCALAR_OP(r, in2);
    }
-    op1[0] = io1;
+    op1[0] = r;
}

// contiguous inputs and output.
static inline void
simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2,
                               npyv_lanetype_@sfx@ *op1, npy_intp len)
{
+    #if NPY_SIMD_WIDTH == 128
    // Note, 6x unroll was chosen for best results on Apple M1
    const int vectorsPerLoop = 6;
-    const int elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@);
+    #else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+    #endif
+    const int elemPerVector = npyv_nlanes_@sfx@;
    int elemPerLoop = vectorsPerLoop * elemPerVector;

    npy_intp i = 0;

-    for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
-        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
-        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
-        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
-        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
-        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
-        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
-
-        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
-        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
-        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
-        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
-        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
-        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
-
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
+        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
+        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
+        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
+        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
+        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
+    #endif
        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
        npyv_@sfx@ m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
        npyv_@sfx@ m2 = V_INTRIN(v2, u2);
        npyv_@sfx@ m3 = V_INTRIN(v3, u3);
        npyv_@sfx@ m4 = V_INTRIN(v4, u4);
        npyv_@sfx@ m5 = V_INTRIN(v5, u5);
-
+    #endif
        npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0);
        npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
        npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2);
        npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3);
        npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4);
        npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip1 + i);
+        npyv_@sfx@ u0 = npyv_load_@sfx@(ip2 + i);
+        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
+        npyv_store_@sfx@(op1 + i, m0);
    }
-
    // Scalar - finish up any remaining iterations
-    for(; i< len; ++i){
+    for (; i < len; ++i) {
        const npyv_lanetype_@sfx@ in1 = ip1[i];
        const npyv_lanetype_@sfx@ in2 = ip2[i];
-
        op1[i] = SCALAR_OP(in1, in2);
    }
}
+// non-contiguous for float 32/64-bit memory access
+#if @is_fp@
+static inline void
+simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
+                           const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
+                           npyv_lanetype_@sfx@ *op1, npy_intp sop1,
+                           npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_@sfx@ a, b;
+        if (sip1 == 1) {
+            a = npyv_load_@sfx@(ip1);
+        } else {
+            a = npyv_loadn_@sfx@(ip1, sip1);
+        }
+        if (sip2 == 1) {
+            b = npyv_load_@sfx@(ip2);
+        } else {
+            b = npyv_loadn_@sfx@(ip2, sip2);
+        }
+        npyv_@sfx@ r = V_INTRIN(a, b);
+        if (sop1 == 1) {
+            npyv_store_@sfx@(op1, r);
+        } else {
+            npyv_storen_@sfx@(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_@sfx@ a = *ip1;
+        const npyv_lanetype_@sfx@ b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif

#undef V_INTRIN
#undef V_REDUCE_INTRIN
@@ -415,6 +441,20 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
            );
            goto clear_fp;
        }
+        // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && @is_fp@
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_@intrin@)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
    }
#endif // TO_SIMD_SFX
#ifndef NPY_DISABLE_OPTIMIZATION
@@ -495,7 +535,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
        *((@type@ *)op1) = SCALAR_OP(in1, in2);
    }
#ifdef TO_SIMD_SFX
-clear_fp:;
+clear_fp:
+    npyv_cleanup();
#endif
#if @is_fp@
    npy_clear_floatstatus_barrier((char*)dimensions);
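For readers unfamiliar with NumPy's universal-intrinsics templates, the reduce kernel added above works in three stages: an 8-vector unrolled block combined through a pairwise tree, a single-vector loop, and a scalar tail. The standalone C sketch below only models that loop structure with plain floats; it is not NumPy code, vstep merely stands in for npyv_nlanes_@sfx@, and fmaxf stands in for the NaN-propagating V_INTRIN (npyv_@intrin@n_@sfx@) and SCALAR_OP, so NaN handling differs from the real kernel.

#include <math.h>
#include <stdio.h>
#include <stddef.h>

/* Model of the reduction structure: 8x-unrolled blocks, then single
 * "vector"-sized blocks, then a scalar tail, seeded with op1[0]. */
static float
reduce_max_model(const float *ip, float seed, size_t len)
{
    const size_t vstep = 4;            /* stands in for npyv_nlanes_@sfx@ */
    const size_t wstep = vstep * 8;    /* 8x unrolled block, as in the patch */
    float acc = seed;                  /* models npyv_setall_@sfx@(op1[0]) */
    for (; len >= wstep; len -= wstep, ip += wstep) {
        float block = ip[0];           /* models the pairwise V_INTRIN tree */
        for (size_t k = 1; k < wstep; ++k) {
            block = fmaxf(block, ip[k]);
        }
        acc = fmaxf(acc, block);
    }
    for (; len >= vstep; len -= vstep, ip += vstep) {  /* single-vector loop */
        for (size_t k = 0; k < vstep; ++k) {
            acc = fmaxf(acc, ip[k]);
        }
    }
    for (; len > 0; --len, ++ip) {     /* scalar tail, mirrors the SCALAR_OP loop */
        acc = fmaxf(acc, *ip);
    }
    return acc;
}

int main(void)
{
    float data[37];
    for (int i = 0; i < 37; ++i) {
        data[i] = (float)((i * 7) % 31);
    }
    printf("max = %g\n", (double)reduce_max_model(data, data[0], sizeof(data)/sizeof(data[0])));
    return 0;
}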