8000 ENH, SIMD: Ditching the old CPU dispatcher(Arithmetic) · numpy/numpy@0985a73 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0985a73

Browse files
committed
ENH, SIMD: Ditching the old CPU dispatcher(Arithmetic)
The first patch in a series of pull requests that aims to facilitate the migration to our new SIMD interface (NPYV). The overall effort focuses on getting rid of the main umath SIMD source `simd.inc`, which contains almost all SIMD kernels, by splitting it into several dispatch-able sources without changing the base code; this keeps each review small and speeds up progress toward the nominal target. In this patch, we have moved the arithmetic operations on real and complex single/double precision to the new CPU dispatcher. NOTE: previously, the AVX2 and AVX512F SIMD code for single/double precision was not dispatched at runtime.
1 parent d084917 commit 0985a73

10 files changed

+1192
-967
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ numpy/core/src/umath/simd.inc
172172
numpy/core/src/umath/struct_ufunc_test.c
173173
numpy/core/src/umath/test_rational.c
174174
numpy/core/src/umath/umath_tests.c
175+
numpy/core/src/umath/loops_utils.h
175176
numpy/distutils/__config__.py
176177
numpy/linalg/umath_linalg.c
177178
doc/source/**/generated/
@@ -218,3 +219,4 @@ numpy/core/src/_simd/_simd_data.inc
218219
numpy/core/src/_simd/_simd_inc.h
219220
# umath module
220221
numpy/core/src/umath/loops_unary_fp.dispatch.c
222+
numpy/core/src/umath/loops_arithm_fp.dispatch.c

numpy/core/code_generators/generate_umath.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,19 @@ class TypeDescription:
4646
If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
4747
instead of PyUFunc_x_x/PyUFunc_xx_x.
4848
cfunc_alias : str or none, optional
49-
replaces the suffix of C function name instead of using ufunc_name,
50-
e.g. "FLOAT_{cfunc_alias}" instead of "FLOAT_{ufunc_name}" (see make_arrays)
49+
appended to inner loop C function name, e.g. FLOAT_{cfunc_alias} (see make_arrays)
5150
NOTE: it doesn't support 'astype'
5251
simd: list
5352
Available SIMD ufunc loops, dispatched at runtime in specified order
5453
Currently only supported for simples types (see make_arrays)
5554
dispatch: str or None, optional
56-
Dispatch-able source name without its extension '.dispatch.c' that contains the definition of ufunc,
57-
dispatched at runtime depending on the specified targets of the dispatch-able source.
55+
Dispatch-able source name without its extension '.dispatch.c' that
56+
contains the definition of ufunc, dispatched at runtime depending on the
57+
specified targets of the dispatch-able source.
5858
NOTE: it doesn't support 'astype'
5959
"""
60-
def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None, simd=None, dispatch=None):
60+
def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None,
61+
simd=None, dispatch=None):
6162
self.type = type
6263
self.func_data = f
6364
if astype is None:
@@ -96,7 +97,8 @@ def build_func_data(types, f):
9697
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
9798
return func_data
9899

99-
def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, simd=None, dispatch=None):
100+
def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
101+
simd=None, dispatch=None):
100102
if f is not None:
101103
if isinstance(f, str):
102104
func_data = build_func_data(types, f)
@@ -132,7 +134,8 @@ def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None, simd=No
132134
else:
133135
dispt = None
134136
tds.append(TypeDescription(
135-
t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias, simd=simdt, dispatch=dispt
137+
t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias,
138+
simd=simdt, dispatch=dispt
136139
))
137140
return tds
138141

@@ -287,7 +290,7 @@ def english_upper(s):
287290
Ufunc(2, 1, Zero,
288291
docstrings.get('numpy.core.umath.add'),
289292
'PyUFunc_AdditionTypeResolver',
290-
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
293+
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
291294
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
292295
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
293296
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -298,7 +301,7 @@ def english_upper(s):
298301
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
299302
docstrings.get('numpy.core.umath.subtract'),
300303
'PyUFunc_SubtractionTypeResolver',
301-
TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
304+
TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
302305
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
303306
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
304307
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -309,7 +312,7 @@ def english_upper(s):
309312
Ufunc(2, 1, One,
310313
docstrings.get('numpy.core.umath.multiply'),
311314
'PyUFunc_MultiplicationTypeResolver',
312-
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
315+
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
313316
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
314317
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
315318
TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -333,10 +336,10 @@ def english_upper(s):
333336
Ufunc(2, 1, None, # One is only a unit to the right, not the left
334337
docstrings.get('numpy.core.umath.true_divide'),
335338
'PyUFunc_TrueDivisionTypeResolver',
336-
TD(flts+cmplx),
337-
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
338-
TypeDescription('m', FullTypeDescr, 'md', 'm'),
339-
TypeDescription('m', FullTypeDescr, 'mm', 'd'),
339+
TD(flts+cmplx, cfunc_alias='divide', dispatch=[('loops_arithm_fp', 'fd')]),
340+
[TypeDescription('m', FullTypeDescr, 'mq', 'm', cfunc_alias='divide'),
341+
TypeDescription('m', FullTypeDescr, 'md', 'm', cfunc_alias='divide'),
342+
TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
340343
],
341344
TD(O, f='PyNumber_TrueDivide'),
342345
),

numpy/core/setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,8 +916,10 @@ def generate_umath_c(ext, build_dir):
916916
join('src', 'umath', 'funcs.inc.src'),
917917
join('src', 'umath', 'simd.inc.src'),
918918
join('src', 'umath', 'loops.h.src'),
919+
join('src', 'umath', 'loops_utils.h.src'),
919920
join('src', 'umath', 'loops.c.src'),
920921
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
922+
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
921923
join('src', 'umath', 'matmul.h.src'),
922924
join('src', 'umath', 'matmul.c.src'),
923925
join('src', 'umath', 'clip.h.src'),

numpy/core/src/umath/fast_loop_macros.h

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,5 +237,117 @@ abs_ptrdiff(char *a, char *b)
237237
TYPE io1 = *(TYPE *)iop1; \
238238
BINARY_REDUCE_LOOP_INNER
239239

240+
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
241+
((steps[0] == esize) && \
242+
(steps[1] == esize) && \
243+
(steps[2] == esize) && \
244+
(abs_ptrdiff(args[2], args[0]) >= vsize) && \
245+
(abs_ptrdiff(args[2], args[1]) >= vsize))
246+
247+
/*
248+
* stride is equal to element size and input and destination are equal or
249+
* don't overlap within one register. The check of the steps against
250+
* esize also quarantees that steps are >= 0.
251+
*/
252+
#define IS_BLOCKABLE_UNARY(esize, vsize) \
253+
(steps[0] == (esize) && steps[0] == steps[1] && \
254+
(npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
255+
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
256+
((abs_ptrdiff(args[1], args[0]) == 0))))
257+
258+
/*
259+
* Avoid using SIMD for very large step sizes for several reasons:
260+
* 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
261+
* in which case we need two i64gather instructions and an additional vinsertf32x8
262+
* instruction to load a single zmm register (since one i64gather instruction
263+
* loads into a ymm register). This is not ideal for performance.
264+
* 2) Gather and scatter instructions can be slow when the loads/stores
265+
* cross page boundaries.
266+
*
267+
* We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
268+
* element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
269+
* ensures this. The condition also requires that the input and output arrays
270+
* should have no overlap in memory.
271+
*/
272+
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
273+
((labs(steps[0]) < MAX_STEP_SIZE) && \
274+
(labs(steps[1]) < MAX_STEP_SIZE) && \
275+
(labs(steps[2]) < MAX_STEP_SIZE) && \
276+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
277+
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
278+
279+
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
280+
((labs(steps[0]) < MAX_STEP_SIZE) && \
281+
(labs(steps[1]) < MAX_STEP_SIZE) && \
282+
(labs(steps[2]) < MAX_STEP_SIZE) && \
283+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
284+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
285+
286+
/*
287+
* 1) Output should be contiguous, can handle strided input data
288+
* 2) Input step should be smaller than MAX_STEP_SIZE for performance
289+
* 3) Input and output arrays should have no overlap in memory
290+
*/
291+
#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
292+
((steps[0] & (esizein-1)) == 0 && \
293+
steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
294+
(nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
295+
296+
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
297+
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
298+
npy_is_aligned(args[1], (esize)) && \
299+
npy_is_aligned(args[0], (esize)))
300+
301+
#define IS_BLOCKABLE_BINARY(esize, vsize) \
302+
(steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
303+
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
304+
npy_is_aligned(args[0], (esize)) && \
305+
(abs_ptrdiff(args[2], args[0]) >= (vsize) || \
306+
abs_ptrdiff(args[2], args[0]) == 0) && \
307+
(abs_ptrdiff(args[2], args[1]) >= (vsize) || \
308+
abs_ptrdiff(args[2], args[1]) >= 0))
309+
310+
#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
311+
(steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
312+
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
313+
((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
314+
(abs_ptrdiff(args[2], args[1]) == 0)) && \
315+
abs_ptrdiff(args[2], args[0]) >= (esize))
316+
317+
#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
318+
(steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
319+
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
320+
((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
321+
(abs_ptrdiff(args[2], args[0]) == 0)) && \
322+
abs_ptrdiff(args[2], args[1]) >= (esize))
323+
324+
#undef abs_ptrdiff
325+
326+
#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
327+
(steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
328+
npy_is_aligned(args[1], (esize)) && \
329+
npy_is_aligned(args[0], (esize)))
330+
331+
#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
332+
(steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
333+
npy_is_aligned(args[1], (esize)))
334+
335+
#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
336+
(steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
337+
npy_is_aligned(args[0], (esize)))
338+
339+
/* align var to alignment */
340+
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
341+
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
342+
alignment, n);\
343+
for(i = 0; i < peel; i++)
344+
345+
#define LOOP_BLOCKED(type, vsize)\
346+
for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
347+
i += (vsize / sizeof(type)))
348+
349+
#define LOOP_BLOCKED_END\
350+
for (; i < n; i++)
351+
240352

241353
#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */

0 commit comments

Comments
 (0)
0