8000 Merge pull request #17985 from seiko2plus/ditch_simd_arithmetic · numpy/numpy@f7bc512 · GitHub
[go: up one dir, main page]

Skip to content

Commit f7bc512

Browse files
authored
Merge pull request #17985 from seiko2plus/ditch_simd_arithmetic
ENH, SIMD: Ditching the old CPU dispatcher(Arithmetic)
2 parents 811f9ee + 81bb563 commit f7bc512

10 files changed

+1226
-986
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ numpy/core/src/umath/simd.inc
170170
numpy/core/src/umath/struct_ufunc_test.c
171171
numpy/core/src/umath/test_rational.c
172172
numpy/core/src/umath/umath_tests.c
173+
numpy/core/src/umath/loops_utils.h
173174
numpy/distutils/__config__.py
174175
numpy/linalg/umath_linalg.c
175176
doc/source/**/generated/
@@ -216,3 +217,4 @@ numpy/core/src/_simd/_simd_data.inc
216217
numpy/core/src/_simd/_simd_inc.h
217218
# umath module
218219
numpy/core/src/umath/loops_unary_fp.dispatch.c
220+
numpy/core/src/umath/loops_arithm_fp.dispatch.c

numpy/core/code_generators/generate_umath.py

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,20 @@ class TypeDescription:
4545
astype : dict or None, optional
4646
If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
4747
instead of PyUFunc_x_x/PyUFunc_xx_x.
48+
cfunc_alias : str or none, optional
49+
Appended to inner loop C function name, e.g., FLOAT_{cfunc_alias}. See make_arrays.
50+
NOTE: it doesn't support 'astype'
4851
simd: list
4952
Available SIMD ufunc loops, dispatched at runtime in specified order
5053
Currently only supported for simples types (see make_arrays)
51-
dispatch: list
52-
Available SIMD ufunc loops, dispatched at runtime in specified order
53-
Currently only supported for simples types (see make_arrays)
54+
dispatch: str or None, optional
55+
Dispatch-able source name without its extension '.dispatch.c' that
56+
contains the definition of ufunc, dispatched at runtime depending on the
57+
specified targets of the dispatch-able source.
58+
NOTE: it doesn't support 'astype'
5459
"""
55-
def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
60+
def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None,
61+
simd=None, dispatch=None):
5662
self.type = type
5763
self.func_data = f
5864
if astype is None:
@@ -64,6 +70,7 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dis
6470
if out is not None:
6571
out = out.replace('P', type)
6672
self.out = out
73+
self.cfunc_alias = cfunc_alias
6774
self.simd = simd
6875
self.dispatch = dispatch
6976

@@ -90,7 +97,8 @@ def build_func_data(types, f):
9097
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
9198
return func_data
9299

93-
def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
100+
def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
101+
simd=None, dispatch=None):
94102
if f is not None:
95103
if isinstance(f, str):
96104
func_data = build_func_data(types, f)
@@ -119,13 +127,15 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None)
119127
simdt = [k for k, v in simd if t in v]
120128
else:
121129
simdt = []
130+
122131
# [(dispatch file name without extension '.dispatch.c*', list of types)]
123132
if dispatch:
124-
dispt = [k for k, v in dispatch if t in v]
133+
dispt = ([k for k, v in dispatch if t in v]+[None])[0]
125134
else:
126-
dispt = []
135+
dispt = None
127136
tds.append(TypeDescription(
128-
t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
137+
t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias,
138+
simd=simdt, dispatch=dispt
129139
))
130140
return tds
131141

@@ -280,7 +290,7 @@ def english_upper(s):
280290
Ufunc(2, 1, Zero,
281291
docstrings.get('numpy.core.umath.add'),
282292
'PyUFunc_AdditionTypeResolver',
283-
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
293+
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
284294
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
285295
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
286296
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -291,7 +301,7 @@ def english_upper(s):
291301
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
292302
docstrings.get('numpy.core.umath.subtract'),
293303
'PyUFunc_SubtractionTypeResolver',
294-
TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
304+
TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
295305
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
296306
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
297307
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -302,7 +312,7 @@ def english_upper(s):
302312
Ufunc(2, 1, One,
303313
docstrings.get('numpy.core.umath.multiply'),
304314
'PyUFunc_MultiplicationTypeResolver',
305-
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
315+
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
306316
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
307317
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
308318
TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -326,10 +336,10 @@ def english_upper(s):
326336
Ufunc(2, 1, None, # One is only a unit to the right, not the left
327337
docstrings.get('numpy.core.umath.true_divide'),
328338
'PyUFunc_TrueDivisionTypeResolver',
329-
TD(flts+cmplx),
330-
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
331-
TypeDescription('m', FullTypeDescr, 'md', 'm'),
332-
TypeDescription('m', FullTypeDescr, 'mm', 'd'),
339+
TD(flts+cmplx, cfunc_alias='divide', dispatch=[('loops_arithm_fp', 'fd')]),
340+
[TypeDescription('m', FullTypeDescr, 'mq', 'm', cfunc_alias='divide'),
341+
TypeDescription('m', FullTypeDescr, 'md', 'm', cfunc_alias='divide'),
342+
TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
333343
],
334344
TD(O, f='PyNumber_TrueDivide'),
335345
),
@@ -1000,6 +1010,7 @@ def make_arrays(funcdict):
10001010
# later
10011011
code1list = []
10021012
code2list = []
1013+
dispdict = {}
10031014
names = sorted(funcdict.keys())
10041015
for name in names:
10051016
uf = funcdict[name]
@@ -1010,44 +1021,33 @@ def make_arrays(funcdict):
10101021
sub = 0
10111022

10121023
for t in uf.type_descriptions:
1024+
cfunc_alias = t.cfunc_alias if t.cfunc_alias else name
1025+
cfunc_fname = None
10131026
if t.func_data is FullTypeDescr:
10141027
tname = english_upper(chartoname[t.type])
10151028
datalist.append('(void *)NULL')
1016-
funclist.append(
1017-
'%s_%s_%s_%s' % (tname, t.in_, t.out, name))
1029+
cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
10181030
elif isinstance(t.func_data, FuncNameSuffix):
10191031
datalist.append('(void *)NULL')
10201032
tname = english_upper(chartoname[t.type])
1021-
funclist.append(
1022-
'%s_%s_%s' % (tname, name, t.func_data.suffix))
1033+
cfunc_fname = f"{tname}_{cfunc_alias}_{t.func_data.suffix}"
10231034
elif t.func_data is None:
10241035
datalist.append('(void *)NULL')
10251036
tname = english_upper(chartoname[t.type])
1026-
funclist.append('%s_%s' % (tname, name))
1037+
cfunc_fname = f"{tname}_{cfunc_alias}"
10271038
if t.simd is not None:
10281039
for vt in t.simd:
10291040
code2list.append(textwrap.dedent("""\
10301041
#ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
10311042
if (NPY_CPU_HAVE({ISA})) {{
1032-
{fname}_functions[{idx}] = {type}_{fname}_{isa};
1043+
{fname}_functions[{idx}] = {cname}_{isa};
10331044
}}
10341045
#endif
10351046
""").format(
10361047
ISA=vt.upper(), isa=vt,
1037-
fname=name, type=tname, idx=k
1038-
))
1039-
if t.dispatch is not None:
1040-
for dname in t.dispatch:
1041-
code2list.append(textwrap.dedent("""\
1042-
#ifndef NPY_DISABLE_OPTIMIZATION
1043-
#include "{dname}.dispatch.h"
1044-
#endif
1045-
NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name});
1046-
""").format(
1047-
dname=dname, name=name, tname=tname, k=k
1048+
fname=name, cname=cfunc_fname, idx=k
10481049
))
10491050
else:
1050-
funclist.append('NULL')
10511051
try:
10521052
thedict = arity_lookup[uf.nin, uf.nout]
10531053
except KeyError as e:
@@ -1077,6 +1077,13 @@ def make_arrays(funcdict):
10771077
#datalist.append('(void *)%s' % t.func_data)
10781078
sub += 1
10791079

1080+
if cfunc_fname:
1081+
funclist.append(cfunc_fname)
1082+
if t.dispatch:
1083+
dispdict.setdefault(t.dispatch, []).append((name, k, cfunc_fname))
1084+
else:
1085+
funclist.append('NULL')
1086+
10801087
for x in t.in_ + t.out:
10811088
siglist.append('NPY_%s' % (english_upper(chartoname[x]),))
10821089

@@ -1091,6 +1098,17 @@ def make_arrays(funcdict):
10911098
% (name, datanames))
10921099
code1list.append("static char %s_signatures[] = {%s};"
10931100
% (name, signames))
1101+
1102+
for dname, funcs in dispdict.items():
1103+
code2list.append(textwrap.dedent(f"""
1104+
#ifndef NPY_DISABLE_OPTIMIZATION
1105+
#include "{dname}.dispatch.h"
1106+
#endif
1107+
"""))
1108+
for (ufunc_name, func_idx, cfunc_name) in funcs:
1109+
code2list.append(textwrap.dedent(f"""\
1110+
NPY_CPU_DISPATCH_CALL_XB({ufunc_name}_functions[{func_idx}] = {cfunc_name});
1111+
"""))
10941112
return "\n".join(code1list), "\n".join(code2list)
10951113

10961114
def make_ufuncs(funcdict):

numpy/core/setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,8 +916,10 @@ def generate_umath_c(ext, build_dir):
916916
join('src', 'umath', 'funcs.inc.src'),
917917
join('src', 'umath', 'simd.inc.src'),
918918
join('src', 'umath', 'loops.h.src'),
919+
join('src', 'umath', 'loops_utils.h.src'),
919920
join('src', 'umath', 'loops.c.src'),
920921
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
922+
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
921923
join('src', 'umath', 'matmul.h.src'),
922924
join('src', 'umath', 'matmul.c.src'),
923925
join('src', 'umath', 'clip.h.src'),

numpy/core/src/umath/fast_loop_macros.h

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,5 +237,117 @@ abs_ptrdiff(char *a, char *b)
237237
TYPE io1 = *(TYPE *)iop1; \
238238
BINARY_REDUCE_LOOP_INNER
239239

240+
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
241+
((steps[0] == esize) && \
242+
(steps[1] == esize) && \
243+
(steps[2] == esize) && \
244+
(abs_ptrdiff(args[2], args[0]) >= vsize) && \
245+
(abs_ptrdiff(args[2], args[1]) >= vsize))
246+
247+
/*
248+
* stride is equal to element size and input and destination are equal or
249+
* don't overlap within one register. The check of the steps against
250+
* esize also quarantees that steps are >= 0.
251+
*/
252+
#define IS_BLOCKABLE_UNARY(esize, vsize) \
253+
(steps[0] == (esize) && steps[0] == steps[1] && \
254+
(npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
255+
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
256+
((abs_ptrdiff(args[1], args[0]) == 0))))
257+
258+
/*
259+
* Avoid using SIMD for very large step sizes for several reasons:
260+
* 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
261+
* in which case we need two i64gather instructions and an additional vinsertf32x8
262+
* instruction to load a single zmm register (since one i64gather instruction
263+
* loads into a ymm register). This is not ideal for performance.
264+
* 2) Gather and scatter instructions can be slow when the loads/stores
265+
* cross page boundaries.
266+
*
267+
* We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
268+
* element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
269+
* ensures this. The condition also requires that the input and output arrays
270+
* should have no overlap in memory.
271+
*/
272+
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
273+
((labs(steps[0]) < MAX_STEP_SIZE) && \
274+
(labs(steps[1]) < MAX_STEP_SIZE) && \
275+
(labs(steps[2]) < MAX_STEP_SIZE) && \
276+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
277+
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
278+
279+
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
280+
((labs(steps[0]) < MAX_STEP_SIZE) && \
281+
(labs(steps[1]) < MAX_STEP_SIZE) && \
282+
(labs(steps[2]) < MAX_STEP_SIZE) && \
283+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
284+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
285+
286+
/*
287+
* 1) Output should be contiguous, can handle strided input data
288+
* 2) Input step should be smaller than MAX_STEP_SIZE for performance
289+
* 3) Input and output arrays should have no overlap in memory
290+
*/
291+
#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
292+
((steps[0] & (esizein-1)) == 0 && \
293+
steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
294+
(nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
295+
296+
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
297+
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
298+
npy_is_aligned(args[1], (esize)) && \
299+
npy_is_aligned(args[0], (esize)))
300+
301+
#define IS_BLOCKABLE_BINARY(esize, vsize) \
302+
(steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
303+
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
304+
npy_is_aligned(args[0], (esize)) && \
305+
(abs_ptrdiff(args[2], args[0]) >= (vsize) || \
306+
abs_ptrdiff(args[2], args[0]) == 0) && \
307+
(abs_ptrdiff(args[2], args[1]) >= (vsize) || \
308+
abs_ptrdiff(args[2], args[1]) >= 0))
309+
310+
#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
311+
(steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
312+
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
313+
((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
314+
(abs_ptrdiff(args[2], args[1]) == 0)) && \
315+
abs_ptrdiff(args[2], args[0]) >= (esize))
316+
317+
#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
318+
(steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
319+
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
320+
((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
321+
(abs_ptrdiff(args[2], args[0]) == 0)) && \
322+
abs_ptrdiff(args[2], args[1]) >= (esize))
323+
324+
#undef abs_ptrdiff
325+
326+
#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
327+
(steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
328+
npy_is_aligned(args[1], (esize)) && \
329+
npy_is_aligned(args[0], (esize)))
330+
331+
#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
332+
(steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
333+
npy_is_aligned(args[1], (esize)))
334+
335+
#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
336+
(steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
337+
npy_is_aligned(args[0], (esize)))
338+
339+
/* align var to alignment */
340+
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
341+
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
342+
alignment, n);\
343+
for(i = 0; i < peel; i++)
344+
345+
#define LOOP_BLOCKED(type, vsize)\
346+
for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
347+
i += (vsize / sizeof(type)))
348+
349+
#define LOOP_BLOCKED_END\
350+
for (; i < n; i++)
351+
240352

241353
#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */

0 commit comments

Comments
 (0)
0