ENH, SIMD: Ditching the old CPU dispatcher (Arithmetic) by seiko2plus · Pull Request #17985 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH, SIMD: Ditching the old CPU dispatcher(Arithmetic) #17985

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 19, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ numpy/core/src/umath/simd.inc
numpy/core/src/umath/struct_ufunc_test.c
numpy/core/src/umath/test_rational.c
numpy/core/src/umath/umath_tests.c
numpy/core/src/umath/loops_utils.h
numpy/distutils/__config__.py
numpy/linalg/umath_linalg.c
doc/source/**/generated/
Expand Down Expand Up @@ -218,3 +219,4 @@ numpy/core/src/_simd/_simd_data.inc
numpy/core/src/_simd/_simd_inc.h
# umath module
numpy/core/src/umath/loops_unary_fp.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better in general to use full names like "arithmetic" rather than shortened names. Seven character names are long gone :) This can be changed later though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will consider it a policy to follow in my upcoming patches.

84 changes: 51 additions & 33 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,20 @@ class TypeDescription:
astype : dict or None, optional
If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
instead of PyUFunc_x_x/PyUFunc_xx_x.
cfunc_alias : str or None, optional
appended to inner loop C function name, e.g. FLOAT_{cfunc_alias} (see make_arrays)
NOTE: it doesn't support 'astype'
simd: list
Available SIMD ufunc loops, dispatched at runtime in specified order
Currently only supported for simple types (see make_arrays)
dispatch: list
Available SIMD ufunc loops, dispatched at runtime in specified order
Currently only supported for simple types (see make_arrays)
dispatch: str or None, optional
Dispatch-able source name without its extension '.dispatch.c' that
contains the definition of ufunc, dispatched at runtime depending on the
specified targets of the dispatch-able source.
NOTE: it doesn't support 'astype'
"""
def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None,
simd=None, dispatch=None):
self.type = type
self.func_data = f
if astype is None:
Expand All @@ -64,6 +70,7 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dis
if out is not None:
out = out.replace('P', type)
self.out = out
self.cfunc_alias = cfunc_alias
self.simd = simd
self.dispatch = dispatch

Expand All @@ -90,7 +97,8 @@ def build_func_data(types, f):
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
return func_data

def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
simd=None, dispatch=None):
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
Expand Down Expand Up @@ -119,13 +127,15 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None)
simdt = [k for k, v in simd if t in v]
else:
simdt = []

# [(dispatch file name without extension '.dispatch.c*', list of types)]
if dispatch:
dispt = [k for k, v in dispatch if t in v]
dispt = ([k for k, v in dispatch if t in v]+[None])[0]
else:
dispt = []
dispt = None
tds.append(TypeDescription(
t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias,
simd=simdt, dispatch=dispt
))
return tds

Expand Down Expand Up @@ -280,7 +290,7 @@ def english_upper(s):
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.add'),
'PyUFunc_AdditionTypeResolver',
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be clear: this adds fd (cmplxvec is FD). Also: I don't see where 'gG' (long double and long double complex) is taken into account. Am I missing something?

Copy link
Member Author
@seiko2plus seiko2plus Dec 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it dispatches single/double precision for both real and complex; the definitions of these loops moved from loops.c.src and simd.inc to the new dispatch-able source loops_arithm_fp.dispatch.c.src.
NOTE: the divide operation for complex single and double precision is still in loops.c.src; we will need to vectorize this operation via NPYV and move it into this file later.

I don't see where 'gG' (long double and long double complex) is taken into account. Am I missing something?

loops_arithm_fp.dispatch.c.src only contains inner-loop functions with SIMD code; the rest of the operations remain in loops.c.src.
I don't think we will support SIMD kernels for 80-bit and 128-bit precision — there's no hardware support for it — but maybe we can emulate light operations such as addition and subtraction.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, my bad, it is in notimes_or_obj

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

128-bit precision

AArch64 and Power9 have support for quad precision floats. The astronomy community would like to have them and I think we will see more support coming in modern architectures.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AArch64 and Power9 have scalar support but not SIMD. Power9 users have to build NumPy with CFLAGS -mcpu=power9 or --cpu-baseline=vsx3 in order to get __float128 working; we can dispatch the float128 operation at runtime for them if necessary.

[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
Expand All @@ -291,7 +301,7 @@ def english_upper(s):
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
docstrings.get('numpy.core.umath.subtract'),
'PyUFunc_SubtractionTypeResolver',
TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
Expand All @@ -302,7 +312,7 @@ def english_upper(s):
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.multiply'),
'PyUFunc_MultiplicationTypeResolver',
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
Expand All @@ -326,10 +336,10 @@ def english_upper(s):
Ufunc(2, 1, None, # One is only a unit to the right, not the left
docstrings.get('numpy.core.umath.true_divide'),
'PyUFunc_TrueDivisionTypeResolver',
TD(flts+cmplx),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
TypeDescription('m', FullTypeDescr, 'mm', 'd'),
TD(flts+cmplx, cfunc_alias='divide', dispatch=[('loops_arithm_fp', 'fd')]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm', cfunc_alias='divide'),
TypeDescription('m', FullTypeDescr, 'md', 'm', cfunc_alias='divide'),
TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
],
TD(O, f='PyNumber_TrueDivide'),
),
Expand Down Expand Up @@ -1000,6 +1010,7 @@ def make_arrays(funcdict):
# later
code1list = []
code2list = []
dispdict = {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe dispatch_dict?

names = sorted(funcdict.keys())
for name in names:
uf = funcdict[name]
Expand All @@ -1010,44 +1021,33 @@ def make_arrays(funcdict):
sub = 0

for t in uf.type_descriptions:
cfunc_alias = t.cfunc_alias if t.cfunc_alias else name
cfunc_fname = None
if t.func_data is FullTypeDescr:
tname = english_upper(chartoname[t.type])
datalist.append('(void *)NULL')
funclist.append(
'%s_%s_%s_%s' % (tname, t.in_, t.out, name))
cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
elif isinstance(t.func_data, FuncNameSuffix):
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
funclist.append(
'%s_%s_%s' % (tname, name, t.func_data.suffix))
cfunc_fname = f"{tname}_{cfunc_alias}_{t.func_data.suffix}"
elif t.func_data is None:
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
funclist.append('%s_%s' % (tname, name))
cfunc_fname = f"{tname}_{cfunc_alias}"
if t.simd is not None:
for vt in t.simd:
code2list.append(textwrap.dedent("""\
#ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
if (NPY_CPU_HAVE({ISA})) {{
{fname}_functions[{idx}] = {type}_{fname}_{isa};
{fname}_functions[{idx}] = {cname}_{isa};
}}
#endif
""").format(
ISA=vt.upper(), isa=vt,
fname=name, type=tname, idx=k
))
if t.dispatch is not None:
for dname in t.dispatch:
code2list.append(textwrap.dedent("""\
#ifndef NPY_DISABLE_OPTIMIZATION
#include "{dname}.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name});
""").format(
dname=dname, name=name, tname=tname, k=k
fname=name, cname=cfunc_fname, idx=k
))
else:
funclist.append('NULL')
try:
thedict = arity_lookup[uf.nin, uf.nout]
except KeyError as e:
Expand Down Expand Up @@ -1077,6 +1077,13 @@ def make_arrays(funcdict):
#datalist.append('(void *)%s' % t.func_data)
sub += 1

if cfunc_fname:
funclist.append(cfunc_fname)
if t.dispatch:
dispdict.setdefault(t.dispatch, []).append((name, k, cfunc_fname))
else:
funclist.append('NULL')

for x in t.in_ + t.out:
siglist.append('NPY_%s' % (english_upper(chartoname[x]),))

Expand All @@ -1091,6 +1098,17 @@ def make_arrays(funcdict):
% (name, datanames))
code1list.append("static char %s_signatures[] = {%s};"
% (name, signames))

for dname, funcs in dispdict.items():
code2list.append(textwrap.dedent(f"""
#ifndef NPY_DISABLE_OPTIMIZATION
#include "{dname}.dispatch.h"
#endif
"""))
for (ufunc_name, func_idx, cfunc_name) in funcs:
code2list.append(textwrap.dedent(f"""\
NPY_CPU_DISPATCH_CALL_XB({ufunc_name}_functions[{func_idx}] = {cfunc_name});
"""))
return "\n".join(code1list), "\n".join(code2list)

def make_ufuncs(funcdict):
Expand Down
2 changes: 2 additions & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,8 +916,10 @@ def generate_umath_c(ext, build_dir):
join('src', 'umath', 'funcs.inc.src'),
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops_utils.h.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
Expand Down
112 changes: 112 additions & 0 deletions numpy/core/src/umath/fast_loop_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,5 +237,117 @@ abs_ptrdiff(char *a, char *b)
TYPE io1 = *(TYPE *)iop1; \
BINARY_REDUCE_LOOP_INNER

/*
 * Binary op where all three strides equal the element size (contiguous data)
 * and the output pointer is at least one vector register away from both
 * inputs. NOTE(review): unlike IS_BLOCKABLE_BINARY below, no pointer
 * alignment is required and an exact in-place (distance 0) case is rejected.
 */
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
((steps[0] == esize) && \
(steps[1] == esize) && \
(steps[2] == esize) && \
(abs_ptrdiff(args[2], args[0]) >= vsize) && \
(abs_ptrdiff(args[2], args[1]) >= vsize))

/*
 * stride is equal to element size and input and destination are equal or
 * don't overlap within one register. The check of the steps against
 * esize also guarantees that steps are >= 0.
 */
#define IS_BLOCKABLE_UNARY(esize, vsize) \
(steps[0] == (esize) && steps[0] == steps[1] && \
(npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
((abs_ptrdiff(args[1], args[0]) == 0))))

/*
* Avoid using SIMD for very large step sizes for several reasons:
* 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
* in which case we need two i64gather instructions and an additional vinsertf32x8
* instruction to load a single zmm register (since one i64gather instruction
* loads into a ymm register). This is not ideal for performance.
* 2) Gather and scatter instructions can be slow when the loads/stores
* cross page boundaries.
*
* We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
* element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
* ensures this. The condition also requires that the input and output arrays
* should have no overlap in memory.
*/
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
((labs(steps[0]) < MAX_STEP_SIZE) && \
(labs(steps[1]) < MAX_STEP_SIZE) && \
(labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))

/*
 * Unary ufunc with two outputs: every step is bounded by MAX_STEP_SIZE
 * (see the rationale in the comment above) and the input args[0] overlaps
 * neither output buffer (args[1], args[2]) in memory.
 */
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
((labs(steps[0]) < MAX_STEP_SIZE) && \
(labs(steps[1]) < MAX_STEP_SIZE) && \
(labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))

/*
* 1) Output should be contiguous, can handle strided input data
* 2) Input step should be smaller than MAX_STEP_SIZE for performance
* 3) Input and output arrays should have no overlap in memory
*/
#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
((steps[0] & (esizein-1)) == 0 && \
steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))

/*
 * Reduction: the input stride (steps[1]) equals the element size, both
 * pointers are element-aligned, and the accumulator args[0] sits at least
 * one vector register away from the input args[1].
 * Fix: line 2 was corrupted to "np BF01 y_is_aligned" (transfer-encoding
 * residue); restored the npy_is_aligned call, mirroring the args[0] check.
 */
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)))

/*
 * Fully contiguous, element-aligned binary op where the output either lies
 * at least one vector register away from each input or is exactly in-place
 * with it (distance 0).
 * Fix: the final clause read `abs_ptrdiff(args[2], args[1]) >= 0`, which is
 * vacuously true (abs_ptrdiff yields a non-negative value after the #define
 * at the top of this section), silently permitting partial overlap between
 * the output and the second input. It now uses `== 0` to match the
 * in-place check applied to args[0] two lines above.
 */
#define IS_BLOCKABLE_BINARY(esize, vsize) \
(steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)) && \
(abs_ptrdiff(args[2], args[0]) >= (vsize) || \
abs_ptrdiff(args[2], args[0]) == 0) && \
(abs_ptrdiff(args[2], args[1]) >= (vsize) || \
abs_ptrdiff(args[2], args[1]) == 0))

/*
 * Binary op whose first operand is a scalar (stride 0): the array operand
 * and output are contiguous and aligned, the output is either one vector
 * register away from the array input or exactly in-place, and the scalar
 * does not sit inside the first output element.
 */
#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
(steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
(abs_ptrdiff(args[2], args[1]) == 0)) && \
abs_ptrdiff(args[2], args[0]) >= (esize))

/* Mirror of IS_BLOCKABLE_BINARY_SCALAR1 with the second operand scalar. */
#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
(steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
(abs_ptrdiff(args[2], args[0]) == 0)) && \
abs_ptrdiff(args[2], args[1]) >= (esize))

#undef abs_ptrdiff

/*
 * Binary op producing a 1-byte (bool) output with stride 1; inputs are
 * contiguous and element-aligned. No overlap check — presumably safe
 * because the output element size differs; TODO(review) confirm callers.
 */
#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
(steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)))

/* As above with the first operand a scalar (stride 0). */
#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
(steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
npy_is_aligned(args[1], (esize)))

/* As above with the second operand a scalar (stride 0). */
#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
(steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
npy_is_aligned(args[0], (esize)))

/*
 * Scalar peel loop: declares the loop index `i` (shared by the two macros
 * below) and iterates element-by-element until `var` reaches `alignment`.
 * Expects the trip count `n` to be in scope.
 */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
alignment, n);\
for(i = 0; i < peel; i++)

/* Main vectorized loop: advances `i` by one vector's worth of elements. */
#define LOOP_BLOCKED(type, vsize)\
for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
i += (vsize / sizeof(type)))

/* Scalar tail loop handling the remaining `n - i` elements. */
#define LOOP_BLOCKED_END\
for (; i < n; i++)


#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */
Loading
0