ENH: Add ability to runtime select ufunc loops, add AVX2 integer loops by juliantaylor · Pull Request #7980 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Add ability to runtime select ufunc loops, add AVX2 integer loops #7980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/release/1.12.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ bugs where such dtypes were not handled properly. In particular, changed
``ndarray.__new__`` to not implicitly convert ``dtype('S0')`` to
``dtype('S1')`` (and likewise for unicode) when creating new arrays.

Integer ufuncs vectorized with AVX2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the CPU supports it at runtime, the basic integer ufuncs now use AVX2
instructions. This feature is currently only available when compiled with GCC.

Changes
=======
Expand Down
68 changes: 43 additions & 25 deletions numpy/core/code_generators/generate_umath.py
TD(notimes_or_obj, simd=[('avx2', ints)]),
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,11 @@ class TypeDescription(object):
astype : dict or None, optional
If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
instead of PyUFunc_x_x/PyUFunc_xx_x.
simd: list
Available SIMD ufunc loops, dispatched at runtime in specified order
Currently only supported for simple types (see make_arrays)
"""
def __init__(self, type, f=None, in_=None, out=None, astype=None):
def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
self.type = type
self.func_data = f
if astype is None:
Expand All @@ -57,6 +60,7 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None):
if out is not None:
out = out.replace('P', type)
self.out = out
self.simd = simd

def finish_signature(self, nin, nout):
if self.in_ is None:
Expand All @@ -76,7 +80,7 @@ def build_func_data(types, f):
func_data.append(d)
return func_data

def TD(types, f=None, astype=None, in_=None, out=None):
def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
Expand All @@ -95,7 +99,12 @@ def TD(types, f=None, astype=None, in_=None, out=None):
out = (None,) * len(types)
tds = []
for t, fd, i, o in zip(types, func_data, in_, out):
tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype))
# [(simd-name, list of types)]
if simd is not None:
simdt = [k for k, v in simd if t in v]
else:
simdt = []
tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt))
return tds

class Ufunc(object):
Expand Down Expand Up @@ -250,7 +259,7 @@ def english_upper(s):
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.add'),
'PyUFunc_AdditionTypeResolver',
TD(notimes_or_obj),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
Expand All @@ -261,7 +270,7 @@ def english_upper(s):
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
docstrings.get('numpy.core.umath.subtract'),
'PyUFunc_SubtractionTypeResolver',
TD(notimes_or_obj),
TD(notimes_or_obj, simd=[('avx2', ints)]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
Expand All @@ -272,7 +281,7 @@ def english_upper(s):
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.multiply'),
'PyUFunc_MultiplicationTypeResolver',
TD(notimes_or_obj),
TD(notimes_or_obj, simd=[('avx2', ints)]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
Expand Down Expand Up @@ -319,7 +328,7 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.conjugate'),
None,
TD(ints+flts+cmplx),
TD(ints+flts+cmplx, simd=[('avx2', ints)]),
TD(P, f='conjugate'),
),
'fmod':
Expand All @@ -334,14 +343,14 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
TD(ints+inexact),
TD(ints+inexact, simd=[('avx2', ints)]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
TD(ints+inexact),
TD(ints+inexact, simd=[('avx2', ints)]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
Expand Down Expand Up @@ -379,7 +388,7 @@ def english_upper(s):
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.negative'),
'PyUFunc_NegativeTypeResolver',
TD(bints+flts+timedeltaonly),
TD(bints+flts+timedeltaonly, simd=[('avx2', ints)]),
TD(cmplx, f='neg'),
TD(O, f='PyNumber_Negative'),
),
Expand All @@ -393,57 +402,57 @@ def english_upper(s):
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?'),
TD(all, out='?', simd=[('avx2', ints)]),
),
'greater_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?'),
TD(all, out='?', simd=[('avx2', ints)]),
),
'less':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?'),
TD(all, out='?', simd=[('avx2', ints)]),
),
'less_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?'),
TD(all, out='?', simd=[('avx2', ints)]),
),
'equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?'),
TD(all, out='?', simd=[('avx2', ints)]),
),
'not_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.not_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?'),
TD(all, out='?', simd=[('avx2', ints)]),
),
'logical_and':
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.logical_and'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(nodatetime_or_obj, out='?'),
TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
TD(O, f='npy_ObjectLogicalAnd'),
),
'logical_not':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.logical_not'),
None,
TD(nodatetime_or_obj, out='?'),
TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
TD(O, f='npy_ObjectLogicalNot'),
),
'logical_or':
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.logical_or'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(nodatetime_or_obj, out='?'),
TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
TD(O, f='npy_ObjectLogicalOr'),
),
'logical_xor':
Expand Down Expand Up @@ -497,42 +506,42 @@ def english_upper(s):
Ufunc(2, 1, AllOnes,
docstrings.get('numpy.core.umath.bitwise_and'),
None,
TD(bints),
TD(bints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_And'),
),
'bitwise_or':
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.bitwise_or'),
None,
TD(bints),
TD(bints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Or'),
),
'bitwise_xor':
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.bitwise_xor'),
None,
TD(bints),
TD(bints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Xor'),
),
'invert':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.invert'),
None,
TD(bints),
TD(bints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Invert'),
),
'left_shift':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.left_shift'),
None,
TD(ints),
TD(ints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Lshift'),
),
'right_shift':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.right_shift'),
None,
TD(ints),
TD(ints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Rshift'),
),
'degrees':
Expand Down Expand Up @@ -932,6 +941,15 @@ def make_arrays(funcdict):
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
funclist.append('%s_%s' % (tname, name))
if t.simd is not None:
for vt in t.simd:
code2list.append("""\
#ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
if (NPY_CPU_SUPPORTS_{ISA}) {{
{fname}_functions[{idx}] = {type}_{fname}_{isa};
}}
#endif
""".format(ISA=vt.upper(), isa=vt, fname=name, type=tname, idx=k))

for x in t.in_ + t.out:
siglist.append('NPY_%s' % (english_upper(chartoname[x]),))
Expand Down
28 changes: 28 additions & 0 deletions numpy/core/include/numpy/npy_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@
#define NPY_GCC_OPT_3
#endif

/* compile target attributes */
#ifdef HAVE_ATTRIBUTE_TARGET_AVX
#define NPY_GCC_TARGET_AVX __attribute__((target("avx")))
#else
#define NPY_GCC_TARGET_AVX
#endif
#ifdef HAVE_ATTRIBUTE_TARGET_AVX2
#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
#else
#define NPY_GCC_TARGET_AVX2
#endif

/*
* mark an argument (starting from 1) that must not be NULL and is not checked
* DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
Expand Down Expand Up @@ -79,6 +91,22 @@
#endif
#endif

#ifdef HAVE___BUILTIN_CPU_SUPPORTS
#ifdef HAVE_ATTRIBUTE_TARGET_AVX2
#define NPY_CPU_SUPPORTS_AVX2 __builtin_cpu_supports("avx2")
#else
#define NPY_CPU_SUPPORTS_AVX2 0
#endif
#ifdef HAVE_ATTRIBUTE_TARGET_AVX
#define NPY_CPU_SUPPORTS_AVX __builtin_cpu_supports("avx")
#else
#define NPY_CPU_SUPPORTS_AVX 0
#endif
#else
#define NPY_CPU_SUPPORTS_AVX 0
#define NPY_CPU_SUPPORTS_AVX2 0
#endif

#if defined(_MSC_VER)
#define NPY_INLINE __inline
#elif defined(__GNUC__)
Expand Down
5 changes: 5 additions & 0 deletions numpy/core/setup_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def check_api_version(apiversion, codegen_dir):
("__builtin_bswap64", '5u'),
("__builtin_expect", '5, 0'),
("__builtin_mul_overflow", '5, 5, (int*)5'),
("__builtin_cpu_supports", '"sse"'),
("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
"xmmintrin.h"), # SSE
Expand All @@ -141,6 +142,10 @@ def check_api_version(apiversion, codegen_dir):
'attribute_optimize_opt_3'),
('__attribute__((nonnull (1)))',
'attribute_nonnull'),
('__attribute__((target ("avx")))',
'attribute_target_avx'),
('__attribute__((target ("avx2")))',
'attribute_target_avx2'),
]

# variable attributes tested via "int %s a" % attribute
Expand Down
Loading
0