diff --git a/doc/release/1.12.0-notes.rst b/doc/release/1.12.0-notes.rst index f84511fc1925..7873ee24ad87 100644 --- a/doc/release/1.12.0-notes.rst +++ b/doc/release/1.12.0-notes.rst @@ -248,6 +248,10 @@ bugs where such dtypes were not handled properly. In particular, changed ``ndarray.__new__`` to not implicitly convert ``dtype('S0')`` to ``dtype('S1')`` (and likewise for unicode) when creating new arrays. +Integer ufuncs vectorized with AVX2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If the CPU supports it at runtime, the basic integer ufuncs now use AVX2 +instructions. This feature is currently only available when compiled with GCC. Changes ======= diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index d44cabe7251b..21dab54c58db 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -44,8 +44,11 @@ class TypeDescription(object): astype : dict or None, optional If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y instead of PyUFunc_x_x/PyUFunc_xx_x. 
+ simd: list + Available SIMD ufunc loops, dispatched at runtime in specified order + Currently only supported for simple types (see make_arrays) """ - def __init__(self, type, f=None, in_=None, out=None, astype=None): + def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None): self.type = type self.func_data = f if astype is None: @@ -57,6 +60,7 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None): if out is not None: out = out.replace('P', type) self.out = out + self.simd = simd def finish_signature(self, nin, nout): if self.in_ is None: @@ -76,7 +80,7 @@ def build_func_data(types, f): func_data.append(d) return func_data -def TD(types, f=None, astype=None, in_=None, out=None): +def TD(types, f=None, astype=None, in_=None, out=None, simd=None): if f is not None: if isinstance(f, str): func_data = build_func_data(types, f) @@ -95,7 +99,12 @@ def TD(types, f=None, astype=None, in_=None, out=None): out = (None,) * len(types) tds = [] for t, fd, i, o in zip(types, func_data, in_, out): - tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype)) + # [(simd-name, list of types)] + if simd is not None: + simdt = [k for k, v in simd if t in v] + else: + simdt = [] + tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt)) return tds class Ufunc(object): @@ -250,7 +259,7 @@ def english_upper(s): Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.add'), 'PyUFunc_AdditionTypeResolver', - TD(notimes_or_obj), + TD(notimes_or_obj, simd=[('avx2', ints)]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'mM', 'M'), @@ -261,7 +270,7 @@ def english_upper(s): Ufunc(2, 1, None, # Zero is only a unit to the right, not the left docstrings.get('numpy.core.umath.subtract'), 'PyUFunc_SubtractionTypeResolver', - TD(notimes_or_obj), + TD(notimes_or_obj, simd=[('avx2', ints)]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), 
TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'MM', 'm'), @@ -272,7 +281,7 @@ def english_upper(s): Ufunc(2, 1, One, docstrings.get('numpy.core.umath.multiply'), 'PyUFunc_MultiplicationTypeResolver', - TD(notimes_or_obj), + TD(notimes_or_obj, simd=[('avx2', ints)]), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'qm', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), @@ -319,7 +328,7 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.conjugate'), None, - TD(ints+flts+cmplx), + TD(ints+flts+cmplx, simd=[('avx2', ints)]), TD(P, f='conjugate'), ), 'fmod': @@ -334,14 +343,14 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.square'), None, - TD(ints+inexact), + TD(ints+inexact, simd=[('avx2', ints)]), TD(O, f='Py_square'), ), 'reciprocal': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.reciprocal'), None, - TD(ints+inexact), + TD(ints+inexact, simd=[('avx2', ints)]), TD(O, f='Py_reciprocal'), ), # This is no longer used as numpy.ones_like, however it is @@ -379,7 +388,7 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.negative'), 'PyUFunc_NegativeTypeResolver', - TD(bints+flts+timedeltaonly), + TD(bints+flts+timedeltaonly, simd=[('avx2', ints)]), TD(cmplx, f='neg'), TD(O, f='PyNumber_Negative'), ), @@ -393,57 +402,57 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.greater'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?'), + TD(all, out='?', simd=[('avx2', ints)]), ), 'greater_equal': Ufunc(2, 1, None, docstrings.get('numpy.core.umath.greater_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?'), + TD(all, out='?', simd=[('avx2', ints)]), ), 'less': Ufunc(2, 1, None, docstrings.get('numpy.core.umath.less'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?'), + TD(all, out='?', simd=[('avx2', ints)]), ), 'less_equal': Ufunc(2, 1, 
None, docstrings.get('numpy.core.umath.less_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?'), + TD(all, out='?', simd=[('avx2', ints)]), ), 'equal': Ufunc(2, 1, None, docstrings.get('numpy.core.umath.equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?'), + TD(all, out='?', simd=[('avx2', ints)]), ), 'not_equal': Ufunc(2, 1, None, docstrings.get('numpy.core.umath.not_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?'), + TD(all, out='?', simd=[('avx2', ints)]), ), 'logical_and': Ufunc(2, 1, One, docstrings.get('numpy.core.umath.logical_and'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(nodatetime_or_obj, out='?'), + TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), TD(O, f='npy_ObjectLogicalAnd'), ), 'logical_not': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.logical_not'), None, - TD(nodatetime_or_obj, out='?'), + TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), TD(O, f='npy_ObjectLogicalNot'), ), 'logical_or': Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.logical_or'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(nodatetime_or_obj, out='?'), + TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), TD(O, f='npy_ObjectLogicalOr'), ), 'logical_xor': @@ -497,42 +506,42 @@ def english_upper(s): Ufunc(2, 1, AllOnes, docstrings.get('numpy.core.umath.bitwise_and'), None, - TD(bints), + TD(bints, simd=[('avx2', ints)]), TD(O, f='PyNumber_And'), ), 'bitwise_or': Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.bitwise_or'), None, - TD(bints), + TD(bints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Or'), ), 'bitwise_xor': Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.bitwise_xor'), None, - TD(bints), + TD(bints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Xor'), ), 'invert': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.invert'), None, - TD(bints), + TD(bints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Invert'), ), 'left_shift': Ufunc(2, 1, None, 
docstrings.get('numpy.core.umath.left_shift'), None, - TD(ints), + TD(ints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Lshift'), ), 'right_shift': Ufunc(2, 1, None, docstrings.get('numpy.core.umath.right_shift'), None, - TD(ints), + TD(ints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Rshift'), ), 'degrees': @@ -932,6 +941,15 @@ def make_arrays(funcdict): datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) funclist.append('%s_%s' % (tname, name)) + if t.simd is not None: + for vt in t.simd: + code2list.append("""\ +#ifdef HAVE_ATTRIBUTE_TARGET_{ISA} +if (NPY_CPU_SUPPORTS_{ISA}) {{ + {fname}_functions[{idx}] = {type}_{fname}_{isa}; +}} +#endif +""".format(ISA=vt.upper(), isa=vt, fname=name, type=tname, idx=k)) for x in t.in_ + t.out: siglist.append('NPY_%s' % (english_upper(chartoname[x]),)) diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index baf5549d970f..21b15869f0ab 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -28,6 +28,18 @@ #define NPY_GCC_OPT_3 #endif +/* compile target attributes */ +#ifdef HAVE_ATTRIBUTE_TARGET_AVX +#define NPY_GCC_TARGET_AVX __attribute__((target("avx"))) +#else +#define NPY_GCC_TARGET_AVX +#endif +#ifdef HAVE_ATTRIBUTE_TARGET_AVX2 +#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define NPY_GCC_TARGET_AVX2 +#endif + /* * mark an argument (starting from 1) that must not be NULL and is not checked * DO NOT USE IF FUNCTION CHECKS FOR NULL!! 
the compiler will remove the check @@ -79,6 +91,22 @@ #endif #endif +#ifdef HAVE___BUILTIN_CPU_SUPPORTS + #ifdef HAVE_ATTRIBUTE_TARGET_AVX2 + #define NPY_CPU_SUPPORTS_AVX2 __builtin_cpu_supports("avx2") + #else + #define NPY_CPU_SUPPORTS_AVX2 0 + #endif + #ifdef HAVE_ATTRIBUTE_TARGET_AVX + #define NPY_CPU_SUPPORTS_AVX __builtin_cpu_supports("avx") + #else + #define NPY_CPU_SUPPORTS_AVX 0 + #endif +#else + #define NPY_CPU_SUPPORTS_AVX 0 + #define NPY_CPU_SUPPORTS_AVX2 0 +#endif + #if defined(_MSC_VER) #define NPY_INLINE __inline #elif defined(__GNUC__) diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index ba7521e3043b..5e1396e6b701 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -125,6 +125,7 @@ def check_api_version(apiversion, codegen_dir): ("__builtin_bswap64", '5u'), ("__builtin_expect", '5, 0'), ("__builtin_mul_overflow", '5, 5, (int*)5'), + ("__builtin_cpu_supports", '"sse"'), ("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE ("_mm_prefetch", '(float*)0, _MM_HINT_NTA', "xmmintrin.h"), # SSE @@ -141,6 +142,10 @@ def check_api_version(apiversion, codegen_dir): 'attribute_optimize_opt_3'), ('__attribute__((nonnull (1)))', 'attribute_nonnull'), + ('__attribute__((target ("avx")))', + 'attribute_target_avx'), + ('__attribute__((target ("avx2")))', + 'attribute_target_avx2'), ] # variable attributes tested via "int %s a" % attribute diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 157b30e70964..60b5464f1e20 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -786,51 +786,71 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +/**begin repeat1 + * #isa = , _avx2# + * #ISA = , AVX2# + * #CHK = 1, HAVE_ATTRIBUTE_TARGET_AVX2# + * #ATTR = , NPY_GCC_TARGET_AVX2# + */ + +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_square@isa@(char **args, 
npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { UNARY_LOOP_FAST(@type@, @type@, *out = in * in); } +#endif -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_reciprocal@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) { UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); } +#endif -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_conjugate@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_FAST(@type@, @type@, *out = in); } +#endif -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_negative@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_FAST(@type@, @type@, *out = -in); } +#endif -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_logical_not@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); } +#endif -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_invert(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_invert@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_FAST(@type@, @type@, *out = ~in); } +#endif -/**begin repeat1 +/**begin repeat2 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, * left_shift, right_shift# * #OP = +, -,*, &, |, ^, <<, >># */ -NPY_NO_EXPORT NPY_GCC_OPT_3 void 
-@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(@type@) { @@ -842,17 +862,19 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); } } +#endif -/**end repeat1**/ +/**end repeat2**/ -/**begin repeat1 +/**begin repeat2 * #kind = equal, not_equal, greater, greater_equal, less, less_equal, * logical_and, logical_or# * #OP = ==, !=, >, >=, <, <=, &&, ||# */ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { /* * gcc vectorization of this is not good (PR60575) but manual integer @@ -860,11 +882,13 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void */ BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); } +#endif -/**end repeat1**/ +/**end repeat2**/ -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +#if @CHK@ +NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void +@TYPE@_logical_xor@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const int t1 = !!*(@type@ *)ip1; @@ -872,6 +896,9 @@ NPY_NO_EXPORT void *((npy_bool *)op1) = (t1 != t2); } } +#endif + +/**end repeat1**/ /**begin repeat1 * #kind = maximum, minimum# diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index a6e775a3ad15..67d8d74c3fa4 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -61,46 +61,52 @@ BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN NPY_NO_EXPORT void @S@@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, 
void *NPY_UNUSED(data)); +/**begin repeat2 + * #isa = , _avx2# + */ + NPY_NO_EXPORT void -@S@@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); +@S@@TYPE@_square@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); NPY_NO_EXPORT void -@S@@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); +@S@@TYPE@_reciprocal@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); NPY_NO_EXPORT void -@S@@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_conjugate@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_negative@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_logical_not@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_invert(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_invert@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**begin repeat2 +/**begin repeat3 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, * left_shift, right_shift# * #OP = +, -,*, &, |, ^, <<, >># */ NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ +/**end repeat3**/ -/**begin repeat2 +/**begin repeat3 * #kind = equal, not_equal, greater, greater_equal, less, less_equal, * logical_and, logical_or# * #OP = ==, !=, >, >=, <, <=, &&, ||# */ 
NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ +@S@@TYPE@_@kind@@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ NPY_NO_EXPORT void -@S@@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_logical_xor@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ /**begin repeat2 * #kind = maximum, minimum#