numpy · juliantaylor · Sep 25, 2016 · Aug 27, 2016 · Aug 27, 2016 · Aug 27, 2016
diff --git a/doc/release/1.12.0-notes.rst b/doc/release/1.12.0-notes.rst
@@ -248,6 +248,10 @@ bugs where such dtypes were not handled properly.  In particular, changed
 ``ndarray.__new__`` to not implicitly convert ``dtype('S0')`` to
 ``dtype('S1')`` (and likewise for unicode) when creating new arrays.
 
+Integer ufuncs vectorized with AVX2
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If the CPU supports it at runtime, the basic integer ufuncs now use AVX2
+instructions. This feature is currently only available when compiled with GCC.
 
 Changes
 =======

diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -44,8 +44,11 @@ class TypeDescription(object):
     astype : dict or None, optional
         If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
         instead of PyUFunc_x_x/PyUFunc_xx_x.
+    simd : list or None, optional
+        Available SIMD ufunc loops, dispatched at runtime in the specified
+        order. Currently only supported for simple types (see make_arrays).
     """
-    def __init__(self, type, f=None, in_=None, out=None, astype=None):
+    def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
         self.type = type
         self.func_data = f
         if astype is None:
@@ -57,6 +60,7 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None):
         if out is not None:
             out = out.replace('P', type)
         self.out = out
+        self.simd = simd
 
     def finish_signature(self, nin, nout):
         if self.in_ is None:
@@ -76,7 +80,7 @@ def build_func_data(types, f):
         func_data.append(d)
     return func_data
 
-def TD(types, f=None, astype=None, in_=None, out=None):
+def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
     if f is not None:
         if isinstance(f, str):
             func_data = build_func_data(types, f)
@@ -95,7 +99,12 @@ def TD(types, f=None, astype=None, in_=None, out=None):
         out = (None,) * len(types)
     tds = []
     for t, fd, i, o in zip(types, func_data, in_, out):
-        tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype))
+        # simd is [(simd-name, list of types)]; keep the names that cover t
+        if simd is not None:
+            simdt = [k for k, v in simd if t in v]
+        else:
+            simdt = []
+        tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt))
     return tds
 
 class Ufunc(object):
@@ -250,7 +259,7 @@ def english_upper(s):
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.add'),
           'PyUFunc_AdditionTypeResolver',
-          TD(notimes_or_obj),
+          TD(notimes_or_obj, simd=[('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -261,7 +270,7 @@ def english_upper(s):
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(notimes_or_obj),
+          TD(notimes_or_obj, simd=[('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -272,7 +281,7 @@ def english_upper(s):
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.multiply'),
           'PyUFunc_MultiplicationTypeResolver',
-          TD(notimes_or_obj),
+          TD(notimes_or_obj, simd=[('avx2', ints)]),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'qm', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -319,7 +328,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.conjugate'),
           None,
-          TD(ints+flts+cmplx),
+          TD(ints+flts+cmplx, simd=[('avx2', ints)]),
           TD(P, f='conjugate'),
           ),
 'fmod':
@@ -334,14 +343,14 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact),
+          TD(ints+inexact, simd=[('avx2', ints)]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact),
+          TD(ints+inexact, simd=[('avx2', ints)]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -379,7 +388,7 @@ def english_upper(s):
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.negative'),
           'PyUFunc_NegativeTypeResolver',
-          TD(bints+flts+timedeltaonly),
+          TD(bints+flts+timedeltaonly, simd=[('avx2', ints)]),
           TD(cmplx, f='neg'),
           TD(O, f='PyNumber_Negative'),
           ),
@@ -393,57 +402,57 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?'),
+          TD(all, out='?', simd=[('avx2', ints)]),
           ),
 'greater_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?'),
+          TD(all, out='?', simd=[('avx2', ints)]),
           ),
 'less':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?'),
+          TD(all, out='?', simd=[('avx2', ints)]),
           ),
 'less_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?'),
+          TD(all, out='?', simd=[('avx2', ints)]),
           ),
 'equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?'),
+          TD(all, out='?', simd=[('avx2', ints)]),
           ),
 'not_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.not_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?'),
+          TD(all, out='?', simd=[('avx2', ints)]),
           ),
 'logical_and':
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.logical_and'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?'),
+          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
           TD(O, f='npy_ObjectLogicalAnd'),
           ),
 'logical_not':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.logical_not'),
           None,
-          TD(nodatetime_or_obj, out='?'),
+          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
           TD(O, f='npy_ObjectLogicalNot'),
           ),
 'logical_or':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.logical_or'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?'),
+          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
           TD(O, f='npy_ObjectLogicalOr'),
           ),
 'logical_xor':
@@ -497,42 +506,42 @@ def english_upper(s):
     Ufunc(2, 1, AllOnes,
           docstrings.get('numpy.core.umath.bitwise_and'),
           None,
-          TD(bints),
+          TD(bints, simd=[('avx2', ints)]),
           TD(O, f='PyNumber_And'),
           ),
 'bitwise_or':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_or'),
           None,
-          TD(bints),
+          TD(bints, simd=[('avx2', ints)]),
           TD(O, f='PyNumber_Or'),
           ),
 'bitwise_xor':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_xor'),
           None,
-          TD(bints),
+          TD(bints, simd=[('avx2', ints)]),
           TD(O, f='PyNumber_Xor'),
           ),
 'invert':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.invert'),
           None,
-          TD(bints),
+          TD(bints, simd=[('avx2', ints)]),
           TD(O, f='PyNumber_Invert'),
           ),
 'left_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.left_shift'),
           None,
-          TD(ints),
+          TD(ints, simd=[('avx2', ints)]),
           TD(O, f='PyNumber_Lshift'),
           ),
 'right_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.right_shift'),
           None,
-          TD(ints),
+          TD(ints, simd=[('avx2', ints)]),
           TD(O, f='PyNumber_Rshift'),
           ),
 'degrees':
@@ -932,6 +941,15 @@ def make_arrays(funcdict):
                 datalist.append('(void *)NULL')
                 tname = english_upper(chartoname[t.type])
                 funclist.append('%s_%s' % (tname, name))
+                if t.simd is not None:
+                    for vt in t.simd:
+                        code2list.append("""\
+#ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
+if (NPY_CPU_SUPPORTS_{ISA}) {{
+    {fname}_functions[{idx}] = {type}_{fname}_{isa};
+}}
+#endif
+""".format(ISA=vt.upper(), isa=vt, fname=name, type=tname, idx=k))
 
             for x in t.in_ + t.out:
                 siglist.append('NPY_%s' % (english_upper(chartoname[x]),))

diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
@@ -28,6 +28,18 @@
 #define NPY_GCC_OPT_3
 #endif
 
+/* compile target attributes */
+#ifdef HAVE_ATTRIBUTE_TARGET_AVX
+#define NPY_GCC_TARGET_AVX __attribute__((target("avx")))
+#else
+#define NPY_GCC_TARGET_AVX
+#endif
+#ifdef HAVE_ATTRIBUTE_TARGET_AVX2
+#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
+#else
+#define NPY_GCC_TARGET_AVX2
+#endif
+
 /*
  * mark an argument (starting from 1) that must not be NULL and is not checked
  * DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
@@ -79,6 +91,22 @@
 #endif
 #endif
 
+#ifdef HAVE___BUILTIN_CPU_SUPPORTS
+  #ifdef HAVE_ATTRIBUTE_TARGET_AVX2
+    #define NPY_CPU_SUPPORTS_AVX2 __builtin_cpu_supports("avx2")
+  #else
+    #define NPY_CPU_SUPPORTS_AVX2 0
+  #endif
+  #ifdef HAVE_ATTRIBUTE_TARGET_AVX
+    #define NPY_CPU_SUPPORTS_AVX __builtin_cpu_supports("avx")
+  #else
+    #define NPY_CPU_SUPPORTS_AVX 0
+  #endif
+#else
+  #define NPY_CPU_SUPPORTS_AVX 0
+  #define NPY_CPU_SUPPORTS_AVX2 0
+#endif
+
 #if defined(_MSC_VER)
         #define NPY_INLINE __inline
 #elif defined(__GNUC__)

diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
@@ -125,6 +125,7 @@ def check_api_version(apiversion, codegen_dir):
                        ("__builtin_bswap64", '5u'),
                        ("__builtin_expect", '5, 0'),
                        ("__builtin_mul_overflow", '5, 5, (int*)5'),
+                       ("__builtin_cpu_supports", '"sse"'),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
                        ("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
                         "xmmintrin.h"),  # SSE
@@ -141,6 +142,10 @@ def check_api_version(apiversion, codegen_dir):
                                  'attribute_optimize_opt_3'),
                                 ('__attribute__((nonnull (1)))',
                                  'attribute_nonnull'),
+                                ('__attribute__((target ("avx")))',
+                                 'attribute_target_avx'),
+                                ('__attribute__((target ("avx2")))',
+                                 'attribute_target_avx2'),
                                 ]
 
 # variable attributes tested via "int %s a" % attribute