ENH, SIMD: Ditching the old CPU dispatcher (Arithmetic) by seiko2plus · Pull Request #17985 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH, SIMD: Ditching the old CPU dispatcher(Arithmetic) #17985

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 19, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ numpy/core/src/umath/simd.inc
numpy/core/src/umath/struct_ufunc_test.c
numpy/core/src/umath/test_rational.c
numpy/core/src/umath/umath_tests.c
numpy/core/src/umath/loops_utils.h
numpy/distutils/__config__.py
numpy/linalg/umath_linalg.c
doc/source/**/generated/
Expand Down Expand Up @@ -218,3 +219,4 @@ numpy/core/src/_simd/_simd_data.inc
numpy/core/src/_simd/_simd_inc.h
# umath module
numpy/core/src/umath/loops_unary_fp.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better in general to use full names like "arithmetic" rather than shortened names. Seven character names are long gone :) This can be changed later though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will consider it a policy to follow in my upcoming patches.

84 changes: 51 additions & 33 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,20 @@ class TypeDescription:
astype : dict or None, optional
If astype['x'] is 'y', uses PyUFunc_x_x_As_y_y/PyUFunc_xx_x_As_yy_y
instead of PyUFunc_x_x/PyUFunc_xx_x.
cfunc_alias : str or None, optional
appended to inner loop C function name, e.g. FLOAT_{cfunc_alias} (see make_arrays)
NOTE: it doesn't support 'astype'
simd: list
Available SIMD ufunc loops, dispatched at runtime in specified order
Currently only supported for simple types (see make_arrays)
dispatch: list
Available SIMD ufunc loops, dispatched at runtime in specified order
Currently only supported for simple types (see make_arrays)
dispatch: str or None, optional
Dispatch-able source name without its extension '.dispatch.c' that
contains the definition of ufunc, dispatched at runtime depending on the
specified targets of the dispatch-able source.
NOTE: it doesn't support 'astype'
"""
def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None,
simd=None, dispatch=None):
self.type = type
self.func_data = f
if astype is None:
Expand All @@ -64,6 +70,7 @@ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dis
if out is not None:
out = out.replace('P', type)
self.out = out
self.cfunc_alias = cfunc_alias
self.simd = simd
self.dispatch = dispatch

Expand All @@ -90,7 +97,8 @@ def build_func_data(types, f):
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
return func_data

def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
simd=None, dispatch=None):
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
Expand Down Expand Up @@ -119,13 +127,15 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None)
simdt = [k for k, v in simd if t in v]
else:
simdt = []

# [(dispatch file name without extension '.dispatch.c*', list of types)]
if dispatch:
dispt = [k for k, v in dispatch if t in v]
dispt = ([k for k, v in dispatch if t in v]+[None])[0]
else:
dispt = []
dispt = None
tds.append(TypeDescription(
t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias,
simd=simdt, dispatch=dispt
))
return tds

Expand Down Expand Up @@ -280,7 +290,7 @@ def english_upper(s):
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.add'),
'PyUFunc_AdditionTypeResolver',
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be clear: this adds fd (cmplxvec is FD). Also: I don't see where 'gG' (long double and long double complex) is taken into account. Am I missing something?

Copy link
Member Author
@seiko2plus seiko2plus Dec 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it dispatches single/double precision for both real and complex; the definitions of these loops moved from loops.c.src and simd.inc to the new dispatch-able source loops_arithm_fp.dispatch.c.src.
NOTE: the divide operation for complex single and double precision is still in loops.c.src; we will need to vectorize this operation via NPYV and move it into this file later.

I don't see where 'gG' (long double and long double complex) is taken into account. Am I missing something?

loops_arithm_fp.dispatch.c.src only contains inner-loop functions with SIMD code; the rest of the operations remain in loops.c.src.
I don't think we will support SIMD kernels for 80-bit and 128-bit precision — there's no hardware support for it — but maybe we can emulate light operations such as addition and subtraction.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, my bad, it is in notimes_or_obj

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

128-bit precision

AArch64 and Power9 have support for quad precision floats. The astronomy community would like to have them and I think we will see more support coming in modern architectures.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AArch64 and Power9 have scalar support but not SIMD. Power9 users have to build NumPy with CFLAGS -mcpu=power9 or --cpu-baseline=vsx3 in order to get __float128 working; we can dispatch the float128 operation at runtime for them if necessary.

[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
Expand All @@ -291,7 +301,7 @@ def english_upper(s):
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
docstrings.get('numpy.core.umath.subtract'),
'PyUFunc_SubtractionTypeResolver',
TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
Expand All @@ -302,7 +312,7 @@ def english_upper(s):
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.multiply'),
'PyUFunc_MultiplicationTypeResolver',
TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
Expand All @@ -326,10 +336,10 @@ def english_upper(s):
Ufunc(2, 1, None, # One is only a unit to the right, not the left
docstrings.get('numpy.core.umath.true_divide'),
'PyUFunc_TrueDivisionTypeResolver',
TD(flts+cmplx),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
TypeDescription('m', FullTypeDescr, 'mm', 'd'),
TD(flts+cmplx, cfunc_alias='divide', dispatch=[('loops_arithm_fp', 'fd')]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm', cfunc_alias='divide'),
TypeDescription('m', FullTypeDescr, 'md', 'm', cfunc_alias='divide'),
TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
],
TD(O, f='PyNumber_TrueDivide'),
),
Expand Down Expand Up @@ -1000,6 +1010,7 @@ def make_arrays(funcdict):
# later
code1list = []
code2list = []
dispdict = {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe dispatch_dict?

names = sorted(funcdict.keys())
for name in names:
uf = funcdict[name]
Expand All @@ -1010,44 +1021,33 @@ def make_arrays(funcdict):
sub = 0

for t in uf.type_descriptions:
cfunc_alias = t.cfunc_alias if t.cfunc_alias else name
cfunc_fname = None
if t.func_data is FullTypeDescr:
tname = english_upper(chartoname[t.type])
datalist.append('(void *)NULL')
funclist.append(
'%s_%s_%s_%s' % (tname, t.in_, t.out, name))
cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
elif isinstance(t.func_data, FuncNameSuffix):
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
funclist.append(
'%s_%s_%s' % (tname, name, t.func_data.suffix))
cfunc_fname = f"{tname}_{cfunc_alias}_{t.func_data.suffix}"
elif t.func_data is None:
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
funclist.append('%s_%s' % (tname, name))
cfunc_fname = f"{tname}_{cfunc_alias}"
if t.simd is not None:
for vt in t.simd:
code2list.append(textwrap.dedent("""\
#ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
if (NPY_CPU_HAVE({ISA})) {{
{fname}_functions[{idx}] = {type}_{fname}_{isa};
{fname}_functions[{idx}] = {cname}_{isa};
}}
#endif
""").format(
ISA=vt.upper(), isa=vt,
fname=name, type=tname, idx=k
))
if t.dispatch is not None:
for dname in t.dispatch:
code2list.append(textwrap.dedent("""\
#ifndef NPY_DISABLE_OPTIMIZATION
#include "{dname}.dispatch.h"
#endif
NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name});
""").format(
dname=dname, name=name, tname=tname, k=k
fname=name, cname=cfunc_fname, idx=k
))
else:
funclist.append('NULL')
try:
thedict = arity_lookup[uf.nin, uf.nout]
except KeyError as e:
Expand Down Expand Up @@ -1077,6 +1077,13 @@ def make_arrays(funcdict):
#datalist.append('(void *)%s' % t.func_data)
sub += 1

if cfunc_fname:
funclist.append(cfunc_fname)
if t.dispatch:
dispdict.setdefault(t.dispatch, []).append((name, k, cfunc_fname))
else:
funclist.append('NULL')

for x in t.in_ + t.out:
siglist.append('NPY_%s' % (english_upper(chartoname[x]),))

Expand All @@ -1091,6 +1098,17 @@ def make_arrays(funcdict):
% (name, datanames))
code1list.append("static char %s_signatures[] = {%s};"
% (name, signames))

for dname, funcs in dispdict.items():
code2list.append(textwrap.dedent(f"""
#ifndef NPY_DISABLE_OPTIMIZATION
#include "{dname}.dispatch.h"
#endif
"""))
for (ufunc_name, func_idx, cfunc_name) in funcs:
code2list.append(textwrap.dedent(f"""\
NPY_CPU_DISPATCH_CALL_XB({ufunc_name}_functions[{func_idx}] = {cfunc_name});
"""))
return "\n".join(code1list), "\n".join(code2list)

def make_ufuncs(funcdict):
Expand Down
2 changes: 2 additions & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,8 +916,10 @@ def generate_umath_c(ext, build_dir):
join('src', 'umath', 'funcs.inc.src'),
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops_utils.h.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
Expand Down
112 changes: 112 additions & 0 deletions numpy/core/src/umath/fast_loop_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,5 +237,117 @@ abs_ptrdiff(char *a, char *b)
TYPE io1 = *(TYPE *)iop1; \
BINARY_REDUCE_LOOP_INNER

/*
 * Binary op where all three strides equal the element size (contiguous data)
 * and the output pointer is at least one vector register away from both
 * inputs. NOTE(review): unlike IS_BLOCKABLE_BINARY below, no pointer
 * alignment is required and an exact in-place (distance 0) case is rejected.
 */
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
((steps[0] == esize) && \
(steps[1] == esize) && \
(steps[2] == esize) && \
(abs_ptrdiff(args[2], args[0]) >= vsize) && \
(abs_ptrdiff(args[2], args[1]) >= vsize))

/*
 * stride is equal to element size and input and destination are equal or
 * don't overlap within one register. The check of the steps against
 * esize also guarantees that steps are >= 0.
 */
#define IS_BLOCKABLE_UNARY(esize, vsize) \
(steps[0] == (esize) && steps[0] == steps[1] && \
(npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
((abs_ptrdiff(args[1], args[0]) == 0))))

/*
* Avoid using SIMD for very large step sizes for several reasons:
* 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
* in which case we need two i64gather instructions and an additional vinsertf32x8
* instruction to load a single zmm register (since one i64gather instruction
* loads into a ymm register). This is not ideal for performance.
* 2) Gather and scatter instructions can be slow when the loads/stores
* cross page boundaries.
*
* We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
* element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
* ensures this. The condition also requires that the input and output arrays
* should have no overlap in memory.
*/
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
((labs(steps[0]) < MAX_STEP_SIZE) && \
(labs(steps[1]) < MAX_STEP_SIZE) && \
(labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))

/*
 * Unary ufunc with two outputs: every step is bounded by MAX_STEP_SIZE
 * (see the rationale in the comment above) and the input args[0] overlaps
 * neither output buffer (args[1], args[2]) in memory.
 */
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
((labs(steps[0]) < MAX_STEP_SIZE) && \
(labs(steps[1]) < MAX_STEP_SIZE) && \
(labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))

/*
* 1) Output should be contiguous, can handle strided input data
* 2) Input step should be smaller than MAX_STEP_SIZE for performance
* 3) Input and output arrays should have no overlap in memory
*/
#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
((steps[0] & (esizein-1)) == 0 && \
steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))

/*
 * Reduction: the input stride (steps[1]) equals the element size, both
 * pointers are element-aligned, and the accumulator args[0] sits at least
 * one vector register away from the input args[1].
 * Fix: line 2 was corrupted to "np BF01 y_is_aligned" (transfer-encoding
 * residue); restored the npy_is_aligned call, mirroring the args[0] check.
 */
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)))

/*
 * Fully contiguous, element-aligned binary op where the output either lies
 * at least one vector register away from each input or is exactly in-place
 * with it (distance 0).
 * Fix: the final clause read `abs_ptrdiff(args[2], args[1]) >= 0`, which is
 * vacuously true (abs_ptrdiff yields a non-negative value after the #define
 * at the top of this section), silently permitting partial overlap between
 * the output and the second input. It now uses `== 0` to match the
 * in-place check applied to args[0] two lines above.
 */
#define IS_BLOCKABLE_BINARY(esize, vsize) \
(steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)) && \
(abs_ptrdiff(args[2], args[0]) >= (vsize) || \
abs_ptrdiff(args[2], args[0]) == 0) && \
(abs_ptrdiff(args[2], args[1]) >= (vsize) || \
abs_ptrdiff(args[2], args[1]) == 0))

/*
 * Binary op whose first operand is a scalar (stride 0): the array operand
 * and output are contiguous and aligned, the output is either one vector
 * register away from the array input or exactly in-place, and the scalar
 * does not sit inside the first output element.
 */
#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
(steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
(abs_ptrdiff(args[2], args[1]) == 0)) && \
abs_ptrdiff(args[2], args[0]) >= (esize))

/* Mirror of IS_BLOCKABLE_BINARY_SCALAR1 with the second operand scalar. */
#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
(steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
(abs_ptrdiff(args[2], args[0]) == 0)) && \
abs_ptrdiff(args[2], args[1]) >= (esize))

#undef abs_ptrdiff

/*
 * Binary op producing a 1-byte (bool) output with stride 1; inputs are
 * contiguous and element-aligned. No overlap check — presumably safe
 * because the output element size differs; TODO(review) confirm callers.
 */
#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
(steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
npy_is_aligned(args[1], (esize)) && \
npy_is_aligned(args[0], (esize)))

/* As above with the first operand a scalar (stride 0). */
#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
(steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
npy_is_aligned(args[1], (esize)))

/* As above with the second operand a scalar (stride 0). */
#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
(steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
npy_is_aligned(args[0], (esize)))

/*
 * Scalar peel loop: declares the loop index `i` (shared by the two macros
 * below) and iterates element-by-element until `var` reaches `alignment`.
 * Expects the trip count `n` to be in scope.
 */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
alignment, n);\
for(i = 0; i < peel; i++)

/* Main vectorized loop: advances `i` by one vector's worth of elements. */
#define LOOP_BLOCKED(type, vsize)\
for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
i += (vsize / sizeof(type)))

/* Scalar tail loop handling the remaining `n - i` elements. */
#define LOOP_BLOCKED_END\
for (; i < n; i++)


#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */
Loading
0