8000 ENH: create and use indexed inner loops by mattip · Pull Request #23136 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: create and use indexed inner loops #23136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Feb 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/bench_ufunc_strides.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def g(self,z,c):
return np.sum(np.multiply(z,z) + c)

def mandelbrot_numpy(self, c, maxiter):
output = np.zeros(c.shape, np.int)
output = np.zeros(c.shape, np.int32)
z = np.empty(c.shape, np.complex64)
for it in range(maxiter):
notdone = self.f(z)
Expand Down
17 changes: 17 additions & 0 deletions doc/release/upcoming_changes/23136.performance.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
``ufunc.at`` can be much faster
-------------------------------
Generic ``ufunc.at`` can be up to 9x faster. The conditions for this speedup:

- operands are aligned
- no casting

If ufuncs with appropriate indexed loops on 1d arguments with the above
conditions, ``ufunc.at`` can be up to 60x faster (an additional 7x speedup).
Appropriate indexed loops have been added to ``add``, ``subtract``,
``multiply``, ``divide`` (and ``floor_divide``)

The internal logic is similar to the logic used for regular ufuncs, which also
have fast paths.

Thanks to the `D. E. Shaw group <https://deshaw.com/>`_ for sponsoring this
work.
2 changes: 2 additions & 0 deletions numpy/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
"numpy-profile" if os.path.isfile(_pytest_ini) else "np.test() profile"
)

# The experimentalAPI is used in _umath_tests
os.environ["NUMPY_EXPERIMENTAL_DTYPE_API"] = "1"

def pytest_configure(config):
config.addinivalue_line("markers",
Expand Down
69 changes: 63 additions & 6 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
"""
Generate the code to build all the internal ufuncs. At the base is the defdict:
a dictionary ofUfunc classes. This is fed to make_code to generate
__umath_generated.c
"""
import os
import re
import struct
import sys
import textwrap
import argparse

# identity objects
Zero = "PyLong_FromLong(0)"
One = "PyLong_FromLong(1)"
True_ = "(Py_INCREF(Py_True), Py_True)"
Expand Down Expand Up @@ -121,11 +127,11 @@ def check_td_order(tds):
for prev_i, sign in enumerate(signatures[1:]):
if sign in signatures[:prev_i+1]:
continue # allow duplicates...

_check_order(signatures[prev_i], sign)


_fdata_map = dict(
_floatformat_map = dict(
e='npy_%sf',
f='npy_%sf',
d='npy_%s',
Expand All @@ -136,11 +142,13 @@ def check_td_order(tds):
)

def build_func_data(types, f):
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
func_data = [_floatformat_map.get(t, '%s') % (f,) for t in types]
return func_data

def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
simd=None, dispatch=None):
"""Generate a TypeDescription instance for each item in types
"""
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
Expand Down Expand Up @@ -188,12 +196,15 @@ class Ufunc:
----------
nin : number of input arguments
nout : number of output arguments
identity : identity element for a two-argument function
identity : identity element for a two-argument function (like Zero)
docstring : docstring for the ufunc
type_descriptions : list of TypeDescription objects
typereso: type resolver function of type PyUFunc_TypeResolutionFunc
type_descriptions : TypeDescription objects
signature: a generalized ufunc signature (like for matmul)
indexed: add indexed loops (ufunc.at) for these type characters
"""
def __init__(self, nin, nout, identity, docstring, typereso,
*type_descriptions, signature=None):
*type_descriptions, signature=None, indexed=''):
self.nin = nin
self.nout = nout
if identity is None:
Expand All @@ -203,6 +214,7 @@ def __init__(self, nin, nout, identity, docstring, typereso,
self.typereso = typereso
self.type_descriptions = []
self.signature = signature
self.indexed = indexed
for td in type_descriptions:
self.type_descriptions.extend(td)
for td in self.type_descriptions:
Expand Down Expand Up @@ -347,6 +359,7 @@ def english_upper(s):
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
],
TD(O, f='PyNumber_Add'),
indexed=flts + ints
),
'subtract':
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
Expand All @@ -359,6 +372,7 @@ def english_upper(s):
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
],
TD(O, f='PyNumber_Subtract'),
indexed=flts + ints
),
'multiply':
Ufunc(2, 1, One,
Expand All @@ -374,6 +388,7 @@ def english_upper(s):
TypeDescription('m', FullTypeDescr, 'dm', 'm'),
],
TD(O, f='PyNumber_Multiply'),
indexed=flts + ints
),
#'true_divide' : aliased to divide in umathmodule.c:initumath
'floor_divide':
Expand All @@ -388,6 +403,7 @@ def english_upper(s):
TypeDescription('m', FullTypeDescr, 'mm', 'q'),
],
TD(O, f='PyNumber_FloorDivide'),
indexed=flts + ints
),
'divide':
Ufunc(2, 1, None, # One is only a unit to the right, not the left
Expand All @@ -399,6 +415,7 @@ def english_upper(s):
TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
],
TD(O, f='PyNumber_TrueDivide'),
indexed=flts
),
'conjugate':
Ufunc(1, 1, None,
Expand Down Expand Up @@ -1255,6 +1272,40 @@ def make_ufuncs(funcdict):
if uf.typereso is not None:
mlist.append(
r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso)
for c in uf.indexed:
# Handle indexed loops by getting the underlying ArrayMethodObject
# from the list in f._loops and setting its field appropriately
fmt = textwrap.dedent("""
{{
PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum({typenum});
PyObject *info = get_info_no_cast((PyUFuncObject *)f,
dtype, {count});
if (info == NULL) {{
return -1;
}}
if (info == Py_None) {{
PyErr_SetString(PyExc_RuntimeError,
"cannot add indexed loop to ufunc "
"{name} with {typenum}");
return -1;
}}
if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {{
PyErr_SetString(PyExc_RuntimeError,
"Not a PyArrayMethodObject in ufunc "
"{name} with {typenum}");
}}
((PyArrayMethodObject*)info)->contiguous_indexed_loop =
{funcname};
/* info is borrowed, no need to decref*/
}}
""")
mlist.append(fmt.format(
typenum=f"NPY_{english_upper(chartoname[c])}",
count=uf.nin+uf.nout,
name=name,
funcname = f"{english_upper(chartoname[c])}_{name}_indexed",
))

mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name)
mlist.append(r"""Py_DECREF(f);""")
code3list.append('\n'.join(mlist))
Expand All @@ -1277,8 +1328,14 @@ def make_code(funcdict, filename):
#include "loops.h"
#include "matmul.h"
#include "clip.h"
#include "dtypemeta.h"
#include "_umath_doc_generated.h"

%s
/* Returns a borrowed ref of the second value in the matching info tuple */
PyObject *
get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
int ndtypes);

static int
InitOperators(PyObject *dictionary) {
Expand Down
3 changes: 2 additions & 1 deletion numpy/core/include/numpy/experimental_dtype_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ typedef NPY_CASTING (resolve_descriptors_function)(
#define NPY_METH_contiguous_loop 5
#define NPY_METH_unaligned_strided_loop 6
#define NPY_METH_unaligned_contiguous_loop 7
#define NPY_METH_contiguous_indexed_loop 8


typedef struct {
Expand Down Expand Up @@ -488,7 +489,7 @@ PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType)
*/
#if !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY)

#define __EXPERIMENTAL_DTYPE_VERSION 6
#define __EXPERIMENTAL_DTYPE_VERSION 7

static int
import_experimental_dtype_api(int version)
Expand Down
4 changes: 2 additions & 2 deletions numpy/core/src/multiarray/argfunc.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b);
npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d);
npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd));
if (nnan != ((1LL << vstep) - 1)) {
if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
npy_uint64 nnan_4[4];
nnan_4[0] = npyv_tobits_@bsfx@(nnan_a);
nnan_4[1] = npyv_tobits_@bsfx@(nnan_b);
Expand All @@ -219,7 +219,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
#if @is_fp@
npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a);
if (nnan != ((1LL << vstep) - 1)) {
if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
for (int vi = 0; vi < vstep; ++vi) {
if (!((nnan >> vi) & 1)) {
return i + vi;
Expand Down
5 changes: 4 additions & 1 deletion numpy/core/src/multiarray/array_method.c
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,9 @@ fill_arraymethod_from_slots(
case NPY_METH_get_reduction_initial:
meth->get_reduction_initial = slot->pfunc;
continue;
case NPY_METH_contiguous_indexed_loop:
meth->contiguous_indexed_loop = slot->pfunc;
continue;
default:
break;
}
Expand Down Expand Up @@ -891,7 +894,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context,
/*
* Fetches a strided-loop function that supports a boolean mask as additional
* (last) operand to the strided-loop. It is otherwise largely identical to
* the `get_loop` method which it wraps.
* the `get_strided_loop` method which it wraps.
* This is the core implementation for the ufunc `where=...` keyword argument.
*
* NOTE: This function does not support `move_references` or inner dimensions.
Expand Down
4 changes: 3 additions & 1 deletion numpy/core/src/multiarray/array_method.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ struct PyArrayMethodObject_tag;
* TODO: Before making this public, we should review which information should
* be stored on the Context/BoundArrayMethod vs. the ArrayMethod.
*/
typedef struct {
typedef struct PyArrayMethod_Context_tag {
PyObject *caller; /* E.g. the original ufunc, may be NULL */
struct PyArrayMethodObject_tag *method;

Expand Down Expand Up @@ -223,6 +223,7 @@ typedef struct PyArrayMethodObject_tag {
PyArrayMethod_StridedLoop *contiguous_loop;
PyArrayMethod_StridedLoop *unaligned_strided_loop;
PyArrayMethod_StridedLoop *unaligned_contiguous_loop;
PyArrayMethod_StridedLoop *contiguous_indexed_loop;
/* Chunk only used for wrapping array method defined in umath */
struct PyArrayMethodObject_tag *wrapped_meth;
PyArray_DTypeMeta **wrapped_dtypes;
Expand Down Expand Up @@ -266,6 +267,7 @@ extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
#define NPY_METH_contiguous_loop 5
#define NPY_METH_unaligned_strided_loop 6
#define NPY_METH_unaligned_contiguous_loop 7
#define NPY_METH_contiguous_indexed_loop 8


/*
Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/multiarray/experimental_public_dtype_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "common_dtype.h"


#define EXPERIMENTAL_DTYPE_API_VERSION 6
#define EXPERIMENTAL_DTYPE_API_VERSION 7


typedef struct{
Expand Down
Loading
0