more small-array performance improvements by juliantaylor · Pull Request #4904 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

more small-array performance improvements #4904

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion numpy/core/src/multiarray/ctors.c
Original file line number Diff line number Diff line change
Expand Up @@ -1080,7 +1080,8 @@ PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
* platforms (debian sparc) malloc does not provide enough alignment for
* long double types
*/
PyArray_UpdateFlags((PyArrayObject *)fa, NPY_ARRAY_UPDATE_ALL);
PyArray_UpdateFlags((PyArrayObject *)fa,
strides ? NPY_ARRAY_UPDATE_ALL : NPY_ARRAY_ALIGNED);

/*
* call the __array_finalize__
Expand Down
43 changes: 36 additions & 7 deletions numpy/core/src/umath/ufunc_object.c
Original file line number Diff line number Diff line change
Expand Up @@ -2774,17 +2774,24 @@ reduce_type_resolver(PyUFuncObject *ufunc, PyArrayObject *arr,
* resolution.
*/
if (odtype != NULL) {
type_tup = PyTuple_Pack(3, odtype, odtype, Py_None);
if (type_tup == NULL) {
return -1;
if (!(PyTypeNum_ISBOOL(odtype->type_num) &&
PyArray_TYPE(arr) == odtype->type_num) &&
!(PyTypeNum_ISFLOAT(odtype->type_num) &&
PyArray_TYPE(arr) == odtype->type_num &&
PyArray_DESCR(arr)->byteorder == odtype->byteorder &&
odtype->byteorder == NPY_NATIVE)) {
type_tup = PyTuple_Pack(3, odtype, odtype, Py_None);
if (type_tup == NULL) {
return -1;
}
}
}

/* Use the type resolution function to find our loop */
retcode = ufunc->type_resolver(
ufunc, NPY_UNSAFE_CASTING,
op, type_tup, dtypes);
Py_DECREF(type_tup);
Py_XDECREF(type_tup);
if (retcode == -1) {
return -1;
}
Expand Down Expand Up @@ -3954,8 +3961,9 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
* is used for add and multiply reduction to avoid overflow
*/
int typenum = PyArray_TYPE(mp);
if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum))
&& ((strcmp(ufunc->name,"add") == 0)
if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum)) &&
(ufunc->name[0] == 'a' || ufunc->name[0] == 'm') &&
((strcmp(ufunc->name,"add") == 0)
|| (strcmp(ufunc->name,"multiply") == 0))) {
if (PyTypeNum_ISBOOL(typenum)) {
typenum = NPY_LONG;
Expand Down Expand Up @@ -4481,6 +4489,7 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data,
int unused, const char *signature)
{
PyUFuncObject *ufunc;
npy_intp type;

if (nin + nout > NPY_MAXARGS) {
PyErr_Format(PyExc_ValueError,
Expand Down Expand Up @@ -4525,7 +4534,8 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data,
}
ufunc->doc = doc;

ufunc->op_flags = PyArray_malloc(sizeof(npy_uint32)*ufunc->nargs);
ufunc->op_flags = PyArray_malloc(sizeof(npy_uint32)*(ufunc->nargs +
NPY_NTYPES));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do this in a cleaner way? E.g.

typedef struct {
    PyUFuncObject public;
    /* extra fields here */
} PyUFuncObjectPrivate;

and then they can be cast back and forth to each other so long as we're careful to always allocate the full thing? (This does assume that subclassing ufunc is illegal though.)

Or if that's not doable for some reason, at least a few helpers to get/set this table?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, thinking about it, this approach does not really help, as we would also need some marker to indicate that the field exists.
Actually, that object has been extended a few times (e.g. in 1.7), so I guess nobody actually relies on its size not growing. So we could just stuff a pointer to a private object into the end.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, that also won't work, as we might get passed some user copy of the old structure...
Also, it would break forward compatibility with Cython; we don't provide that anyway, but it would be nice to avoid it for something that might turn out to be temporary.

I guess a second, independent lookup table based on the code generator would work.

if (ufunc->op_flags == NULL) {
return PyErr_NoMemory();
}
Expand All @@ -4546,6 +4556,25 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data,
return NULL;
}
}

/*
* stuff a jump index to the first entry of a certain dtype behind the public ABI
* the table allows faster lookup of inner loops
*/
for (type = 0; type < NPY_NTYPES; type++) {
npy_uint32 * typeskip = ufunc->op_flags + ufunc->nargs;
npy_intp j;
for (j = 0; j < ufunc->ntypes; j++) {
if (ufunc->types[j * ufunc->nargs] == type) {
typeskip[type] = j;
break;
}
}
/* no type registered, full scan required (can this even happen?) */
if (j == ufunc->ntypes) {
typeskip[type] = 0;
}
}
return (PyObject *)ufunc;
}

Expand Down
22 changes: 15 additions & 7 deletions numpy/core/src/umath/ufunc_type_resolution.c
Original file line number Diff line number Diff line change
Expand Up @@ -1283,15 +1283,14 @@ PyUFunc_DefaultLegacyInnerLoopSelector(PyUFuncObject *ufunc,
const char *ufunc_name;
PyObject *errmsg;
int i, j;

ufunc_name = ufunc->name ? ufunc->name : "(unknown)";
npy_uint32 offset;

/*
* If there are user-loops search them first.
* TODO: There needs to be a loop selection acceleration structure,
* like a hash table.
*/
if (ufunc->userloops) {
if (NPY_UNLIKELY(ufunc->userloops)) {
switch (find_userloop(ufunc, dtypes,
out_innerloop, out_innerloopdata)) {
/* Error */
Expand All @@ -1303,11 +1302,21 @@ PyUFunc_DefaultLegacyInnerLoopSelector(PyUFuncObject *ufunc,
}
}

/*
* op_flags has a jump index to the first entry of dtype[0] stuffed behind
* the public ABI
*/
if (NPY_LIKELY(dtypes[0]->type_num < NPY_NTYPES)) {
offset = ufunc->op_flags[nargs + dtypes[0]->type_num];
}
else {
offset = 0;
}
types = ufunc->types;
for (i = 0; i < ufunc->ntypes; ++i) {
for (i = offset; i < ufunc->ntypes; ++i) {
/* Copy the types into an int array for matching */
for (j = 0; j < nargs; ++j) {
if (types[j] != dtypes[j]->type_num) {
if (types[i * nargs + j] != dtypes[j]->type_num) {
break;
}
}
Expand All @@ -1316,10 +1325,9 @@ PyUFunc_DefaultLegacyInnerLoopSelector(PyUFuncObject *ufunc,
*out_innerloopdata = ufunc->data[i];
return 0;
}

types += nargs;
}

ufunc_name = ufunc->name ? ufunc->name : "(unknown)";
errmsg = PyUString_FromFormat("ufunc '%s' did not contain a loop "
"with signature matching types ", ufunc_name);
for (i = 0; i < nargs; ++i) {
Expand Down
0