-
-
Notifications
You must be signed in to change notification settings - Fork 10.9k
PERF: improve multithreaded ufunc scaling #27896
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f019103
8c68479
7b044a6
cee2892
1a1ecfc
c7b9b76
f2b7cc1
6146085
e150832
b0f744c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* Improved multithreaded scaling on the free-threaded build when many threads | ||
simultaneously call the same ufunc operations. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,9 @@ | |
* case is likely desired. | ||
*/ | ||
|
||
#include <mutex> | ||
#include <shared_mutex> | ||
|
||
#include "templ_common.h" | ||
#include "npy_hashtable.h" | ||
|
||
|
@@ -89,7 +92,7 @@ find_item(PyArrayIdentityHash const *tb, PyObject *const *key) | |
NPY_NO_EXPORT PyArrayIdentityHash * | ||
PyArrayIdentityHash_New(int key_len) | ||
{ | ||
PyArrayIdentityHash *res = PyMem_Malloc(sizeof(PyArrayIdentityHash)); | ||
PyArrayIdentityHash *res = (PyArrayIdentityHash *)PyMem_Malloc(sizeof(PyArrayIdentityHash)); | ||
if (res == NULL) { | ||
PyErr_NoMemory(); | ||
return NULL; | ||
|
@@ -100,12 +103,21 @@ PyArrayIdentityHash_New(int key_len) | |
res->size = 4; /* Start with a size of 4 */ | ||
res->nelem = 0;< 2364 /td> | ||
|
||
res->buckets = PyMem_Calloc(4 * (key_len + 1), sizeof(PyObject *)); | ||
res->buckets = (PyObject **)PyMem_Calloc(4 * (key_len + 1), sizeof(PyObject *)); | ||
if (res->buckets == NULL) { | ||
PyErr_NoMemory(); | ||
PyMem_Free(res); | ||
return NULL; | ||
} | ||
|
||
#ifdef Py_GIL_DISABLED | ||
res->mutex = new(std::nothrow) std::shared_mutex(); | ||
if (res->mutex == nullptr) { | ||
PyErr_NoMemory(); | ||
PyMem_Free(res); | ||
return NULL; | F438 tr>||
} | ||
#endif | ||
return res; | ||
} | ||
|
||
|
@@ -115,6 +127,9 @@ PyArrayIdentityHash_Dealloc(PyArrayIdentityHash *tb) | |
{ | ||
PyMem_Free(tb->buckets); | ||
PyMem_Free(tb); | ||
#ifdef Py_GIL_DISABLED | ||
delete (std::shared_mutex *)tb->mutex; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems OK to assume that |
||
#endif | ||
} | ||
|
||
|
||
|
@@ -149,7 +164,7 @@ _resize_if_necessary(PyArrayIdentityHash *tb) | |
if (npy_mul_sizes_with_overflow(&alloc_size, new_size, tb->key_len + 1)) { | ||
return -1; | ||
} | ||
tb->buckets = PyMem_Calloc(alloc_size, sizeof(PyObject *)); | ||
tb->buckets = (PyObject **)PyMem_Calloc(alloc_size, sizeof(PyObject *)); | ||
if (tb->buckets == NULL) { | ||
tb->buckets = old_table; | ||
PyErr_NoMemory(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,10 @@ | |
#include "npy_import.h" | ||
#include <limits.h> | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
#define error_converting(x) (((x) == -1) && PyErr_Occurred()) | ||
|
||
#ifdef NPY_ALLOW_THREADS | ||
|
@@ -104,13 +108,13 @@ check_and_adjust_index(npy_intp *index, npy_intp max_item, int axis, | |
/* Try to be as clear as possible about what went wrong. */ | ||
if (axis >= 0) { | ||
PyErr_Format(PyExc_IndexError, | ||
"index %"NPY_INTP_FMT" is out of bounds " | ||
"for axis %d with size %"NPY_INTP_FMT, | ||
"index %" NPY_INTP_FMT" is out of bounds " | ||
"for axis %d with size %" NPY_INTP_FMT, | ||
*index, axis, max_item); | ||
} else { | ||
PyErr_Format(PyExc_IndexError, | ||
"index %"NPY_INTP_FMT" is out of bounds " | ||
"for size %"NPY_INTP_FMT, *index, max_item); | ||
"index %" NPY_INTP_FMT " is out of bounds " | ||
"for size %" NPY_INTP_FMT, *index, max_item); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should just replace all |
||
} | ||
return -1; | ||
} | ||
|
@@ -163,7 +167,9 @@ check_and_adjust_axis(int *axis, int ndim) | |
* <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52023>. | ||
* clang versions < 8.0.0 have the same bug. | ||
*/ | ||
#if (!defined __STDC_VERSION__ || __STDC_VERSION__ < 201112 \ | ||
#ifdef __cplusplus | ||
#define NPY_ALIGNOF(type) alignof(type) | ||
There was a problem hiding this comment. Wondering if we shouldn't just use |
||
#elif (!defined __STDC_VERSION__ || __STDC_VERSION__ < 201112 \ | ||
|| (defined __GNUC__ && __GNUC__ < 4 + (__GNUC_MINOR__ < 9) \ | ||
&& !defined __clang__) \ | ||
|| (defined __clang__ && __clang_major__ < 8)) | ||
|
@@ -347,4 +353,8 @@ new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out, | |
*/ | ||
#define NPY_ITER_REDUCTION_AXIS(axis) (axis + (1 << (NPY_BITSOF_INT - 2))) | ||
|
||
#ifdef __cplusplus | ||
} | ||
#endif | ||
|
||
#endif /* NUMPY_CORE_SRC_MULTIARRAY_COMMON_H_ */ |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,9 @@ | |
#define _MULTIARRAYMODULE | ||
#define _UMATHMODULE | ||
|
||
#include <mutex> | ||
#include <shared_mutex> | ||
|
||
#define PY_SSIZE_T_CLEAN | ||
#include <Python.h> | ||
#include <convert_datatype.h> | ||
|
@@ -504,8 +507,9 @@ call_promoter_and_recurse(PyUFuncObject *ufunc, PyObject *info, | |
PyObject *promoter = PyTuple_GET_ITEM(info, 1); | ||
if (PyCapsule_CheckExact(promoter)) { | ||
/* We could also go the other way and wrap up the python function... */ | ||
PyArrayMethod_PromoterFunction *promoter_function = PyCapsule_GetPointer( | ||
promoter, "numpy._ufunc_promoter"); | ||
PyArrayMethod_PromoterFunction *promoter_function = | ||
(PyArrayMethod_PromoterFunction *)PyCapsule_GetPointer( | ||
promoter, "numpy._ufunc_promoter"); | ||
if (promoter_function == NULL) { | ||
return NULL; | ||
} | ||
|
@@ -770,8 +774,9 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, | |
* 2. Check all registered loops/promoters to find the best match. | ||
* 3. Fall back to the legacy implementation if no match was found. | ||
*/ | ||
PyObject *info = PyArrayIdentityHash_GetItem(ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes); | ||
PyObject *info = PyArrayIdentityHash_GetItem( | ||
(PyArrayIdentityHash *)ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes); | ||
if (info != NULL && PyObject_TypeCheck( | ||
PyTuple_GET_ITEM(info, 1), &PyArrayMethod_Type)) { | ||
/* Found the ArrayMethod and NOT a promoter: return it */ | ||
|
@@ -793,8 +798,9 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, | |
* Found the ArrayMethod and NOT promoter. Before returning it | ||
* add it to the cache for faster lookup in the future. | ||
*/ | ||
if (PyArrayIdentityHash_SetItem(ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes, info, 0) < 0) { | ||
if (PyArrayIdentityHash_SetItem( | ||
(PyArrayIdentityHash *)ufunc->_dispatch_cache, | ||
|
||
return NULL; | ||
} | ||
return info; | ||
|
@@ -815,8 +821,9 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, | |
} | ||
else if (info != NULL) { | ||
/* Add result to the cache using the original types: */ | ||
if (PyArrayIdentityHash_SetItem(ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes, info, 0) < 0) { | ||
if (PyArrayIdentityHash_SetItem( | ||
(PyArrayIdentityHash *)ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes, info, 0) < 0) { | ||
return NULL; | ||
} | ||
return info; | ||
|
@@ -882,13 +889,51 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, | |
} | ||
|
||
/* Add this to the cache using the original types: */ | ||
if (cacheable && PyArrayIdentityHash_SetItem(ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes, info, 0) < 0) { | ||
if (cacheable && PyArrayIdentityHash_SetItem( | ||
(PyArrayIdentityHash *)ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes, info, 0) < 0) { | ||
return NULL; | ||
} | ||
return info; | ||
} | ||
|
||
#ifdef Py_GIL_DISABLED | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry for the drive-by comment, but If you're calling There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So you may want to wrap calls to |
||
/* | ||
* Fast path for promote_and_get_info_and_ufuncimpl. | ||
* Acquires a read lock to check for a cache hit and then | ||
* only acquires a write lock on a cache miss to fill the cache | ||
*/ | ||
static inline PyObject * | ||
promote_and_get_info_and_ufuncimpl_with_locking( | ||
PyUFuncObject *ufunc, | ||
PyArrayObject *const ops[], | ||
PyArray_DTypeMeta *signature[], | ||
PyArray_DTypeMeta *op_dtypes[], | ||
npy_bool legacy_promotion_is_possible) | ||
{ | ||
std::shared_mutex *mutex = ((std::shared_mutex *)((PyArrayIdentityHash *)ufunc->_dispatch_cache)->mutex); | ||
mutex->lock_shared(); | ||
PyObject *info = PyArrayIdentityHash_GetItem( | ||
(PyArrayIdentityHash *)ufunc->_dispatch_cache, | ||
(PyObject **)op_dtypes); | ||
mutex->unlock_shared(); | ||
|
||
if (info != NULL && PyObject_TypeCheck( | ||
PyTuple_GET_ITEM(info, 1), &PyArrayMethod_Type)) { | ||
/* Found the ArrayMethod and NOT a promoter: return it */ | ||
return info; | ||
} | ||
|
||
// cache miss, need to acquire a write lock and recursively calculate the | ||
// correct dispatch resolution | ||
mutex->lock(); | ||
info = promote_and_get_info_and_ufuncimpl(ufunc, | ||
ops, signature, op_dtypes, legacy_promotion_is_possible); | ||
mutex->unlock(); | ||
|
||
return info; | ||
} | ||
#endif | ||
|
||
/** | ||
* The central entry-point for the promotion and dispatching machinery. | ||
|
@@ -941,6 +986,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, | |
{ | ||
int nin = ufunc->nin, nargs = ufunc->nargs; | ||
npy_bool legacy_promotion_is_possible = NPY_TRUE; | ||
PyObject *all_dtypes = NULL; | ||
PyArrayMethodObject *method = NULL; | ||
|
||
/* | ||
* Get the actual DTypes we operate with by setting op_dtypes[i] from | ||
|
@@ -976,18 +1023,20 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, | |
} | ||
} | ||
|
||
PyObject *info; | ||
Py_BEGIN_CRITICAL_SECTION((PyObject *)ufunc); | ||
info = promote_and_get_info_and_ufuncimpl(ufunc, | ||
#ifdef Py_GIL_DISABLED | ||
PyObject *info = promote_and_get_info_and_ufuncimpl_with_locking(ufunc, | ||
ops, signature, op_dtypes, legacy_promotion_is_possible); | ||
#else | ||
PyObject *info = promote_and_get_info_and_ufuncimpl(ufunc, | ||
ops, signature, op_dtypes, legacy_promotion_is_possible); | ||
Py_END_CRITICAL_SECTION(); | ||
#endif | ||
|
||
if (info == NULL) { | ||
goto handle_error; | ||
} | ||
|
||
PyArrayMethodObject *method = (PyArrayMethodObject *)PyTuple_GET_ITEM(info, 1); | ||
PyObject *all_dtypes = PyTuple_GET_ITEM(info, 0); | ||
method = (PyArrayMethodObject *)PyTuple_GET_ITEM(info, 1); | ||
all_dtypes = PyTuple_GET_ITEM(info, 0); | ||
|
||
/* | ||
* In certain cases (only the logical ufuncs really), the loop we found may | ||
|
@@ -1218,7 +1267,7 @@ install_logical_ufunc_promoter(PyObject *ufunc) | |
if (dtype_tuple == NULL) { | ||
return -1; | ||
} | ||
PyObject *promoter = PyCapsule_New(&logical_ufunc_promoter, | ||
PyObject *promoter = PyCapsule_New((void *)&logical_ufunc_promoter, | ||
"numpy._ufunc_promoter", NULL); | ||
if (promoter == NULL) { | ||
Py_DECREF(dtype_tuple); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not totally clear to me if originally not including this function in
dispatching.h
and forward-declaring it here was an intentional decision for some reason