Merge pull request #12011 from pierreglaser/implement-reduce-ex · numpy/numpy@2ed08ba · GitHub
[go: up one dir, main page]

Skip to content

Commit 2ed08ba

Browse files
authored
Merge pull request #12011 from pierreglaser/implement-reduce-ex
ENH: implementation of array_reduce_ex
2 parents 41d2428 + 97d2b97 commit 2ed08ba

File tree

4 files changed

+232
-0
lines changed

4 files changed

+232
-0
lines changed

doc/release/1.16.0-notes.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,15 @@ content to be read after ``skiprows``, as in `numpy.genfromtxt`.
104104
Improvements
105105
============
106106

107+
no-copy pickling of numpy arrays
108+
--------------------------------
109+
Up to protocol 4, numpy array pickling created 2 spurious copies of the data
110+
being serialized.
111+
With pickle protocol 5, and the ``PickleBuffer`` API, a large variety of numpy
112+
arrays can now be serialized without any copy using out-of-band buffers,
113+
and with one less copy using in-band buffers. This results, for large arrays,
114+
in an up to 66% drop in peak memory usage.
115+
107116
build shell independence
108117
------------------------
109118
NumPy builds should no longer interact with the host machine

numpy/core/numeric.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1934,6 +1934,10 @@ def fromfunction(function, shape, **kwargs):
19341934
return function(*args, **kwargs)
19351935

19361936

1937+
def _frombuffer(buf, dtype, shape, order):
1938+
return frombuffer(buf, dtype=dtype).reshape(shape, order=order)
1939+
1940+
19371941
def isscalar(num):
19381942
"""
19391943
Returns True if the type of `num` is a scalar type.

numpy/core/src/multiarray/methods.c

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1619,6 +1619,8 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
16191619
16201620
Notice because Python does not describe a mechanism to write
16211621
raw data to the pickle, this performs a copy to a string first
1622+
This issue is now addressed in protocol 5, where a buffer is serialized
1623+
instead of a string.
16221624
*/
16231625

16241626
state = PyTuple_New(5);
@@ -1651,6 +1653,132 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
16511653
return ret;
16521654
}
16531655

1656+
static PyObject *
1657+
array_reduce_ex(PyArrayObject *self, PyObject *args)
1658+
{
1659+
int protocol;
1660+
PyObject *ret = NULL, *numeric_mod = NULL, *from_buffer_func = NULL;
1661+
PyObject *buffer_tuple = NULL, *pickle_module = NULL, *pickle_class = NULL;
1662+
PyObject *class_args = NULL, *class_args_tuple = NULL, *unused = NULL;
1663+
PyObject *subclass_array_reduce = NULL;
1664+
PyObject *buffer = NULL, *transposed_array = NULL;
1665+
PyArray_Descr *descr = NULL;
1666+
char order;
1667+
1668+
if (PyArg_ParseTuple(args, "i", &protocol)){
1669+
descr = PyArray_DESCR(self);
1670+
if ((protocol < 5) ||
1671+
(!PyArray_IS_C_CONTIGUOUS((PyArrayObject*)self) &&
1672+
!PyArray_IS_F_CONTIGUOUS((PyArrayObject*)self)) ||
1673+
PyDataType_FLAGCHK(descr, NPY_ITEM_HASOBJECT) ||
1674+
(PyType_IsSubtype(((PyObject*)self)->ob_type, &PyArray_Type) &&
1675+
((PyObject*)self)->ob_type != &PyArray_Type) ||
1676+
PyDataType_ISUNSIZED(descr)) {
1677+
/* The PickleBuffer class from version 5 of the pickle protocol
1678+
* can only be used for arrays backed by a contiguous data buffer.
1679+
* For all other cases we fallback to the generic array_reduce
1680+
* method that involves using a temporary bytes allocation. However
1681+
* we do not call array_reduce directly but instead lookup and call
1682+
* the __reduce__ method to make sure that it's possible customize
1683+
* pickling in sub-classes. */
1684+
subclass_array_reduce = PyObject_GetAttrString((PyObject *)self,
1685+
"__reduce__");
1686+
return PyObject_CallObject(subclass_array_reduce, unused);
1687+
}
1688+
else if (protocol == 5){
1689+
ret = PyTuple_New(2);
1690+
1691+
if (ret == NULL) {
1692+
return NULL;
1693+
}
1694+
1695+
/* if the python version is below 3.8, the pickle module does not provide
1696+
* built-in support for protocol 5. We try importing the pickle5
1697+
* backport instead */
1698+
#if PY_VERSION_HEX >= 0x03080000
1699+
pickle_module = PyImport_ImportModule("pickle");
1700+
#elif PY_VERSION_HEX < 0x03080000 && PY_VERSION_HEX >= 0x03060000
1701+
pickle_module = PyImport_ImportModule("pickle5");
1702+
if (pickle_module == NULL){
1703+
/* for protocol 5, raise a clear ImportError if pickle5 is not found
1704+
*/
1705+
PyErr_SetString(PyExc_ImportError, "Using pickle protocol 5 "
1706+
"requires the pickle5 module for python versions >=3.6 "
1707+
"and <3.8");
1708+
return NULL;
1709+
}
1710+
#else
1711+
PyErr_SetString(PyExc_ValueError, "pickle protocol 5 is not available "
1712+
"for python versions < 3.6");
1713+
return NULL;
1714+
#endif
1715+
if (pickle_module == NULL){
1716+
return NULL;
1717+
}
1718+
1719+
pickle_class = PyObject_GetAttrString(pickle_module,
1720+
"PickleBuffer");
1721+
1722+
class_args_tuple = PyTuple_New(1);
1723+
if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*)self) &&
1724+
PyArray_IS_F_CONTIGUOUS((PyArrayObject*)self)){
1725+
1726+
/* if the array if Fortran-contiguous and not C-contiguous,
1727+
* the PickleBuffer instance will hold a view on the transpose
1728+
* of the initial array, that is C-contiguous. */
1729+
order = 'F';
1730+
transposed_array = PyArray_Transpose((PyArrayObject*)self, NULL);
1731+
PyTuple_SET_ITEM(class_args_tuple, 0, transposed_array);
1732+
}
1733+
else {
1734+
order = 'C';
1735+
PyTuple_SET_ITEM(class_args_tuple, 0, (PyObject *)self);
1736+
Py_INCREF(self);
1737+
}
1738+
1739+
class_args = Py_BuildValue("O", class_args_tuple);
1740+
1741+
buffer = PyObject_CallObject(pickle_class, class_args);
1742+
1743+
numeric_mod = PyImport_ImportModule("numpy.core.numeric");
1744+
if (numeric_mod == NULL) {
1745+
Py_DECREF(ret);
1746+
return NULL;
1747+
}
1748+
from_buffer_func = PyObject_GetAttrString(numeric_mod,
1749+
"_frombuffer");
1750+
Py_DECREF(numeric_mod);
1751+
1752+
Py_INCREF(descr);
1753+
1754+
buffer_tuple = PyTuple_New(4);
1755+
PyTuple_SET_ITEM(buffer_tuple, 0, buffer);
1756+
PyTuple_SET_ITEM(buffer_tuple, 1, (PyObject *)descr);
1757+
PyTuple_SET_ITEM(buffer_tuple, 2,
1758+
PyObject_GetAttrString((PyObject *)self,
1759+
"shape"));
1760+
PyTuple_SET_ITEM(buffer_tuple, 3,
1761+
PyUnicode_FromStringAndSize(&order,
1762+
(Py_ssize_t)1));
1763+
1764+
PyTuple_SET_ITEM(ret, 0, from_buffer_func);
1765+
PyTuple_SET_ITEM(ret, 1, buffer_tuple);
1766+
1767+
return ret;
1768+
}
1769+
else {
1770+
PyErr_Format(PyExc_ValueError,
1771+
"cannot call __reduce_ex__ with protocol >= %d",
1772+
5);
1773+
return NULL;
1774+
}
1775+
}
1776+
else {
1777+
return NULL;
1778+
}
1779+
1780+
}
1781+
16541782
static PyObject *
16551783
array_setstate(PyArrayObject *self, PyObject *args)
16561784
{
@@ -2524,6 +2652,9 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
25242652
{"__reduce__",
25252653
(PyCFunction) array_reduce,
25262654
METH_VARARGS, NULL},
2655+
{"__reduce_ex__",
2656+
(PyCFunction) array_reduce_ex,
2657+
METH_VARARGS, NULL},
25272658
{"__setstate__",
25282659
(PyCFunction) array_setstate,
25292660
METH_VARARGS, NULL},

numpy/core/tests/test_multiarray.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1382,6 +1382,21 @@ def test_pickle(self):
13821382

13831383
assert_equal(zs.dtype, zs2.dtype)
13841384

1385+
@pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5,
                    reason="requires pickle protocol 5")
def test_pickle_with_buffercallback(self):
    # Pickle an array with protocol 5, collecting the out-of-band buffers
    # through buffer_callback, then load it back from those same buffers.
    source = np.arange(10)
    out_of_band = []
    payload = pickle.dumps(source, buffer_callback=out_of_band.append,
                           protocol=5)
    restored = pickle.loads(payload, buffers=out_of_band)

    # The restored array is rebuilt from a buffer that views the source
    # array's data, so a write to the source has to be visible through
    # the restored array as well.
    source[0] = -1
    assert restored[0] == -1, restored[0]
1399+
13851400

13861401
class TestMethods(object):
13871402
def test_compress(self):
@@ -3562,6 +3577,79 @@ def test_highest_available_pickle_protocol(self):
35623577
else:
35633578
assert pickle.HIGHEST_PROTOCOL < 5
35643579

3580+
@pytest.mark.skipif(pickle.HIGHEST_PROTOCOL >= 5,
                    reason=('this tests the error messages when trying to '
                            'use protocol 5 although it is not available'))
def test_correct_protocol5_error_message(self):
    # Only runs when protocol 5 is unavailable: checks which exception
    # ndarray.__reduce_ex__(5) raises depending on the Python version.
    array = np.arange(10)

    if sys.version_info[:2] in ((3, 6), (3, 7)):
        # For the specific case of python3.6 and 3.7, raise a clear import
        # error about the pickle5 backport when trying to use protocol=5
        # without the pickle5 package
        with pytest.raises(ImportError):
            array.__reduce_ex__(5)

    elif sys.version_info[:2] < (3, 6):
        # when calling __reduce_ex__ explicitly with protocol=5 on python
        # versions older than 3.6, raise a ValueError saying that
        # protocol 5 is not available for this python version
        with pytest.raises(ValueError):
            array.__reduce_ex__(5)
3600+
3601+
def test_record_array_with_object_dtype(self):
3602+
my_object = object()
3603+
3604+
arr_with_object = np.array(
3605+
[(my_object, 1, 2.0)],
3606+
dtype=[('a', object), ('b', int), ('c', float)])
3607+
arr_without_object = np.array(
3608+
[('xxx', 1, 2.0)],
3609+
dtype=[('a', str), ('b', int), ('c', float)])
3610+
3611+
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
3612+
depickled_arr_with_object = pickle.loads(
3613+
pickle.dumps(arr_with_object, protocol=proto))
3614+
depickled_arr_without_object = pickle.loads(
3615+
pickle.dumps(arr_without_object, protocol=proto))
3616+
3617+
assert_equal(arr_with_object.dtype,
3618+
depickled_arr_with_object.dtype)
3619+
assert_equal(arr_without_object.dtype,
3620+
depickled_arr_without_object.dtype)
3621+
3622+
@pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5,
                    reason="requires pickle protocol 5")
def test_f_contiguous_array(self):
    # Round-trips a Fortran-ordered array through pickle protocol 5 with
    # a buffer callback and checks that the values survive.
    f_contiguous_array = np.array([[1, 2, 3], [4, 5, 6]], order='F')
    buffers = []

    # When using pickle protocol 5, Fortran-contiguous arrays can be
    # serialized using out-of-band buffers
    bytes_string = pickle.dumps(f_contiguous_array, protocol=5,
                                buffer_callback=buffers.append)

    assert len(buffers) > 0

    depickled_f_contiguous_array = pickle.loads(bytes_string,
                                                buffers=buffers)

    assert_equal(f_contiguous_array, depickled_f_contiguous_array)
3640+
def test_non_contiguous_array(self):
3641+
non_contiguous_array = np.arange(12).reshape(3, 4)[:, :2]
3642+
assert not non_contiguous_array.flags.c_contiguous
3643+
assert not non_contiguous_array.flags.f_contiguous
3644+
3645+
# make sure non-contiguous arrays can be pickled-depickled
3646+
# using any protocol
3647+
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
3648+
depickled_non_contiguous_array = pickle.loads(
3649+
pickle.dumps(non_contiguous_array, protocol=proto))
3650+
3651+
assert_equal(non_contiguous_array, depickled_non_contiguous_array)
3652+
35653653
def test_roundtrip(self):
35663654
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
35673655
carray = np.array([[2, 9], [7, 0], [3, 8]])

0 commit comments

Comments
 (0)
0