ENH: implementation of array_reduce_ex by pierreglaser · Pull Request #12011 · numpy/numpy · GitHub

ENH: implementation of array_reduce_ex #12011


Merged · 2 commits · Oct 10, 2018
9 changes: 9 additions & 0 deletions doc/release/1.16.0-notes.rst
@@ -104,6 +104,15 @@ content to be read after ``skiprows``, as in `numpy.genfromtxt`.
Improvements
============

no-copy pickling of numpy arrays
--------------------------------
Up to protocol 4, pickling a numpy array created two spurious copies of the
data being serialized.
With pickle protocol 5 and the ``PickleBuffer`` API, a large variety of numpy
arrays can now be serialized without any copy using out-of-band buffers,
and with one fewer copy using in-band buffers. For large arrays, this results
in a drop of up to 66% in peak memory usage.
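
As a minimal sketch of what the note above describes, assuming Python 3.8+ so that the standard pickle module supports protocol 5:

import pickle
import numpy as np

arr = np.arange(10)
buffers = []
# With a buffer_callback, the array data is handed out as a PickleBuffer
# object instead of being copied into the pickle byte stream.
payload = pickle.dumps(arr, protocol=5, buffer_callback=buffers.append)
restored = pickle.loads(payload, buffers=buffers)
assert np.array_equal(arr, restored)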

build shell independence
------------------------
NumPy builds should no longer interact with the host machine
4 changes: 4 additions & 0 deletions numpy/core/numeric.py
@@ -1934,6 +1934,10 @@ def fromfunction(function, shape, **kwargs):
return function(*args, **kwargs)


def _frombuffer(buf, dtype, shape, order):
return frombuffer(buf, dtype=dtype).reshape(shape, order=order)


def isscalar(num):
"""
Returns True if the type of `num` is a scalar type.
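A minimal sketch of how the private _frombuffer helper added above rebuilds an array from a zero-copy buffer; it assumes Python 3.8+ for pickle.PickleBuffer, and importing the private helper directly is for illustration only:

import pickle
import numpy as np
from numpy.core.numeric import _frombuffer

arr = np.arange(6, dtype=np.int64).reshape(2, 3)
buf = pickle.PickleBuffer(arr)  # zero-copy view on arr's data
rebuilt = _frombuffer(buf, arr.dtype, arr.shape, 'C')
assert np.array_equal(arr, rebuilt)
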
131 changes: 131 additions & 0 deletions numpy/core/src/multiarray/methods.c
@@ -1619,6 +1619,8 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))

Notice that because Python does not describe a mechanism to write
raw data to the pickle, this performs a copy to a string first.
This issue is now addressed in protocol 5, where a buffer is serialized
instead of a string.
*/

state = PyTuple_New(5);
@@ -1651,6 +1653,132 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
return ret;
}

static PyObject *
array_reduce_ex(PyArrayObject *self, PyObject *args)
{
int protocol;
PyObject *ret = NULL, *numeric_mod = NULL, *from_buffer_func = NULL;
PyObject *buffer_tuple = NULL, *pickle_module = NULL, *pickle_class = NULL;
PyObject *class_args = NULL, *class_args_tuple = NULL, *unused = NULL;
PyObject *subclass_array_reduce = NULL;
PyObject *buffer = NULL, *transposed_array = NULL;
PyArray_Descr *descr = NULL;
char order;

if (PyArg_ParseTuple(args, "i", &protocol)){
descr = PyArray_DESCR(self);
if ((protocol < 5) ||
(!PyArray_IS_C_CONTIGUOUS((PyArrayObject*)self) &&
!PyArray_IS_F_CONTIGUOUS((PyArrayObject*)self)) ||
PyDataType_FLAGCHK(descr, NPY_ITEM_HASOBJECT) ||
(PyType_IsSubtype(((PyObject*)self)->ob_type, &PyArray_Type) &&
((PyObject*)self)->ob_type != &PyArray_Type) ||
PyDataType_ISUNSIZED(descr)) {
/* The PickleBuffer class from version 5 of the pickle protocol
* can only be used for arrays backed by a contiguous data buffer.
* For all other cases we fall back to the generic array_reduce
* method, which involves using a temporary bytes allocation. However,
* we do not call array_reduce directly but instead look up and call
* the __reduce__ method, to make sure that it is possible to customize
* pickling in subclasses. */
subclass_array_reduce = PyObject_GetAttrString((PyObject *)self,
"__reduce__");
return PyObject_CallObject(subclass_array_reduce, unused);
}
else if (protocol == 5){
ret = PyTuple_New(2);

if (ret == NULL) {
return NULL;
}

/* If the Python version is below 3.8, the pickle module does not provide
* built-in support for protocol 5. We try importing the pickle5
* backport instead. */
#if PY_VERSION_HEX >= 0x03080000
pickle_module = PyImport_ImportModule("pickle");
#elif PY_VERSION_HEX < 0x03080000 && PY_VERSION_HEX >= 0x03060000
pickle_module = PyImport_ImportModule("pickle5");
if (pickle_module == NULL){
/* for protocol 5, raise a clear ImportError if pickle5 is not found
*/
PyErr_SetString(PyExc_ImportError, "Using pickle protocol 5 "
"requires the pickle5 module for python versions >=3.6 "
"and <3.8");
return NULL;
}
#else
PyErr_SetString(PyExc_ValueError, "pickle protocol 5 is not available "
"for python versions < 3.6");
return NULL;
#endif
if (pickle_module == NULL){
return NULL;
}

pickle_class = PyObject_GetAttrString(pickle_module,
"PickleBuffer");

class_args_tuple = PyTuple_New(1);
if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*)self) &&
PyArray_IS_F_CONTIGUOUS((PyArrayObject*)self)){

/* if the array is Fortran-contiguous and not C-contiguous,
* the PickleBuffer instance will hold a view on the transpose
* of the initial array, which is C-contiguous. */
order = 'F';
transposed_array = PyArray_Transpose((PyArrayObject*)self, NULL);
PyTuple_SET_ITEM(class_args_tuple, 0, transposed_array);
}
else {
order = 'C';
PyTuple_SET_ITEM(class_args_tuple, 0, (PyObject *)self);
Py_INCREF(self);
}

class_args = Py_BuildValue("O", class_args_tuple);

buffer = PyObject_CallObject(pickle_class, class_args);

numeric_mod = PyImport_ImportModule("numpy.core.numeric");
if (numeric_mod == NULL) {
Py_DECREF(ret);
return NULL;
}
from_buffer_func = PyObject_GetAttrString(numeric_mod,
"_frombuffer");
Py_DECREF(numeric_mod);

Py_INCREF(descr);

buffer_tuple = PyTuple_New(4);
PyTuple_SET_ITEM(buffer_tuple, 0, buffer);
PyTuple_SET_ITEM(buffer_tuple, 1, (PyObject *)descr);
PyTuple_SET_ITEM(buffer_tuple, 2,
PyObject_GetAttrString((PyObject *)self,
"shape"));
PyTuple_SET_ITEM(buffer_tuple, 3,
PyUnicode_FromStringAndSize(&order,
(Py_ssize_t)1));

PyTuple_SET_ITEM(ret, 0, from_buffer_func);
PyTuple_SET_ITEM(ret, 1, buffer_tuple);

return ret;
}
else {
PyErr_Format(PyExc_ValueError,
"cannot call __reduce_ex__ with protocol >= %d",
5);
return NULL;
}
}
else {
return NULL;
}

}

static PyObject *
array_setstate(PyArrayObject *self, PyObject *args)
{
@@ -2524,6 +2652,9 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
{"__reduce__",
(PyCFunction) array_reduce,
METH_VARARGS, NULL},
{"__reduce_ex__",
(PyCFunction) array_reduce_ex,
METH_VARARGS, NULL},
{"__setstate__",
(PyCFunction) array_setstate,
METH_VARARGS, NULL},
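A rough Python-level sketch of the protocol-5 branch implemented above, for a plain, C-contiguous, non-object ndarray; the Fortran-order handling, the pickle5 backport logic and the error paths of the C code are omitted, and Python 3.8+ is assumed:

import pickle
import numpy as np
from numpy.core.numeric import _frombuffer

def reduce_ex_sketch(arr, protocol):
    # Out-of-band path: only for base-class, contiguous, non-object arrays.
    if (protocol == 5 and type(arr) is np.ndarray
            and arr.flags.c_contiguous and not arr.dtype.hasobject):
        buf = pickle.PickleBuffer(arr)  # no-copy view on the array data
        return (_frombuffer, (buf, arr.dtype, arr.shape, 'C'))
    # Everything else falls back to the classic __reduce__ machinery.
    return arr.__reduce__()
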
88 changes: 88 additions & 0 deletions numpy/core/tests/test_multiarray.py
@@ -1382,6 +1382,21 @@ def test_pickle(self):

assert_equal(zs.dtype, zs2.dtype)

@pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5,
reason="requires pickle protocol 5")
def test_pickle_with_buffercallback(self):
array = np.arange(10)
buffers = []
bytes_string = pickle.dumps(array, buffer_callback=buffers.append,
protocol=5)
@ogrisel (Contributor) commented on Oct 11, 2018:

@pitrou, the buffer_callback example in the current draft of the PEP suggests that it can accept a list of buffers, using buffer_callback=buffers.extend. However, the implementation we test here only passes a single buffer at a time. This is in line with this section of the PEP: https://www.python.org/dev/peps/pep-0574/#passing-a-sequence-of-buffers-in-buffer-callback

I think the example of the PEP should be fixed.

Member replied:
Thanks! You're right, I forgot to fix this.

array_from_buffer = pickle.loads(bytes_string, buffers=buffers)
# when using pickle protocol 5 with buffer callbacks,
# array_from_buffer is reconstructed from a buffer holding a view
# to the initial array's data, so modifying an element in array
# should modify it in array_from_buffer too.
array[0] = -1
assert array_from_buffer[0] == -1, array_from_buffer[0]


class TestMethods(object):
def test_compress(self):
Expand Down Expand Up @@ -3562,6 +3577,79 @@ def test_highest_available_pickle_protocol(self):
else:
assert pickle.HIGHEST_PROTOCOL < 5

@pytest.mark.skipif(pickle.HIGHEST_PROTOCOL >= 5,
reason=('this tests the error messages when trying to '
'use protocol 5 although it is not available'))
def test_correct_protocol5_error_message(self):
array = np.arange(10)
f = io.BytesIO()

if sys.version_info[:2] in ((3, 6), (3, 7)):
# For the specific case of Python 3.6 and 3.7, raise a clear
# ImportError about the pickle5 backport when trying to use protocol=5
# without the pickle5 package
with pytest.raises(ImportError):
array.__reduce_ex__(5)

elif sys.version_info[:2] < (3, 6):
# when calling __reduce_ex__ explicitly with protocol=5 on Python
# versions older than 3.6, raise a ValueError saying that protocol 5
# is not available for this Python version
with pytest.raises(ValueError):
array.__reduce_ex__(5)

def test_record_array_with_object_dtype(self):
my_object = object()

arr_with_object = np.array(
[(my_object, 1, 2.0)],
dtype=[('a', object), ('b', int), ('c', float)])
arr_without_object = np.array(
[('xxx', 1, 2.0)],
dtype=[('a', str), ('b', int), ('c', float)])

for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
depickled_arr_with_object = pickle.loads(
pickle.dumps(arr_with_object, protocol=proto))
depickled_arr_without_object = pickle.loads(
pickle.dumps(arr_without_object, protocol=proto))

assert_equal(arr_with_object.dtype,
depickled_arr_with_object.dtype)
assert_equal(arr_without_object.dtype,
depickled_arr_without_object.dtype)

@pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5,
reason="requires pickle protocol 5")
def test_f_contiguous_array(self):
f_contiguous_array = np.array([[1, 2, 3], [4, 5, 6]], order='F')
buffers = []

# When using pickle protocol 5, Fortran-contiguous arrays can be
# serialized using out-of-band buffers
bytes_string = pickle.dumps(f_contiguous_array, protocol=5,
buffer_callback=buffers.append)

assert len(buffers) > 0

depickled_f_contiguous_array = pickle.loads(bytes_string,
buffers=buffers)

assert_equal(f_contiguous_array, depickled_f_contiguous_array)

def test_non_contiguous_array(self):
non_contiguous_array = np.arange(12).reshape(3, 4)[:, :2]
assert not non_contiguous_array.flags.c_contiguous
assert not non_contiguous_array.flags.f_contiguous

# make sure non-contiguous arrays can be pickled-depickled
# using any protocol
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
depickled_non_contiguous_array = pickle.loads(
pickle.dumps(non_contiguous_array, protocol=proto))

assert_equal(non_contiguous_array, depickled_non_contiguous_array)

def test_roundtrip(self):
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
carray = np.array([[2, 9], [7, 0], [3, 8]])