diff --git a/doc/release/1.16.0-notes.rst b/doc/release/1.16.0-notes.rst index cc69b245d01d..f2c8f8dc2df3 100644 --- a/doc/release/1.16.0-notes.rst +++ b/doc/release/1.16.0-notes.rst @@ -104,6 +104,15 @@ content to be read after ``skiprows``, as in `numpy.genfromtxt`. Improvements ============ +no-copy pickling of numpy arrays +-------------------------------- +Up to protocol 4, numpy array pickling created 2 spurious copies of the data +being serialized. +With pickle protocol 5, and the ``PickleBuffer`` API, a large variety of numpy +arrays can now be serialized without any copy using out-of-band buffers, +and with one less copy using in-band buffers. This results, for large arrays, +in an up to 66% drop in peak memory usage. + build shell independence ------------------------ NumPy builds should no longer interact with the host machine diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py index 7c9e4129981f..56ac6942408a 100644 --- a/numpy/core/numeric.py +++ b/numpy/core/numeric.py @@ -1934,6 +1934,10 @@ def fromfunction(function, shape, **kwargs): return function(*args, **kwargs) +def _frombuffer(buf, dtype, shape, order): + return frombuffer(buf, dtype=dtype).reshape(shape, order=order) + + def isscalar(num): """ Returns True if the type of `num` is a scalar type. 
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index cdbd0d6ae46c..23b0bfd2461c 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -1619,6 +1619,8 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
 
        Notice because Python does not describe a mechanism to write
        raw data to the pickle, this performs a copy to a string first
+       This issue is now addressed in protocol 5, where a buffer is serialized
+       instead of a string.
     */
 
     state = PyTuple_New(5);
@@ -1651,6 +1653,167 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
     return ret;
 }
 
+/*
+ * Implements ``ndarray.__reduce_ex__(protocol)``.
+ *
+ * For protocol 5, an exact ndarray (not a subclass) whose data is C- or
+ * F-contiguous and whose dtype neither has NPY_ITEM_HASOBJECT set nor is
+ * unsized is reduced to
+ *
+ *     (numpy.core.numeric._frombuffer, (PickleBuffer, dtype, shape, order))
+ *
+ * so that the pickler can reference the array data through a PickleBuffer
+ * instead of copying it to a bytes object first.  Protocols below 5 and
+ * arrays that do not satisfy the conditions above delegate to
+ * ``self.__reduce__()``.
+ *
+ * Returns a new reference, or NULL with an exception set:
+ *   - the error raised by PyArg_ParseTuple if `args` is not a single int;
+ *   - ImportError if protocol 5 is requested on Python 3.6/3.7 and the
+ *     pickle5 backport cannot be imported;
+ *   - ValueError if protocol 5 is requested on Python < 3.6, or if a
+ *     protocol greater than 5 is requested for an array that qualifies for
+ *     the PickleBuffer path.
+ */
+static PyObject *
+array_reduce_ex(PyArrayObject *self, PyObject *args)
+{
+    int protocol;
+    PyObject *ret = NULL, *numeric_mod = NULL, *from_buffer_func = NULL;
+    PyObject *pickle_module = NULL, *pickle_class = NULL;
+    PyObject *subclass_array_reduce = NULL;
+    PyObject *buffer = NULL, *buffer_source = NULL;
+    PyObject *shape = NULL, *order_str = NULL;
+    PyArray_Descr *descr = NULL;
+    char order;
+
+    if (!PyArg_ParseTuple(args, "i", &protocol)) {
+        return NULL;
+    }
+
+    descr = PyArray_DESCR(self);
+    if ((protocol < 5) ||
+        (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*)self) &&
+         !PyArray_IS_F_CONTIGUOUS((PyArrayObject*)self)) ||
+        PyDataType_FLAGCHK(descr, NPY_ITEM_HASOBJECT) ||
+        (PyType_IsSubtype(((PyObject*)self)->ob_type, &PyArray_Type) &&
+         ((PyObject*)self)->ob_type != &PyArray_Type) ||
+        PyDataType_ISUNSIZED(descr)) {
+        /* The PickleBuffer class from version 5 of the pickle protocol
+         * can only be used for arrays backed by a contiguous data buffer.
+         * For all other cases we fallback to the generic array_reduce
+         * method that involves using a temporary bytes allocation. However
+         * we do not call array_reduce directly but instead lookup and call
+         * the __reduce__ method to make sure that it's possible to customize
+         * pickling in sub-classes. */
+        subclass_array_reduce = PyObject_GetAttrString((PyObject *)self,
+                                                       "__reduce__");
+        if (subclass_array_reduce == NULL) {
+            return NULL;
+        }
+        ret = PyObject_CallObject(subclass_array_reduce, NULL);
+        Py_DECREF(subclass_array_reduce);
+        return ret;
+    }
+
+    if (protocol != 5) {
+        /* only protocols above 5 can reach this point */
+        PyErr_Format(PyExc_ValueError,
+                     "cannot call __reduce_ex__ with protocol > %d",
+                     5);
+        return NULL;
+    }
+
+    /* if the python version is below 3.8, the pickle module does not provide
+     * built-in support for protocol 5. We try importing the pickle5
+     * backport instead */
+#if PY_VERSION_HEX >= 0x03080000
+    pickle_module = PyImport_ImportModule("pickle");
+    if (pickle_module == NULL) {
+        return NULL;
+    }
+#elif PY_VERSION_HEX < 0x03080000 && PY_VERSION_HEX >= 0x03060000
+    pickle_module = PyImport_ImportModule("pickle5");
+    if (pickle_module == NULL) {
+        /* for protocol 5, raise a clear ImportError if pickle5 is not found
+         */
+        PyErr_SetString(PyExc_ImportError, "Using pickle protocol 5 "
+                "requires the pickle5 module for python versions >=3.6 "
+                "and <3.8");
+        return NULL;
+    }
+#else
+    PyErr_SetString(PyExc_ValueError, "pickle protocol 5 is not available "
+                                      "for python versions < 3.6");
+    return NULL;
+#endif
+
+    pickle_class = PyObject_GetAttrString(pickle_module, "PickleBuffer");
+    /* the module reference is only needed for the attribute lookup */
+    Py_DECREF(pickle_module);
+    if (pickle_class == NULL) {
+        return NULL;
+    }
+
+    if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*)self) &&
+         PyArray_IS_F_CONTIGUOUS((PyArrayObject*)self)) {
+        /* if the array is Fortran-contiguous and not C-contiguous,
+         * the PickleBuffer instance will hold a view on the transpose
+         * of the initial array, that is C-contiguous. */
+        order = 'F';
+        buffer_source = PyArray_Transpose((PyArrayObject*)self, NULL);
+        if (buffer_source == NULL) {
+            goto finish;
+        }
+    }
+    else {
+        order = 'C';
+        buffer_source = (PyObject *)self;
+        Py_INCREF(buffer_source);
+    }
+
+    /* PickleBuffer(buffer_source) */
+    buffer = PyObject_CallFunctionObjArgs(pickle_class, buffer_source, NULL);
+    if (buffer == NULL) {
+        goto finish;
+    }
+
+    numeric_mod = PyImport_ImportModule("numpy.core.numeric");
+    if (numeric_mod == NULL) {
+        goto finish;
+    }
+    from_buffer_func = PyObject_GetAttrString(numeric_mod, "_frombuffer");
+    Py_DECREF(numeric_mod);
+    if (from_buffer_func == NULL) {
+        goto finish;
+    }
+
+    shape = PyObject_GetAttrString((PyObject *)self, "shape");
+    if (shape == NULL) {
+        goto finish;
+    }
+    order_str = PyUnicode_FromStringAndSize(&order, (Py_ssize_t)1);
+    if (order_str == NULL) {
+        goto finish;
+    }
+
+    /* The "O" format unit takes its own reference to every argument, so
+     * the temporaries are released unconditionally below, on success as
+     * well as on failure.  descr is borrowed from self: it is neither
+     * increfed nor released here. */
+    ret = Py_BuildValue("O(OOOO)", from_buffer_func, buffer,
+                        (PyObject *)descr, shape, order_str);
+
+finish:
+    Py_XDECREF(pickle_class);
+    Py_XDECREF(buffer_source);
+    Py_XDECREF(buffer);
+    Py_XDECREF(from_buffer_func);
+    Py_XDECREF(shape);
+    Py_XDECREF(order_str);
+    return ret;
+}
+
 static PyObject *
 array_setstate(PyArrayObject *self, PyObject *args)
 {
@@ -2524,6 +2687,9 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
     {"__reduce__",
         (PyCFunction) array_reduce,
         METH_VARARGS, NULL},
+    {"__reduce_ex__",
+        (PyCFunction) array_reduce_ex,
+        METH_VARARGS, NULL},
     {"__setstate__",
         (PyCFunction) array_setstate,
         METH_VARARGS, NULL},
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 837aab23ef5f..8cd0f4d920ba 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1382,6 +1382,21 @@ def test_pickle(self):
 
             assert_equal(zs.dtype, zs2.dtype)
 
+    @pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5,
+                        reason="requires pickle protocol 5")
+    def test_pickle_with_buffercallback(self):
+        array =
np.arange(10)
+        buffers = []
+        bytes_string = pickle.dumps(array, buffer_callback=buffers.append,
+                                    protocol=5)
+        array_from_buffer = pickle.loads(bytes_string, buffers=buffers)
+        # when using pickle protocol 5 with buffer callbacks,
+        # array_from_buffer is reconstructed from a buffer holding a view
+        # to the initial array's data, so modifying an element in array
+        # should modify it in array_from_buffer too.
+        array[0] = -1
+        assert array_from_buffer[0] == -1, array_from_buffer[0]
+
 
 class TestMethods(object):
     def test_compress(self):
@@ -3562,6 +3577,78 @@ def test_highest_available_pickle_protocol(self):
         else:
             assert pickle.HIGHEST_PROTOCOL < 5
 
+    @pytest.mark.skipif(pickle.HIGHEST_PROTOCOL >= 5,
+                        reason=('this tests the error messages when trying '
+                                'to use protocol 5 although it is not '
+                                'available'))
+    def test_correct_protocol5_error_message(self):
+        array = np.arange(10)
+
+        if sys.version_info[:2] in ((3, 6), (3, 7)):
+            # For the specific case of python3.6 and 3.7, raise a clear import
+            # error about the pickle5 backport when trying to use protocol=5
+            # without the pickle5 package
+            with pytest.raises(ImportError):
+                array.__reduce_ex__(5)
+
+        elif sys.version_info[:2] < (3, 6):
+            # when calling __reduce_ex__ explicitly with protocol=5 on
+            # python < 3.6, raise a ValueError saying that protocol 5 is not
+            # available for this python version
+            with pytest.raises(ValueError):
+                array.__reduce_ex__(5)
+
+    def test_record_array_with_object_dtype(self):
+        my_object = object()
+
+        arr_with_object = np.array(
+                [(my_object, 1, 2.0)],
+                dtype=[('a', object), ('b', int), ('c', float)])
+        arr_without_object = np.array(
+                [('xxx', 1, 2.0)],
+                dtype=[('a', str), ('b', int), ('c', float)])
+
+        for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
+            depickled_arr_with_object = pickle.loads(
+                    pickle.dumps(arr_with_object, protocol=proto))
+            depickled_arr_without_object = pickle.loads(
+                    pickle.dumps(arr_without_object, protocol=proto))
+
+
assert_equal(arr_with_object.dtype, + depickled_arr_with_object.dtype) + assert_equal(arr_without_object.dtype, + depickled_arr_without_object.dtype) + + @pytest.mark.skipif(pickle.HIGHEST_PROTOCOL < 5, + reason="requires pickle protocol 5") + def test_f_contiguous_array(self): + f_contiguous_array = np.array([[1, 2, 3], [4, 5, 6]], order='F') + buffers = [] + + # When using pickle protocol 5, Fortran-contiguous arrays can be + # serialized using out-of-band buffers + bytes_string = pickle.dumps(f_contiguous_array, protocol=5, + buffer_callback=buffers.append) + + assert len(buffers) > 0 + + depickled_f_contiguous_array = pickle.loads(bytes_string, + buffers=buffers) + + assert_equal(f_contiguous_array, depickled_f_contiguous_array) + + def test_non_contiguous_array(self): + non_contiguous_array = np.arange(12).reshape(3, 4)[:, :2] + assert not non_contiguous_array.flags.c_contiguous + assert not non_contiguous_array.flags.f_contiguous + + # make sure non-contiguous arrays can be pickled-depickled + # using any protocol + for proto in range(2, pickle.HIGHEST_PROTOCOL + 1): + depickled_non_contiguous_array = pickle.loads( + pickle.dumps(non_contiguous_array, protocol=proto)) + + assert_equal(non_contiguous_array, depickled_non_contiguous_array) + def test_roundtrip(self): for proto in range(2, pickle.HIGHEST_PROTOCOL + 1): carray = np.array([[2, 9], [7, 0], [3, 8]])