8000 ENH: structured datatype safety checks by ahaldane · Pull Request #5548 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: structured datatype safety checks #5548

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 5, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
ENH: structured datatype safety checks
Previously views of structured arrays containing objects were completely
disabled.  This commit adds more lenient check for whether an object-array
view is allowed, and adds similar checks to getfield/setfield

Fixes #2346. Fixes #3256. Fixes #2599. Fixes #3253. Fixes #3286.
Fixes #5762.
  • Loading branch information
ahaldane committed Jun 5, 2015
commit e2cd6fa869cec6d92062fb687d8e6952c1202017
168 changes: 168 additions & 0 deletions numpy/core/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,174 @@ def _index_fields(ary, fields):
copy_dtype = {'names':view_dtype['names'], 'formats':view_dtype['formats']}
return array(view, dtype=copy_dtype, copy=True)

def _get_all_field_offsets(dtype, base_offset=0):
""" Returns the types and offsets of all fields in a (possibly structured)
data type, including nested fields and subarrays.

Parameters
----------
dtype : data-type
Data type to extract fields from.
base_offset : int, optional
Additional offset to add to all field offsets.

Returns
-------
fields : list of (data-type, int) pairs
A flat list of (dtype, byte offset) pairs.

"""
fields = []
if dtype.fields is not None:
for name in dtype.names:
sub_dtype = dtype.fields[name][0]
sub_offset = dtype.fields[name][1] + base_offset
fields.extend(_get_all_field_offsets(sub_dtype, sub_offset))
else:
if dtype.shape:
sub_offsets = _get_all_field_offsets(dtype.base, base_offset)
count = 1
for dim in dtype.shape:
count *= dim
fields.extend((typ, off + dtype.base.itemsize*j)
for j in range(count) for (typ, off) in sub_offsets)
else:
fields.append((dtype, base_offset))
return fields

def _check_field_overlap(new_fields, old_fields):
""" Perform object memory overlap tests for two data-types (see
_view_is_safe).

This function checks that new fields only access memory contained in old
fields, and that non-object fields are not interpreted as objects and vice
versa.

Parameters
----------
new_fields : list of (data-type, int) pairs
Flat list of (dtype, byte offset) pairs for the new data type, as
returned by _get_all_field_offsets.
old_fields: list of (data-type, int) pairs
Flat list of (dtype, byte offset) pairs for the old data type, as
returned by _get_all_field_offsets.

Raises
------
TypeError
If the new fields are incompatible with the old fields

"""
from .numerictypes import object_
from .multiarray import dtype

#first go byte by byte and check we do not access bytes not in old_fields
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are smarter ways of going about this check, but we probably shouldn't let micro-optimizations distract us from the larger goal.

new_bytes = set()
for tp, off in new_fields:
new_bytes.update(set(range(off, off+tp.itemsize)))
old_bytes = set()
for tp, off in old_fields:
old_bytes.update(set(range(off, off+tp.itemsize)))
if new_bytes.difference(old_bytes):
raise TypeError("view would access data parent array doesn't own")

#next check that we do not interpret non-Objects as Objects, and vv
obj_offsets = [off for (tp, off) in old_fields if tp.type is object_]
obj_size = dtype(object_).itemsize

for fld_dtype, fld_offset in new_fields:
if fld_dtype.type is object_:
# check we do not create object views where
# there are no objects.
if fld_offset not in obj_offsets:
raise TypeError("cannot view non-Object data as Object type")
else:
# next check we do not create non-object views
# where there are already objects.
# see validate_object_field_overlap for a similar computation.
for obj_offset in obj_offsets:
if (fld_offset < obj_offset + obj_size and
obj_offset < fld_offset + fld_dtype.itemsize):
raise TypeError("cannot view Object as non-Object type")

def _getfield_is_safe(oldtype, newtype, offset):
""" Checks safety of getfield for object arrays.

As in _view_is_safe, we need to check that memory containing objects is not
reinterpreted as a non-object datatype and vice versa.

Parameters
----------
oldtype : data-type
Data type of the original ndarray.
newtype : data-type
Data type of the field being accessed by ndarray.getfield
offset : int
Offset of the field being accessed by ndarray.getfield

Raises
------
TypeError
If the field access is invalid

"""
new_fields = _get_all_field_offsets(newtype, offset)
old_fields = _get_all_field_offsets(oldtype)
# raises if there is a problem
_check_field_overlap(new_fields, old_fields)

def _view_is_safe(oldtype, newtype):
""" Checks safety of a view involving object arrays, for example when
doing::

np.zeros(10, dtype=oldtype).view(newtype)

We need to check that
1) No memory that is not an object will be interpreted as a object,
2) No memory containing an object will be interpreted as an arbitrary type.
Both cases can cause segfaults, eg in the case the view is written to.
Strategy here is to also disallow views where newtype has any field in a
place oldtype doesn't.

Parameters
----------
oldtype : data-type
Data type of original ndarray
newtype : data-type
Data type of the view

Raises
------
TypeError
If the new type is incompatible with the old type.

"""
new_fields = _get_all_field_offsets(newtype)
new_size = newtype.itemsize

old_fields = _get_all_field_offsets(oldtype)
old_size = oldtype.itemsize

# if the itemsizes are not equal, we need to check that all the
# 'tiled positions' of the object match up. Here, we allow
# for arbirary itemsizes (even those possibly disallowed
# due to stride/data length issues).
if old_size == new_size:
new_num = old_num = 1
else:
gcd_new_old = _gcd(new_size, old_size)
new_num = old_size // gcd_new_old
old_num = new_size // gcd_new_old

# get position of fields within the tiling
new_fieldtile = [(tp, off + new_size*j)
for j in range(new_num) for (tp, off) in new_fields]
old_fieldtile = [(tp, off + old_size*j)
for j in range(old_num) for (tp, off) in old_fields]

# raises if there is a problem
_check_field_overlap(new_fieldtile, old_fieldtile)

# Given a string containing a PEP 3118 format specifier,
# construct a Numpy dtype

Expand Down
19 changes: 12 additions & 7 deletions numpy/core/src/multiarray/getset.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
#include "numpy/arrayobject.h"

#include "npy_config.h"

#include "npy_pycompat.h"
#include "npy_import.h"

#include "common.h"
#include "scalartypes.h"
Expand Down Expand Up @@ -434,6 +434,13 @@ array_descr_set(PyArrayObject *self, PyObject *arg)
npy_intp newdim;
int i;
char *msg = "new type not compatible with array.";
PyObject *safe;
static PyObject *checkfunc = NULL;

npy_cache_pyfunc("numpy.core._internal", "_view_is_safe", &checkfunc);
if (checkfunc == NULL) {
return -1;
}

if (arg == NULL) {
PyErr_SetString(PyExc_AttributeError,
Expand All @@ -448,15 +455,13 @@ array_descr_set(PyArrayObject *self, PyObject *arg)
return -1;
}

if (PyDataType_FLAGCHK(newtype, NPY_ITEM_HASOBJECT) ||
PyDataType_FLAGCHK(newtype, NPY_ITEM_IS_POINTER) ||
PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_ITEM_HASOBJECT) ||
PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_ITEM_IS_POINTER)) {
PyErr_SetString(PyExc_TypeError,
"Cannot change data-type for object array.");
/* check that we are not reinterpreting memory containing Objects */
safe = PyObject_CallFunction(checkfunc, "OO", PyArray_DESCR(self), newtype);
if (safe == NULL) {
Py_DECREF(newtype);
return -1;
}
Py_DECREF(safe);

if (newtype->elsize == 0) {
/* Allow a void view */
Expand Down
43 changes: 17 additions & 26 deletions numpy/core/src/multiarray/methods.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include "npy_config.h"
#include "npy_pycompat.h"
#include "npy_import.h"
#include "ufunc_override.h"
#include "common.h"
#include "ctors.h"
Expand Down Expand Up @@ -358,15 +359,23 @@ NPY_NO_EXPORT PyObject *
PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset)
{
PyObject *ret = NULL;
PyObject *safe;
static PyObject *checkfunc = NULL;

if (offset < 0 || (offset + typed->elsize) > PyArray_DESCR(self)->elsize) {
PyErr_Format(PyExc_ValueError,
"Need 0 <= offset <= %d for requested type "
"but received offset = %d",
PyArray_DESCR(self)->elsize-typed->elsize, offset);
Py_DECREF(typed);
npy_cache_pyfunc("numpy.core._internal", "_getfield_is_safe", &checkfunc);
if (checkfunc == NULL) {
return NULL;
}

/* check that we are not reinterpreting memory containing Objects */
/* only returns True or raises */
safe = PyObject_CallFunction(checkfunc, "OOi", PyArray_DESCR(self),
typed, offset);
if (safe == NULL) {
return NULL;
}
Py_DECREF(safe);

ret = PyArray_NewFromDescr(Py_TYPE(self),
typed,
PyArray_NDIM(self), PyArray_DIMS(self),
Expand Down Expand Up @@ -417,23 +426,12 @@ PyArray_SetField(PyArrayObject *self, PyArray_Descr *dtype,
PyObject *ret = NULL;
int retval = 0;

if (offset < 0 || (offset + dtype->elsize) > PyArray_DESCR(self)->elsize) {
PyErr_Format(PyExc_ValueError,
"Need 0 <= offset <= %d for requested type "
"but received offset = %d",
PyArray_DESCR(self)->elsize-dtype->elsize, offset);
Py_DECREF(dtype);
return -1;
}
ret = PyArray_NewFromDescr(Py_TYPE(self),
dtype, PyArray_NDIM(self), PyArray_DIMS(self),
PyArray_STRIDES(self), PyArray_BYTES(self) + offset,
PyArray_FLAGS(self), (PyObject *)self);
/* getfield returns a view we can write to */
ret = PyArray_GetField(self, dtype, offset);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting approach. Perhaps it warrants an explanatory comment that the return is a view, not a copy?

if (ret == NULL) {
return -1;
}

PyArray_UpdateFlags((PyArrayObject *)ret, NPY_ARRAY_UPDATE_ALL);
retval = PyArray_CopyObject((PyArrayObject *)ret, val);
Py_DECREF(ret);
return retval;
Expand All @@ -455,13 +453,6 @@ array_setfield(PyArrayObject *self, PyObject *args, PyObject *kwds)
return NULL;
}

if (PyDataType_REFCHK(PyArray_DESCR(self))) {
PyErr_SetString(PyExc_RuntimeError,
"cannot call setfield on an object array");
Py_DECREF(dtype);
return NULL;
}

if (PyArray_SetField(self, dtype, offset, value) < 0) {
return NULL;
}
Expand Down
Loading
0