8000 BUG MaskedArray __eq__ wrong for masked scalar, multi-d recarray by mhvk · Pull Request #8590 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

BUG MaskedArray __eq__ wrong for masked scalar, multi-d recarray #8590

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/release/1.13.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,15 @@ Better default repr for ``ndarray`` subclasses
Subclasses of ndarray with no ``repr`` specialization now correctly indent
their data and type lines.

More reliable comparisons of masked arrays
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Comparisons of masked arrays were buggy for masked scalars and failed for
structured arrays with dimension higher than one. Both problems are now
solved. In the process, it was ensured that in getting the result for a
structured array, masked fields are properly ignored, i.e., the result is equal
if all fields that are non-masked in both are equal, thus making the behaviour
identical to what one gets by comparing an unstructured masked array and then
doing ``.all()`` over some axis.

Changes
=======
Expand Down
159 changes: 77 additions & 82 deletions numpy/ma/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from __future__ import division, absolute_import, print_function

import sys
import operator
import warnings
from functools import reduce

Expand Down Expand Up @@ -1602,21 +1603,11 @@ def make_mask(m, copy=False, shrink=True, dtype=MaskType):
"""
if m is nomask:
return nomask
elif isinstance(m, ndarray):
# We won't return after this point to make sure we can shrink the mask
# Fill the mask in case there are missing data
m = filled(m, True)
# Make sure the input dtype is valid
dtype = make_mask_descr(dtype)
if m.dtype == dtype:
if copy:
result = m.copy()
else:
result = m
else:
result = np.array(m, dtype=dtype, copy=copy)
else:
result = np.array(filled(m, True), dtype=MaskType)

# Make sure the input dtype is valid.
dtype = make_mask_descr(dtype)
# Fill the mask in case there are missing data; turn it into an ndarray.
result = np.array(filled(m, True), copy=copy, dtype=dtype, subok=True)
# Bas les masques !
if shrink and (not result.dtype.names) and (not result.any()):
return nomask
Expand Down Expand Up @@ -1733,7 +1724,8 @@ def _recursive_mask_or(m1, m2, newmask):
if (dtype1 != dtype2):
raise ValueError("Incompatible dtypes '%s'<>'%s'" % (dtype1, dtype2))
if dtype1.names:
newmask = np.empty_like(m1)
# Allocate an output mask array with the properly broadcast shape.
newmask = np.empty(np.broadcast(m1, m2).shape, dtype1)
_recursive_mask_or(m1, m2, newmask)
return newmask
return make_mask(umath.logical_or(m1, m2), copy=copy, shrink=shrink)
Expand Down Expand Up @@ -3873,81 +3865,84 @@ def _delegate_binop(self, other):
return True
return False

def __eq__(self, other):
"""
Check whether other equals self elementwise.
def _comparison(self, other, compare):
"""Compare self with other using operator.eq or operator.ne.

When either of the elements is masked, the result is masked as well,
but the underlying boolean data are still set, with self and other
considered equal if both are masked, and unequal otherwise.

For structured arrays, all fields are combined, with masked values
ignored. The result is masked if all fields were masked, with self
and other considered equal only if both were fully masked.
"""
if self is masked:
return masked
omask = getmask(other)
if omask is nomask:
check = self.filled(0).__eq__(other)
try:
check = check.view(type(self))
check._mask = self._mask
except AttributeError:
# Dang, we have a bool instead of an array: return the bool
return check
smask = self.mask
mask = mask_or(smask, omask, copy=True)

odata = getdata(other)
if mask.dtype.names:
# For possibly masked structured arrays we need to be careful,
# since the standard structured array comparison will use all
# fields, masked or not. To avoid masked fields influencing the
# outcome, we set all masked fields in self to other, so they'll
# count as equal. To prepare, we ensure we have the right shape.
broadcast_shape = np.broadcast(self, odata).shape
sbroadcast = np.broadcast_to(self, broadcast_shape, subok=True)
sbroadcast._mask = mask
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need broadcasting too? Perhaps we should just broadcast mask at the top, since we do so later anyway

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, filled does the broadcasting correctly. I went back and forth about putting the broadcasting at the top, but decided against it, since usually it will not be needed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess my point is more whether it's even valid to have a mask that doesn't match the data shape, which I don't believe is guaranteed here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, no, it isn't, but filled does broadcast correctly (and the mask is only used for that purpose; ideally, the filled function would just take a mask argument, using self.mask by default; but that's for yet another PR...)

sdata = sbroadcast.filled(odata)
# Now take care of the mask; the merged mask should have an item
# masked if all fields were masked (in one and/or other).
mask = (mask == np.ones((), mask.dtype))

else:
odata = filled(other, 0)
check = self.filled(0).__eq__(odata).view(type(self))
if self._mask is nomask:
check._mask = omask
else:
mask = mask_or(self._mask, omask)
if mask.dtype.names:
if mask.size > 1:
axis = 1
else:
axis = None
try:
mask = mask.view((bool_, len(self.dtype))).all(axis)
except (ValueError, np.AxisError):
# TODO: what error are we trying to catch here?
# invalid axis, or invalid view?
mask = np.all([[f[n].all() for n in mask.dtype.names]
for f in mask], axis=axis)
check._mask = mask
# For regular arrays, just use the data as they come.
sdata = self.data

check = compare(sdata, odata)

if isinstance(check, (np.bool_, bool)):
return masked if mask else check

if mask is not nomask:
# Adjust elements that were masked, which should be treated
# as equal if masked in both, unequal if masked in one.
# Note that this works automatically for structured arrays too.
check = np.where(mask, compare(smask, omask), check)
if mask.shape != check.shape:
# Guarantee consistency of the shape, making a copy since the
# the mask may need to get written to later.
mask = np.broadcast_to(mask, check.shape).copy()

check = check.view(type(self))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Went back to view(type(self)) here, since python will already have ensured that, if other is a subclass of self, that this would be called as other.__eq__(self).

check._mask = mask
return check

def __ne__(self, other):
def __eq__(self, other):
"""Check whether other equals self elementwise.

When either of the elements is masked, the result is masked as well,
but the underlying boolean data are still set, with self and other
considered equal if both are masked, and unequal otherwise.

For structured arrays, all fields are combined, with masked values
ignored. The result is masked if all fields were masked, with self
and other considered equal only if both were fully masked.
"""
Check whether other doesn't equal self elementwise
return self._comparison(other, operator.eq)

def __ne__(self, other):
"""Check whether other does not equal self elementwise.

When either of the elements is masked, the result is masked as well,
but the underlying boolean data are still set, with self and other
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any tests of this underlying bool data?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, note how the tests all do something like test = (?? != ??) and then test both the values and the mask.

considered equal if both are masked, and unequal otherwise.

For structured arrays, all fields are combined, with masked values
ignored. The result is masked if all fields were masked, with self
and other considered equal only if both were fully masked.
"""
if self is masked:
return masked
omask = getmask(other)
if omask is nomask:
check = self.filled(0).__ne__(other)
try:
check = check.view(type(self))
check._mask = self._mask
except AttributeError:
# In case check is a boolean (or a numpy.bool)
return check
else:
odata = filled(other, 0)
check = self.filled(0).__ne__(odata).view(type(self))
if self._mask is nomask:
check._mask = omask
else:
mask = mask_or(self._mask, omask)
if mask.dtype.names:
if mask.size > 1:
axis = 1
else:
axis = None
try:
mask = mask.view((bool_, len(self.dtype))).all(axis)
except (ValueError, np.AxisError):
# TODO: what error are we trying to catch here?
# invalid axis, or invalid view?
mask = np.all([[f[n].all() for n in mask.dtype.names]
for f in mask], axis=axis)
check._mask = mask
return check
return self._comparison(other, operator.ne)

def __add__(self, other):
"""
Expand Down
103 changes: 95 additions & 8 deletions numpy/ma/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1335,32 +1335,96 @@ def test_eq_on_structured(self):
ndtype = [('A', int), ('B', int)]
a = array([(1, 1), (2, 2)], mask=[(0, 1), (0, 0)], dtype=ndtype)
test = (a == a)
assert_equal(test, [True, True])
assert_equal(test.data, [True, True])
assert_equal(test.mask, [False, False])
test = (a == a[0])
assert_equal(test.data, [True, False])
assert_equal(test.mask, [False, False])
b = array([(1, 1), (2, 2)], mask=[(1, 0), (0, 0)], dtype=ndtype)
test = (a == b)
assert_equal(test, [False, True])
assert_equal(test.data, [False, True])
assert_equal(test.mask, [True, False])
test = (a[0] == b)
assert_equal(test.data, [False, False])
assert_equal(test.mask, [True, False])
b = array([(1, 1), (2, 2)], mask=[(0, 1), (1, 0)], dtype=ndtype)
test = (a == b)
assert_equal(test, [True, False])
assert_equal(test.data, [True, True])
assert_equal(test.mask, [False, False])
# complicated dtype, 2-dimensional array.
ndtype = [('A', int), ('B', [('BA', int), ('BB', int)])]
a = array([[(1, (1, 1)), (2, (2, 2))],
[(3, (3, 3)), (4, (4, 4))]],
mask=[[(0, (1, 0)), (0, (0, 1))],
[(1, (0, 0)), (1, (1, 1))]], dtype=ndtype)
test = (a[0, 0] == a)
assert_equal(test.data, [[True, False], [False, False]])
assert_equal(test.mask, [[False, False], [False, True]])

def test_ne_on_structured(self):
# Test the equality of structured arrays
ndtype = [('A', int), ('B', int)]
a = array([(1, 1), (2, 2)], mask=[(0, 1), (0, 0)], dtype=ndtype)
test = (a != a)
assert_equal(test, [False, False])
assert_equal(test.data, [False, False])
assert_equal(test.mask, [False, False])
test = (a != a[0])
assert_equal(test.data, [False, True])
assert_equal(test.mask, [False, False])
b = array([(1, 1), (2, 2)], mask=[(1, 0), (0, 0)], dtype=ndtype)
test = (a != b)
assert_equal(test, [True, False])
assert_equal(test.data, [True, False])
assert_equal(test.mask, [True, False])
test = (a[0] != b)
assert_equal(test.data, [True, True])
assert_equal(test.mask, [True, False])
b = array([(1, 1), (2, 2)], mask=[(0, 1), (1, 0)], dtype=ndtype)
test = (a != b)
assert_equal(test, [False, True])
assert_equal(test.data, [False, False])
assert_equal(test.mask, [False, False])
# complicated dtype, 2-dimensional array.
ndtype = [('A', int), ('B', [('BA', int), ('BB', int)])]
a = array([[(1, (1, 1)), (2, (2, 2))],
[(3, (3, 3)), (4, (4, 4))]],
mask=[[(0, (1, 0)), (0, (0, 1))],
[(1, (0, 0)), (1, (1, 1))]], dtype=ndtype)
test = (a[0, 0] != a)
assert_equal(test.data, [[False, True], [True, True]])
assert_equal(test.mask, [[False, False], [False, True]])

def test_eq_ne_structured_extra(self):
# ensure simple examples are symmetric and make sense.
# from https://github.com/numpy/numpy/pull/8590#discussion_r101126465
dt = np.dtype('i4,i4')
for m1 in (mvoid((1, 2), mask=(0, 0), dtype=dt),
mvoid((1, 2), mask=(0, 1), dtype=dt),
mvoid((1, 2), mask=(1, 0), dtype=dt),
mvoid((1, 2), mask=(1, 1), dtype=dt)):
ma1 = m1.view(MaskedArray)
r1 = ma1.view('2i4')
for m2 in (np.array((1, 1), dtype=dt),
mvoid((1, 1), dtype=dt),
mvoid((1, 0), mask=(0, 1), dtype=dt),
mvoid((3, 2), mask=(0, 1), dtype=dt)):
ma2 = m2.view(MaskedArray)
r2 = ma2.view('2i4')
eq_expected = (r1 == r2).all()
assert_equal(m1 == m2, eq_expected)
assert_equal(m2 == m1, eq_expected)
assert_equal(ma1 == m2, eq_expected)
assert_equal(m1 == ma2, eq_expected)
assert_equal(ma1 == ma2, eq_expected)
# Also check it is the same if we do it element by element.
el_by_el = [m1[name] == m2[name] for name in dt.names]
assert_equal(array(el_by_el, dtype=bool).all(), eq_expected)
ne_expected = (r1 != r2).any()
assert_equal(m1 != m2, ne_expected)
assert_equal(m2 != m1, ne_expected)
assert_equal(ma1 != m2, ne_expected)
assert_equal(m1 != ma2, ne_expected)
assert_equal(ma1 != ma2, ne_expected)
el_by_el = [m1[name] != m2[name] for name in dt.names]
assert_equal(array(el_by_el, dtype=bool).any(), ne_expected)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any tests of equality against normal (non-masked) arrays?


def test_eq_with_None(self):
# Really, comparisons with None should not be done, but check them
Expand Down Expand Up @@ -1393,6 +1457,22 @@ def test_eq_with_scalar(self):
assert_equal(a == 0, False)
assert_equal(a != 1, False)
assert_equal(a != 0, True)
b = array(1, mask=True)
assert_equal(b == 0, masked)
assert_equal(b == 1, masked)
assert_equal(b != 0, masked)
assert_equal(b != 1, masked)

def test_eq_different_dimensions(self):
m1 = array([1, 1], mask=[0, 1])
# test comparison with both masked and regular arrays.
for m2 in (array([[0, 1], [1, 2]]),
np.array([[0, 1], [1, 2]])):
test = (m1 == m2)
assert_equal(test.data, [[False, False],
[True, False]])
assert_equal(test.mask, [[False, True],
[False, True]])

def test_numpyarithmetics(self):
# Check that the mask is not back-propagated when using numpy functions
Expand Down Expand Up @@ -3978,7 +4058,15 @@ def test_make_mask(self):
test = make_mask(mask, dtype=mask.dtype)
assert_equal(test.dtype, bdtype)
assert_equal(test, np.array([(0, 0), (0, 1)], dtype=bdtype))

# Ensure this also works for void
mask = np.array((False, True), dtype='?,?')[()]
assert_(isinstance(mask, np.void))
test = make_mask(mask, dtype=mask.dtype)
assert_equal(test, mask)
assert_(test is not mask)
mask = np.array((0, 1), dtype='i4,i4')[()]
test2 = make_mask(mask, dtype=mask.dtype)
assert_equal(test2, test)
# test that nomask is returned when m is nomask.
bools = [True, False]
dtypes = [MaskType, np.float]
Expand All @@ -3987,7 +4075,6 @@ def test_make_mask(self):
res = make_mask(nomask, copy=cpy, shrink=shr, dtype=dt)
assert_(res is nomask, msgformat % (cpy, shr, dt))


def test_mask_or(self):
# Initialize
mtype = [('a', np.bool), ('b', np.bool)]
Expand Down
0