8000 BUG: Fixes for ma.median and nanpercentile. by juliantaylor · Pull Request #8372 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

BUG: Fixes for ma.median and nanpercentile. #8372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 13, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/release/1.12.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,11 @@ The following functions are changed: ``sum``, ``product``,
The previous identity was 1, it is now -1. See entry in `Improvements`_ for
more explanation.

ma.median warns and returns nan when unmasked invalid values are encountered
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Similar to unmasked median the masked median `ma.median` now emits a Runtime
warning and returns `NaN` in slices where an unmasked `NaN` is present.

Greater consistancy in ``assert_almost_equal``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The precision check for scalars has been changed to match that for arrays. It
Expand Down
18 changes: 1 addition & 17 deletions numpy/lib/function_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3994,23 +3994,7 @@ def _median(a, axis=None, out=None, overwrite_input=False):
if np.issubdtype(a.dtype, np.inexact) and sz > 0:
# warn and return nans like mean would
rout = mean(part[indexer], axis=axis, out=out)
part = np.rollaxis(part, axis, part.ndim)
n = np.isnan(part[..., -1])
if rout.ndim == 0:
if n == True:
warnings.warn("Invalid value encountered in median",
RuntimeWarning, stacklevel=3)
if out is not None:
out[...] = a.dtype.type(np.nan)
rout = out
else:
rout = a.dtype.type(np.nan)
elif np.count_nonzero(n.ravel()) > 0:
warnings.warn("Invalid value encountered in median for" +
" %d results" % np.count_nonzero(n.ravel()),
RuntimeWarning, stacklevel=3)
rout[n] = np.nan
return rout
return np.lib.utils._median_nancheck(part, rout, axis, out)
else:
# if there are no nans
# Use mean in odd and even case to coerce data type
Expand Down
4 changes: 2 additions & 2 deletions numpy/lib/nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ def _nanmedian(a, axis=None, out=None, overwrite_input=False):
else:
# for small medians use sort + indexing which is still faster than
# apply_along_axis
if a.shape[axis] < 400:
if a.shape[axis] < 1000:
return _nanmedian_small(a, axis, out, overwrite_input)
result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
if out is not None:
Expand Down Expand Up @@ -1134,7 +1134,7 @@ def _nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
See nanpercentile for parameter usage

"""
if axis is None:
if axis is None or a.ndim == 1:
part = a.ravel()
result = _nanpercentile1d(part, q, overwrite_input, interpolation)
else:
Expand Down
48 changes: 35 additions & 13 deletions numpy/lib/tests/test_nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,18 +693,36 @@ def test_extended_axis_invalid(self):
def test_float_special(self):
with suppress_warnings() as sup:
sup.filter(RuntimeWarning)
a = np.array([[np.inf, np.nan], [np.nan, np.nan]])
assert_equal(np.nanmedian(a, axis=0), [np.inf, np.nan])
assert_equal(np.nanmedian(a, axis=1), [np.inf, np.nan])
assert_equal(np.nanmedian(a), np.inf)

# minimum fill value check
a = np.array([[np.nan, np.nan, np.inf], [np.nan, np.nan, np.inf]])
assert_equal(np.nanmedian(a, axis=1), np.inf)

# no mask path
a = np.array([[np.inf, np.inf], [np.inf, np.inf]])
assert_equal(np.nanmedian(a, axis=1), np.inf)
for inf in [np.inf, -np.inf]:
a = np.array([[inf, np.nan], [np.nan, np.nan]])
assert_equal(np.nanmedian(a, axis=0), [inf, np.nan])
assert_equal(np.nanmedian(a, axis=1), [inf, np.nan])
assert_equal(np.nanmedian(a), inf)

# minimum fill value check
a = np.array([[np.nan, np.nan, inf],
[np.nan, np.nan, inf]])
assert_equal(np.nanmedian(a), inf)
assert_equal(np.nanmedian(a, axis=0), [np.nan, np.nan, inf])
assert_equal(np.nanmedian(a, axis=1), inf)

# no mask path
a = np.array([[inf, inf], [inf, inf]])
assert_equal(np.nanmedian(a, axis=1), inf)

for i in range(0, 10):
for j in range(1, 10):
a = np.array([([np.nan] * i) + ([inf] * j)] * 2)
assert_equal(np.nanmedian(a), inf)
assert_equal(np.nanmedian(a, axis=1), inf)
assert_equal(np.nanmedian(a, axis=0),
([np.nan] * i) + [inf] * j)

a = np.array([([np.nan] * i) + ([-inf] * j)] * 2)
assert_equal(np.nanmedian(a), -inf)
assert_equal(np.nanmedian(a, axis=1), -inf)
assert_equal(np.nanmedian(a, axis=0),
([np.nan] * i) + [-inf] * j)


class TestNanFunctions_Percentile(TestCase):
Expand Down Expand Up @@ -805,7 +823,11 @@ def test_empty(self):
assert_(len(w) == 0)

def test_scalar(self):
assert_(np.nanpercentile(0., 100) == 0.)
assert_equal(np.nanpercentile(0., 100), 0.)
a = np.arange(6)
r = np.nanpercentile(a, 50, axis=0)
assert_equal(r, 2.5)
assert_(np.isscalar(r))

def test_extended_axis_invalid(self):
d = np.ones((3, 5, 7, 11))
Expand Down
46 changes: 46 additions & 0 deletions numpy/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from numpy.core.numerictypes import issubclass_, issubsctype, issubdtype
from numpy.core import ndarray, ufunc, asarray
import numpy as np

# getargspec and formatargspec were removed in Python 3.6
from numpy.compat import getargspec, formatargspec
Expand Down Expand Up @@ -1119,4 +1120,49 @@ def safe_eval(source):
import ast

return ast.literal_eval(source)


def _median_nancheck(data, result, axis, out):
"""
Utility function to check median result from data for NaN values at the end
and return NaN in that case. Input result can also be a MaskedArray.

Parameters
----------
data : array
Input data to median function
result : Array or MaskedArray
Result of median function
axis : {int, sequence of int, None}, optional
Axis or axes along which the median was computed.
out : ndarray, optional
Output array in which to place the result.
Returns
-------
median : scalar or ndarray
Median or NaN in axes which contained NaN in the input.
"""
if data.size == 0:
return result
data = np.rollaxis(data, axis, data.ndim)
n = np.isnan(data[..., -1])
# masked NaN values are ok
if np.ma.isMaskedArray(n):
n = n.filled(False)
if result.ndim == 0:
if n == True:
warnings.warn("Invalid value encountered in median",
RuntimeWarning, stacklevel=3)
if out is not None:
out[...] = data.dtype.type(np.nan)
result = out
else:
result = data.dtype.type(np.nan)
elif np.count_nonzero(n.ravel()) > 0:
warnings.warn("Invalid value encountered in median for" +
" %d results" % np.count_nonzero(n.ravel()),
RuntimeWarning, stacklevel=3)
result[n] = np.nan
return result

#-----------------------------------------------------------------------------
52 changes: 42 additions & 10 deletions numpy/ma/extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,24 +699,45 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
return r

def _median(a, axis=None, out=None, overwrite_input=False):
# when an unmasked NaN is present return it, so we need to sort the NaN
# values behind the mask
if np.issubdtype(a.dtype, np.inexact):
fill_value = np.inf
else:
fill_value = None
if overwrite_input:
if axis is None:
asorted = a.ravel()
asorted.sort()
asorted.sort(fill_value=fill_value)
else:
a.sort(axis=axis)
a.sort(axis=axis, fill_value=fill_value)
asorted = a
else:
asorted = sort(a, axis=axis)
asorted = sort(a, axis=axis, fill_value=fill_value)

if axis is None:
axis = 0
elif axis < 0:
axis += asorted.ndim

if asorted.ndim == 1:
counts = count(asorted)
idx, odd = divmod(count(asorted), 2)
return asorted[idx + odd - 1 : idx + 1].mean(out=out)
mid = asorted[idx + odd - 1 : idx + 1]
if np.issubdtype(asorted.dtype, np.inexact) and asorted.size > 0:
# avoid inf / x = masked
s = mid.sum(out=out)
np.true_divide(s, 2., casting='unsafe')
s = np.lib.utils._median_nancheck(asorted, s, axis, out)
else:
s = mid.mean(out=out)

# if result is masked either the input contained enough
# minimum_fill_value so that it would be the median or all values
# masked
if np.ma.is_masked(s) and not np.all(asorted.mask):
return np.ma.minimum_fill_value(asorted)
return s

counts = count(asorted, axis=axis)
h = counts // 2
Expand All @@ -727,24 +748,35 @@ def _median(a, axis=None, out=None, overwrite_input=False):
ind = np.meshgrid(*axes_grid, sparse=True, indexing='ij')

# insert indices of low and high median
ind.insert(axis, np.maximum(0, h - 1))
ind.insert(axis, h - 1)
low = asorted[tuple(ind)]
ind[axis] = h
ind[axis] = np.minimum(h, asorted.shape[axis] - 1)
high = asorted[tuple(ind)]

# duplicate high if odd number of elements so mean does nothing
odd = counts % 2 == 1
if asorted.ndim > 1:
np.copyto(low, high, where=odd)
elif odd:
low = high
np.copyto(low, high, where=odd)
# not necessary for scalar True/False masks
try:
np.copyto(low.mask, high.mask, where=odd)
except:
pass

if np.issubdtype(asorted.dtype, np.inexact):
# avoid inf / x = masked
s = np.ma.sum([low, high], axis=0, out=out)
np.true_divide(s.data, 2., casting='unsafe', out=s.data)

s = np.lib.utils._median_nancheck(asorted, s, axis, out)
else:
s = np.ma.mean([low, high], axis=0, out=out)

# if result is masked either the input contained enough minimum_fill_value
# so that it would be the median or all values masked
if np.ma.is_masked(s):
rep = (~np.all(asorted.mask, axis=axis)) & s.mask
s.data[rep] = np.ma.minimum_fill_value(asorted)
s.mask[rep] = False
return s


Expand Down
Loading
0