From 2e79af2ad60763bc02e606d9ef802415c28cad24 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Sun, 11 Dec 2016 20:35:50 +0100 Subject: [PATCH 1/5] BUG: fix nanpercentile not returning scalar with axis argument Closes gh-8220 --- numpy/lib/nanfunctions.py | 2 +- numpy/lib/tests/test_nanfunctions.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py index c024055bae0b..24472dfcd910 100644 --- a/numpy/lib/nanfunctions.py +++ b/numpy/lib/nanfunctions.py @@ -1134,7 +1134,7 @@ def _nanpercentile(a, q, axis=None, out=None, overwrite_input=False, See nanpercentile for parameter usage """ - if axis is None: + if axis is None or a.ndim == 1: part = a.ravel() result = _nanpercentile1d(part, q, overwrite_input, interpolation) else: diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py index 06c0953b5df7..3fde51a73092 100644 --- a/numpy/lib/tests/test_nanfunctions.py +++ b/numpy/lib/tests/test_nanfunctions.py @@ -805,7 +805,11 @@ def test_empty(self): assert_(len(w) == 0) def test_scalar(self): - assert_(np.nanpercentile(0., 100) == 0.) + assert_equal(np.nanpercentile(0., 100), 0.) + a = np.arange(6) + r = np.nanpercentile(a, 50, axis=0) + assert_equal(r, 2.5) + assert_(np.isscalar(r)) def test_extended_axis_invalid(self): d = np.ones((3, 5, 7, 11)) From 52336fcea1136302885112f90bf11aa9eaf602d3 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Tue, 6 Dec 2016 00:37:53 +0100 Subject: [PATCH 2/5] BUG: handle unmasked NaN in ma.median like normal median This requires to base masked median on sort(endwith=False) as we need to distinguish Inf and NaN. Using Inf as filler element of the sort does not work as then the mask is not guaranteed to be at the end. Closes gh-8340 Also fixed 1d ma.median not handling np.inf correctly, the nd variant was ok. --- numpy/lib/function_base.py | 18 +-- numpy/lib/tests/test_nanfunctions.py | 42 ++++-- numpy/lib/utils.py | 46 +++++++ numpy/ma/extras.py | 52 ++++++-- numpy/ma/tests/test_extras.py | 186 +++++++++++++++++++++++++++ 5 files changed, 305 insertions(+), 39 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 029305344edb..c8abb1492c72 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -3994,23 +3994,7 @@ def _median(a, axis=None, out=None, overwrite_input=False): if np.issubdtype(a.dtype, np.inexact) and sz > 0: # warn and return nans like mean would rout = mean(part[indexer], axis=axis, out=out) - part = np.rollaxis(part, axis, part.ndim) - n = np.isnan(part[..., -1]) - if rout.ndim == 0: - if n == True: - warnings.warn("Invalid value encountered in median", - RuntimeWarning, stacklevel=3) - if out is not None: - out[...] = a.dtype.type(np.nan) - rout = out - else: - rout = a.dtype.type(np.nan) - elif np.count_nonzero(n.ravel()) > 0: - warnings.warn("Invalid value encountered in median for" + - " %d results" % np.count_nonzero(n.ravel()), - RuntimeWarning, stacklevel=3) - rout[n] = np.nan - return rout + return np.lib.utils._median_nancheck(part, rout, axis, out) else: # if there are no nans # Use mean in odd and even case to coerce data type diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py index 3fde51a73092..e1942338b9d4 100644 --- a/numpy/lib/tests/test_nanfunctions.py +++ b/numpy/lib/tests/test_nanfunctions.py @@ -693,18 +693,36 @@ def test_extended_axis_invalid(self): def test_float_special(self): with suppress_warnings() as sup: sup.filter(RuntimeWarning) - a = np.array([[np.inf, np.nan], [np.nan, np.nan]]) - assert_equal(np.nanmedian(a, axis=0), [np.inf, np.nan]) - assert_equal(np.nanmedian(a, axis=1), [np.inf, np.nan]) - assert_equal(np.nanmedian(a), np.inf) - - # minimum fill value check - a = np.array([[np.nan, np.nan, np.inf], [np.nan, np.nan, np.inf]]) - assert_equal(np.nanmedian(a, axis=1), np.inf) - - # no mask path - a = np.array([[np.inf, np.inf], [np.inf, np.inf]]) - assert_equal(np.nanmedian(a, axis=1), np.inf) + for inf in [np.inf, -np.inf]: + a = np.array([[inf, np.nan], [np.nan, np.nan]]) + assert_equal(np.nanmedian(a, axis=0), [inf, np.nan]) + assert_equal(np.nanmedian(a, axis=1), [inf, np.nan]) + assert_equal(np.nanmedian(a), inf) + + # minimum fill value check + a = np.array([[np.nan, np.nan, inf], + [np.nan, np.nan, inf]]) + assert_equal(np.nanmedian(a), inf) + assert_equal(np.nanmedian(a, axis=0), [np.nan, np.nan, inf]) + assert_equal(np.nanmedian(a, axis=1), inf) + + # no mask path + a = np.array([[inf, inf], [inf, inf]]) + assert_equal(np.nanmedian(a, axis=1), inf) + + for i in range(0, 10): + for j in range(1, 10): + a = np.array([([np.nan] * i) + ([inf] * j)] * 2) + assert_equal(np.nanmedian(a), inf) + assert_equal(np.nanmedian(a, axis=1), inf) + assert_equal(np.nanmedian(a, axis=0), + ([np.nan] * i) + [inf] * j) + + a = np.array([([np.nan] * i) + ([-inf] * j)] * 2) + assert_equal(np.nanmedian(a), -inf) + assert_equal(np.nanmedian(a, axis=1), -inf) + assert_equal(np.nanmedian(a, axis=0), + ([np.nan] * i) + [-inf] * j) class TestNanFunctions_Percentile(TestCase): diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py index 97b93cace27d..0a8f112563bc 100644 --- a/numpy/lib/utils.py +++ b/numpy/lib/utils.py @@ -8,6 +8,7 @@ from numpy.core.numerictypes import issubclass_, issubsctype, issubdtype from numpy.core import ndarray, ufunc, asarray +import numpy as np # getargspec and formatargspec were removed in Python 3.6 from numpy.compat import getargspec, formatargspec @@ -1119,4 +1120,49 @@ def safe_eval(source): import ast return ast.literal_eval(source) + + +def _median_nancheck(data, result, axis, out): + """ + Utility function to check median result from data for NaN values at the end + and return NaN in that case. Input result can also be a MaskedArray. + + Parameters + ---------- + data : array + Input data to median function + result : Array or MaskedArray + Result of median function + axis : {int, sequence of int, None}, optional + Axis or axes along which the median was computed. + out : ndarray, optional + Output array in which to place the result. + Returns + ------- + median : scalar or ndarray + Median or NaN in axes which contained NaN in the input. + """ + if data.size == 0: + return result + data = np.rollaxis(data, axis, data.ndim) + n = np.isnan(data[..., -1]) + # masked NaN values are ok + if np.ma.isMaskedArray(n): + n = n.filled(False) + if result.ndim == 0: + if n == True: + warnings.warn("Invalid value encountered in median", + RuntimeWarning, stacklevel=3) + if out is not None: + out[...] = data.dtype.type(np.nan) + result = out + else: + result = data.dtype.type(np.nan) + elif np.count_nonzero(n.ravel()) > 0: + warnings.warn("Invalid value encountered in median for" + + " %d results" % np.count_nonzero(n.ravel()), + RuntimeWarning, stacklevel=3) + result[n] = np.nan + return result + #----------------------------------------------------------------------------- diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py index e4ff8ef2d9f5..dadf032e0187 100644 --- a/numpy/ma/extras.py +++ b/numpy/ma/extras.py @@ -699,15 +699,21 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False): return r def _median(a, axis=None, out=None, overwrite_input=False): + # when an unmasked NaN is present return it, so we need to sort the NaN + # values behind the mask + if np.issubdtype(a.dtype, np.inexact): + fill_value = np.inf + else: + fill_value = None if overwrite_input: if axis is None: asorted = a.ravel() - asorted.sort() + asorted.sort(fill_value=fill_value) else: - a.sort(axis=axis) + a.sort(axis=axis, fill_value=fill_value) asorted = a else: - asorted = sort(a, axis=axis) + asorted = sort(a, axis=axis, fill_value=fill_value) if axis is None: axis = 0 @@ -715,8 +721,23 @@ def _median(a, axis=None, out=None, overwrite_input=False): axis += asorted.ndim if asorted.ndim == 1: + counts = count(asorted) idx, odd = divmod(count(asorted), 2) - return asorted[idx + odd - 1 : idx + 1].mean(out=out) + mid = asorted[idx + odd - 1 : idx + 1] + if np.issubdtype(asorted.dtype, np.inexact) and asorted.size > 0: + # avoid inf / x = masked + s = mid.sum(out=out) + np.true_divide(s, 2., casting='unsafe') + s = np.lib.utils._median_nancheck(asorted, s, axis, out) + else: + s = mid.mean(out=out) + + # if result is masked either the input contained enough + # minimum_fill_value so that it would be the median or all values + # masked + if np.ma.is_masked(s) and not np.all(asorted.mask): + return np.ma.minimum_fill_value(asorted) + return s counts = count(asorted, axis=axis) h = counts // 2 @@ -727,24 +748,35 @@ def _median(a, axis=None, out=None, overwrite_input=False): ind = np.meshgrid(*axes_grid, sparse=True, indexing='ij') # insert indices of low and high median - ind.insert(axis, np.maximum(0, h - 1)) + ind.insert(axis, h - 1) low = asorted[tuple(ind)] - ind[axis] = h + ind[axis] = np.minimum(h, asorted.shape[axis] - 1) high = asorted[tuple(ind)] # duplicate high if odd number of elements so mean does nothing odd = counts % 2 == 1 - if asorted.ndim > 1: - np.copyto(low, high, where=odd) - elif odd: - low = high + np.copyto(low, high, where=odd) + # not necessary for scalar True/False masks + try: + np.copyto(low.mask, high.mask, where=odd) + except: + pass if np.issubdtype(asorted.dtype, np.inexact): # avoid inf / x = masked s = np.ma.sum([low, high], axis=0, out=out) np.true_divide(s.data, 2., casting='unsafe', out=s.data) + + s = np.lib.utils._median_nancheck(asorted, s, axis, out) else: s = np.ma.mean([low, high], axis=0, out=out) + + # if result is masked either the input contained enough minimum_fill_value + # so that it would be the median or all values masked + if np.ma.is_masked(s): + rep = (~np.all(asorted.mask, axis=axis)) & s.mask + s.data[rep] = np.ma.minimum_fill_value(asorted) + s.mask[rep] = False return s diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py index 0a6de4ebaeef..faee4f599aa5 100644 --- a/numpy/ma/tests/test_extras.py +++ b/numpy/ma/tests/test_extras.py @@ -659,6 +659,15 @@ def test_pytype(self): r = np.ma.median([[np.inf, np.inf], [np.inf, np.inf]], axis=-1) assert_equal(r, np.inf) + def test_inf(self): + # test that even which computes handles inf / x = masked + r = np.ma.median(np.ma.masked_array([[np.inf, np.inf], + [np.inf, np.inf]]), axis=-1) + assert_equal(r, np.inf) + r = np.ma.median(np.ma.masked_array([[np.inf, np.inf], + [np.inf, np.inf]]), axis=None) + assert_equal(r, np.inf) + def test_non_masked(self): x = np.arange(9) assert_equal(np.ma.median(x), 4.) @@ -799,6 +808,183 @@ def test_single_non_masked_value_on_axis(self): assert_array_equal(np.ma.median(masked_arr, axis=0), expected) + def test_nan(self): + with suppress_warnings() as w: + w.record(RuntimeWarning) + w.filter(DeprecationWarning, message=r"in 3\.x, __getslice__") + for mask in (False, np.zeros(6, dtype=np.bool)): + dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]]) + dm.mask = mask + + # scalar result + r = np.ma.median(dm, axis=None) + assert_(np.isscalar(r)) + assert_array_equal(r, np.nan) + r = np.ma.median(dm.ravel(), axis=0) + assert_(np.isscalar(r)) + assert_array_equal(r, np.nan) + + r = np.ma.median(dm, axis=0) + assert_equal(type(r), MaskedArray) + assert_array_equal(r, [1, np.nan, 3]) + r = np.ma.median(dm, axis=1) + assert_equal(type(r), MaskedArray) + assert_array_equal(r, [np.nan, 2]) + r = np.ma.median(dm, axis=-1) + assert_equal(type(r), MaskedArray) + assert_array_equal(r, [np.nan, 2]) + + dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]]) + dm[:, 2] = np.ma.masked + assert_array_equal(np.ma.median(dm, axis=None), np.nan) + assert_array_equal(np.ma.median(dm, axis=0), [1, np.nan, 3]) + assert_array_equal(np.ma.median(dm, axis=1), [np.nan, 1.5]) + assert_equal([x.category is RuntimeWarning for x in w.log], + [True]*13) + + def test_out_nan(self): + with warnings.catch_warnings(record=True): + warnings.filterwarnings('always', '', RuntimeWarning) + o = np.ma.masked_array(np.zeros((4,))) + d = np.ma.masked_array(np.ones((3, 4))) + d[2, 1] = np.nan + d[2, 2] = np.ma.masked + assert_equal(np.ma.median(d, 0, out=o), o) + o = np.ma.masked_array(np.zeros((3,))) + assert_equal(np.ma.median(d, 1, out=o), o) + o = np.ma.masked_array(np.zeros(())) + assert_equal(np.ma.median(d, out=o), o) + + def test_nan_behavior(self): + a = np.ma.masked_array(np.arange(24, dtype=float)) + a[::3] = np.ma.masked + a[2] = np.nan + with suppress_warnings() as w: + w.record(RuntimeWarning) + w.filter(DeprecationWarning, message=r"in 3\.x, __getslice__") + assert_array_equal(np.ma.median(a), np.nan) + assert_array_equal(np.ma.median(a, axis=0), np.nan) + assert_(w.log[0].category is RuntimeWarning) + assert_(w.log[1].category is RuntimeWarning) + + a = np.ma.masked_array(np.arange(24, dtype=float).reshape(2, 3, 4)) + a.mask = np.arange(a.size) % 2 == 1 + aorig = a.copy() + a[1, 2, 3] = np.nan + a[1, 1, 2] = np.nan + + # no axis + with suppress_warnings() as w: + w.record(RuntimeWarning) + w.filter(DeprecationWarning, message=r"in 3\.x, __getslice__") + warnings.filterwarnings('always', '', RuntimeWarning) + assert_array_equal(np.ma.median(a), np.nan) + assert_(np.isscalar(np.ma.median(a))) + assert_(w.log[0].category is RuntimeWarning) + + # axis0 + b = np.ma.median(aorig, axis=0) + b[2, 3] = np.nan + b[1, 2] = np.nan + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', RuntimeWarning) + assert_equal(np.ma.median(a, 0), b) + assert_equal(len(w), 1) + + # axis1 + b = np.ma.median(aorig, axis=1) + b[1, 3] = np.nan + b[1, 2] = np.nan + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', RuntimeWarning) + assert_equal(np.ma.median(a, 1), b) + assert_equal(len(w), 1) + + # axis02 + b = np.ma.median(aorig, axis=(0, 2)) + b[1] = np.nan + b[2] = np.nan + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', RuntimeWarning) + assert_equal(np.ma.median(a, (0, 2)), b) + assert_equal(len(w), 1) + + def test_ambigous_fill(self): + # 255 is max value, used as filler for sort + a = np.array([[3, 3, 255], [3, 3, 255]], dtype=np.uint8) + a = np.ma.masked_array(a, mask=a == 3) + assert_array_equal(np.ma.median(a, axis=1), 255) + assert_array_equal(np.ma.median(a, axis=1).mask, False) + assert_array_equal(np.ma.median(a, axis=0), a[0]) + assert_array_equal(np.ma.median(a), 255) + + def test_special(self): + for inf in [np.inf, -np.inf]: + a = np.array([[inf, np.nan], [np.nan, np.nan]]) + a = np.ma.masked_array(a, mask=np.isnan(a)) + assert_equal(np.ma.median(a, axis=0), [inf, np.nan]) + assert_equal(np.ma.median(a, axis=1), [inf, np.nan]) + assert_equal(np.ma.median(a), inf) + + a = np.array([[np.nan, np.nan, inf], [np.nan, np.nan, inf]]) + a = np.ma.masked_array(a, mask=np.isnan(a)) + assert_array_equal(np.ma.median(a, axis=1), inf) + assert_array_equal(np.ma.median(a, axis=1).mask, False) + assert_array_equal(np.ma.median(a, axis=0), a[0]) + assert_array_equal(np.ma.median(a), inf) + + # no mask + a = np.array([[inf, inf], [inf, inf]]) + assert_equal(np.ma.median(a), inf) + assert_equal(np.ma.median(a, axis=0), inf) + assert_equal(np.ma.median(a, axis=1), inf) + + for i in range(0, 10): + for j in range(1, 10): + a = np.array([([np.nan] * i) + ([inf] * j)] * 2) + a = np.ma.masked_array(a, mask=np.isnan(a)) + assert_equal(np.ma.median(a), inf) + assert_equal(np.ma.median(a, axis=1), inf) + assert_equal(np.ma.median(a, axis=0), + ([np.nan] * i) + [inf] * j) + + def test_empty(self): + # empty arrays + a = np.ma.masked_array(np.array([], dtype=float)) + with suppress_warnings() as w: + w.record(RuntimeWarning) + w.filter(DeprecationWarning, message=r"in 3\.x, __getslice__") + assert_array_equal(np.ma.median(a), np.nan) + assert_(w.log[0].category is RuntimeWarning) + + # multiple dimensions + a = np.ma.masked_array(np.array([], dtype=float, ndmin=3)) + # no axis + with suppress_warnings() as w: + w.record(RuntimeWarning) + w.filter(DeprecationWarning, message=r"in 3\.x, __getslice__") + warnings.filterwarnings('always', '', RuntimeWarning) + assert_array_equal(np.ma.median(a), np.nan) + assert_(w.log[0].category is RuntimeWarning) + + # axis 0 and 1 + b = np.ma.masked_array(np.array([], dtype=float, ndmin=2)) + assert_equal(np.median(a, axis=0), b) + assert_equal(np.median(a, axis=1), b) + + # axis 2 + b = np.ma.masked_array(np.array(np.nan, dtype=float, ndmin=2)) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', RuntimeWarning) + assert_equal(np.median(a, axis=2), b) + assert_(w[0].category is RuntimeWarning) + + def test_object(self): + o = np.ma.masked_array(np.arange(7.)) + assert_(type(np.ma.median(o.astype(object))), float) + o[2] = np.nan + assert_(type(np.ma.median(o.astype(object))), float) + class TestCov(TestCase): From 0ee6af6e35d9e6eaf9e2cc43200397235ea9b1ce Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Sun, 11 Dec 2016 23:44:13 +0100 Subject: [PATCH 3/5] MAINT: make regex strings raw Python 3.6 gets more strict about escape sequences, \. is invalid. As it could get a syntax error the version check would not work. --- numpy/testing/nosetester.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numpy/testing/nosetester.py b/numpy/testing/nosetester.py index c07d65642987..7c01c928b23d 100644 --- a/numpy/testing/nosetester.py +++ b/numpy/testing/nosetester.py @@ -420,11 +420,11 @@ def test(self, label='fast', verbose=1, extra_argv=None, sup.filter(DeprecationWarning, r"sys\.exc_clear\(\) not supported in 3\.x", module=threading) - sup.filter(DeprecationWarning, message="in 3\.x, __setslice__") - sup.filter(DeprecationWarning, message="in 3\.x, __getslice__") - sup.filter(DeprecationWarning, message="buffer\(\) not supported in 3\.x") - sup.filter(DeprecationWarning, message="CObject type is not supported in 3\.x") - sup.filter(DeprecationWarning, message="comparing unequal types not supported in 3\.x") + sup.filter(DeprecationWarning, message=r"in 3\.x, __setslice__") + sup.filter(DeprecationWarning, message=r"in 3\.x, __getslice__") + sup.filter(DeprecationWarning, message=r"buffer\(\) not supported in 3\.x") + sup.filter(DeprecationWarning, message=r"CObject type is not supported in 3\.x") + sup.filter(DeprecationWarning, message=r"comparing unequal types not supported in 3\.x") # Filter out some deprecation warnings inside nose 1.3.7 when run # on python 3.5b2. See # https://github.com/nose-devs/nose/issues/929 From 1e3ec9fd3b4261c5222b2383589d170e98b7ecc3 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Mon, 12 Dec 2016 23:15:21 +0100 Subject: [PATCH 4/5] ENH: update the small nanmedian threshold The apply_along_axis path is significantly more expensive than currently accounted for in the check. Increase the minimum axis size from 400 to 1000 elements. Either apply_along_axis got more expensive over time or the original benchmarking was flawed. --- numpy/lib/nanfunctions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py index 24472dfcd910..812b9bcb3e80 100644 --- a/numpy/lib/nanfunctions.py +++ b/numpy/lib/nanfunctions.py @@ -869,7 +869,7 @@ def _nanmedian(a, axis=None, out=None, overwrite_input=False): else: # for small medians use sort + indexing which is still faster than # apply_along_axis - if a.shape[axis] < 400: + if a.shape[axis] < 1000: return _nanmedian_small(a, axis, out, overwrite_input) result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input) if out is not None: From 4c634494c56741518d0124c09456916a6caef7f1 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Tue, 13 Dec 2016 00:26:36 +0100 Subject: [PATCH 5/5] DOC: add ma.median nan change to release notes --- doc/release/1.12.0-notes.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/release/1.12.0-notes.rst b/doc/release/1.12.0-notes.rst index 8595a9adcd75..50c472080174 100644 --- a/doc/release/1.12.0-notes.rst +++ b/doc/release/1.12.0-notes.rst @@ -177,6 +177,11 @@ The following functions are changed: ``sum``, ``product``, The previous identity was 1, it is now -1. See entry in `Improvements`_ for more explanation. +ma.median warns and returns nan when unmasked invalid values are encountered +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Similar to unmasked median the masked median `ma.median` now emits a Runtime +warning and returns `NaN` in slices where an unmasked `NaN` is present. + Greater consistancy in ``assert_almost_equal`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The precision check for scalars has been changed to match that for arrays. It