10000 ENH: rewrite ma.median to improve poor performance for multiple dimensions by juliantaylor · Pull Request #4760 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: rewrite ma.median to improve poor performance for multiple dimensions #4760

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 2, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion numpy/lib/nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,19 +635,37 @@ def _nanmedian(a, axis=None, out=None, overwrite_input=False):
See nanmedian for parameter usage

"""
if axis is None:
if axis is None or a.ndim == 1:
part = a.ravel()
if out is None:
return _nanmedian1d(part, overwrite_input)
else:
out[...] = _nanmedian1d(part, overwrite_input)
return out
else:
# for small medians use sort + indexing which is still faster than
# apply_along_axis
if a.shape[axis] < 400:
return _nanmedian_small(a, axis, out, overwrite_input)
result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
if out is not None:
out[...] = result
return result

def _nanmedian_small(a, axis=None, out=None, overwrite_input=False):
"""
sort + indexing median, faster for small medians along multiple dimensions
due to the high overhead of apply_along_axis
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might mention nanmedian for parameter documentation.

see nanmedian for parameter usage
"""
a = np.ma.masked_array(a, np.isnan(a))
m = np.ma.median(a, axis=axis, overwrite_input=overwrite_input)
for i in range(np.count_nonzero(m.mask.ravel())):
warnings.warn("All-NaN slice encountered", RuntimeWarning)
if out is not None:
out[...] = m.filled(np.nan)
return out
return m.filled(np.nan)

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
"""
Expand Down
18 changes: 17 additions & 1 deletion numpy/lib/tests/test_nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
from numpy.testing import (
run_module_suite, TestCase, assert_, assert_equal, assert_almost_equal,
assert_raises
assert_raises, assert_array_equal
)


Expand Down Expand Up @@ -580,6 +580,22 @@ def test_out(self):
assert_almost_equal(res, resout)
assert_almost_equal(res, tgt)

def test_small_large(self):
# test the small and large code paths, current cutoff 400 elements
for s in [5, 20, 51, 200, 1000]:
d = np.random.randn(4, s)
# Randomly set some elements to NaN:
w = np.random.randint(0, d.size, size=d.size // 5)
d.ravel()[w] = np.nan
d[:,0] = 1. # ensure at least one good value
# use normal median without nans to compare
tgt = []
for x in d:
nonan = np.compress(~np.isnan(x), x)
tgt.append(np.median(nonan, overwrite_input=True))

assert_array_equal(np.nanmedian(d, axis=-1), tgt)

def test_result_values(self):
tgt = [np.median(d) for d in _rdat]
res = np.nanmedian(_ndat, axis=1)
Expand Down
11 changes: 6 additions & 5 deletions numpy/ma/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5082,12 +5082,12 @@ def sort(self, axis= -1, kind='quicksort', order=None,
filler = maximum_fill_value(self)
else:
filler = fill_value
idx = np.indices(self.shape)
idx = np.meshgrid(*[np.arange(x) for x in self.shape], sparse=True,
indexing='ij')
idx[axis] = self.filled(filler).argsort(axis=axis, kind=kind,
order=order)
idx_l = idx.tolist()
tmp_mask = self._mask[idx_l].flat
tmp_data = self._data[idx_l].flat
tmp_mask = self._mask[idx].flat
tmp_data = self._data[idx].flat
self._data.flat = tmp_data
self._mask.flat = tmp_mask
return
Expand Down Expand Up @@ -6188,7 +6188,8 @@ def sort(a, axis= -1, kind='quicksort', order=None, endwith=True, fill_value=Non
else:
filler = fill_value
# return
indx = np.indices(a.shape).tolist()
indx = np.meshgrid(*[np.arange(x) for x in a.shape], sparse=True,
indexing='ij')
indx[axis] = filled(a, filler).argsort(axis=axis, kind=kind, order=order)
return a[indx]
sort.__doc__ = MaskedArray.sort.__doc__
Expand Down
41 changes: 25 additions & 16 deletions numpy/ma/extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,15 +668,9 @@ def median(a, axis=None, out=None, overwrite_input=False):
fill_value = 1e+20)

"""
def _median1D(data):
counts = filled(count(data), 0)
(idx, rmd) = divmod(counts, 2)
if rmd:
choice = slice(idx, idx + 1)
else:
choice = slice(idx - 1, idx + 1)
return data[choice].mean(0)
#
if not hasattr(a, 'mask') or np.count_nonzero(a.mask) == 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this has a mask when called from nanmedian, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes but ma.median should also work on unmasked arrays, some existing testcases do test this.

return masked_array(np.median(a, axis=axis, out=out,
overwrite_input=overwrite_input), copy=False)
if overwrite_input:
if axis is None:
asorted = a.ravel()
Expand All @@ -687,14 +681,29 @@ def _median1D(data):
else:
asorted = sort(a, axis=axis)
if axis is None:
result = _median1D(asorted)
axis = 0
elif axis < 0:
axis += a.ndim

counts = asorted.shape[axis] - (asorted.mask).sum(axis=axis)
h = counts // 2
# create indexing mesh grid for all but reduced axis
axes_grid = [np.arange(x) for i, x in enumerate(asorted.shape)
if i != axis]
ind = np.meshgrid(*axes_grid, sparse=True, indexing='ij')
# insert indices of low and high median
ind.insert(axis, h - 1)
low = asorted[ind]
ind[axis] = h
high = asorted[ind]
# duplicate high if odd number of elements so mean does nothing
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for the astropy guys?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If so, might add a comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no the mean us used for even elements medians like in the normal median but in the case of masked median some entries may be odd some even, to get them into one output array I always select two elements and if the input was odd I make those pairs the same so the mean is essentially a no-op, you do have an additional small numerical error on the odd elements but I couldn't come up with a better way to do it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, makes sense.

odd = counts % 2 == 1
if asorted.ndim == 1:
if odd:
low = high
else:
result = apply_along_axis(_median1D, axis, asorted)
if out is not None:
out = result
return result


low[odd] = high[odd]
return np.ma.mean([low, high], axis=0, out=out)


#..............................................................................
Expand Down
13 changes: 13 additions & 0 deletions numpy/ma/tests/test_extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,19 @@ def test_3d(self):
x[x % 5 == 0] = masked
assert_equal(median(x, 0), [[12, 10], [8, 9], [16, 17]])

def test_neg_axis(self):
x = masked_array(np.arange(30).reshape(10, 3))
x[:3] = x[-3:] = masked
assert_equal(median(x, axis=-1), median(x, axis=1))

def test_out(self):
x = masked_array(np.arange(30).reshape(10, 3))
x[:3] = x[-3:] = masked
out = masked_array(np.ones(10))
r = median(x, axis=1, out=out)
assert_equal(r, out)
assert_(type(r) == MaskedArray)


class TestCov(TestCase):

Expand Down
0