10000 BUG: np.histogramdd loses precision on its inputs, leading to incorrect results by eric-wieser · Pull Request #11023 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

BUG: np.histogramdd loses precision on its inputs, leading to incorrect results #11023

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/release/1.15.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,13 @@ passed explicitly, and are not yet computed automatically.
No longer does an IQR of 0 result in `n_bins=1`, rather the number of bins
chosen is related to the data size in this situation.

``histogram`` and ``histogramdd`` return edges matching the float type of the data
----------------------------------------------------------------------------------
When passed ``float16``, ``np.float32``, or ``np.longdouble`` data, the
returned edges are now of the same dtype. Previously, ``histogram`` would only
return the same type if explicit bins were given, and ``histogram`` would
produce ``float64`` bins no matter what the inputs.

``histogramdd`` allows explicit ranges to be given in a subset of axes
----------------------------------------------------------------------
The ``range`` argument of `histogramdd` can now contain ``None`` values to
Expand Down
32 changes: 10 additions & 22 deletions numpy/lib/histograms.py
8000
Original file line number Diff line number Diff line change
Expand Up @@ -877,12 +877,6 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
# bins is an integer
bins = D*[bins]

# avoid rounding issues for comparisons when dealing with inexact types
if np.issubdtype(sample.dtype, np.inexact):
edge_dt = sample.dtype
else:
edge_dt = float

# normalize the range argument
if range is None:
range = (None,) * D
Expand All @@ -896,13 +890,12 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
raise ValueError(
'`bins[{}]` must be positive, when an integer'.format(i))
smin, smax = _get_outer_edges(sample[:,i], range[i])
edges[i] = np.linspace(smin, smax, bins[i] + 1, dtype=edge_dt)
edges[i] = np.linspace(smin, smax, bins[i] + 1)
elif np.ndim(bins[i]) == 1:
edges[i] = np.asarray(bins[i], edge_dt)
# not just monotonic, due to the use of mindiff below
if np.any(edges[i][:-1] >= edges[i][1:]):
edges[i] = np.asarray(bins[i])
if np.any(edges[i][:-1] > edges[i][1:]):
raise ValueError(
'`bins[{}]` must be strictly increasing, when an array'
'`bins[{}]` must be monotonically increasing, when an array'
.format(i))
else:
raise ValueError(
Expand All @@ -913,24 +906,19 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):

# Compute the bin number each sample falls into.
Ncount = tuple(
np.digitize(sample[:, i], edges[i])
# avoid np.digitize to work around gh-11022
np.searchsorted(edges[i], sample[:, i], side='right')
for i in _range(D)
)

# Using digitize, values that fall on an edge are put in the right bin.
# For the rightmost bin, we want values equal to the right edge to be
# counted in the last bin, and not as an outlier.
for i in _range(D):
# Rounding precision
mindiff = dedges[i].min()
if not np.isinf(mindiff):
decimal = int(-np.log10(mindiff)) + 6
# Find which points are on the rightmost edge.
not_smaller_than_edge = (sample[:, i] >= edges[i][-1])
on_edge = (np.around(sample[:, i], decimal) ==
np.around(edges[i][-1], decimal))
# Shift these points one bin to the left.
Ncount[i][on_edge & not_smaller_than_edge] -= 1
# Find which points are on the rightmost edge.
on_edge = (sample[:, i] == edges[i][-1])
# Shift these points one bin to the left.
Ncount[i][on_edge] -= 1

# Compute the sample indices in the flattened histogram matrix.
# This raises an error if the array is too large.
Expand Down
41 changes: 38 additions & 3 deletions numpy/lib/tests/test_histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,8 +612,6 @@ def test_bins_errors(self):
x = np.arange(8).reshape(2, 4)
assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
assert_raises(
ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
assert_raises(
ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
Expand Down Expand Up @@ -646,7 +644,7 @@ def test_rightmost_binedge(self):
bins = [[0., 0.5, 1.0]]
hist, _ = histogramdd(x, bins=bins)
assert_(hist[0] == 0.0)
assert_(hist[1] == 1.)
assert_(hist[1] == 0.0)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was weird. 1.0000000001 <= 1.0 should be false.

x = [1.0001]
bins = [[0., 0.5, 1.0]]
hist, _ = histogramdd(x, bins=bins)
Expand All @@ -660,3 +658,40 @@ def test_finite_range(self):
range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
assert_raises(ValueError, histogramdd, vals,
range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])

def test_equal_edges(self):
""" Test that adjacent entries in an edge array can be equal """
x = np.array([0, 1, 2])
y = np.array([0, 1, 2])
x_edges = np.array([0, 2, 2])
y_edges = 1
hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On master, this test crashes with

ValueError: `bins[0]` must be strictly increasing, when an array


hist_expected = np.array([
[2.],
[1.], # x == 2 falls in the final bin
])
assert_equal(hist, hist_expected)

def test_edge_dtype(self):
""" Test that if an edge array is input, its type is preserved """
x = np.array([0, 10, 20])
y = x / 10
x_edges = np.array([0, 5, 15, 20])
y_edges = x_edges / 10
hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))

assert_equal(edges[0].dtype, x_edges.dtype)
assert_equal(edges[1].dtype, y_edges.dtype)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These fail on master, as edges[i].dtype is float64


def test_large_integers(self):
big = 2**60 # Too large to represent with a full precision float

x = np.array([0], np.int64)
x_edges = np.array([-1, +1], np.int64)
y = big + x
y_edges = big + x_edges

hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
Copy link
Member Author
@eric-wieser eric-wieser May 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also fails on master with

ValueError: `bins[0]` must be strictly increasing, when an array

There exist more malicious test cases that give the wrong result rather than crash, but I figured this was sufficient.


assert_equal(hist[0, 0], 1)
2 changes: 1 addition & 1 deletion numpy/lib/twodim_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None):
N = 1

if N != 1 and N != 2:
xedges = yedges = asarray(bins, float)
xedges = yedges = asarray(bins)
bins = [xedges, yedges]
hist, edges = histogramdd([x, y], bins, range, normed, weights)
return hist, edges[0], edges[1]
Expand Down
0