8000 Merge pull request #11023 from eric-wieser/histogramdd-no-fuzz · numpy/numpy@2cf2db3 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2cf2db3

Browse files
authored
Merge pull request #11023 from eric-wieser/histogramdd-no-fuzz
BUG: np.histogramdd loses precision on its inputs, leading to incorrect results
2 parents dee82fb + 63cd4da commit 2cf2db3

File 10000 tree

4 files changed

+56
-26
lines changed

4 files changed

+56
-26
lines changed

doc/release/1.15.0-notes.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,13 @@ passed explicitly, and are not yet computed automatically.
216216
No longer does an IQR of 0 result in `n_bins=1`, rather the number of bins
217217
chosen is related to the data size in this situation.
218218

219+
``histogram`` and ``histogramdd`` return edges matching the float type of the data
220+
----------------------------------------------------------------------------------
221+
When passed ``float16``, ``np.float32``, or ``np.longdouble`` data, the
222+
returned edges are now of the same dtype. Previously, ``histogram`` would only
223+
return the same type if explicit bins were given, and ``histogram`` would
224+
produce ``float64`` bins no matter what the inputs.
225+
219226
``histogramdd`` allows explicit ranges to be given in a subset of axes
220227
----------------------------------------------------------------------
221228
The ``range`` argument of `histogramdd` can now contain ``None`` values to

numpy/lib/histograms.py

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -877,12 +877,6 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
877877
# bins is an integer
878878
bins = D*[bins]
879879

880-
# avoid rounding issues for comparisons when dealing with inexact types
881-
if np.issubdtype(sample.dtype, np.inexact):
882-
edge_dt = sample.dtype
883-
else:
884-
edge_dt = float
885-
886880
# normalize the range argument
887881
if range is None:
888882
range = (None,) * D
@@ -896,13 +890,12 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
896890
raise ValueError(
897891
'`bins[{}]` must be positive, when an integer'.format(i))
898892
smin, smax = _get_outer_edges(sample[:,i], range[i])
899-
edges[i] = np.linspace(smin, smax, bins[i] + 1, dtype=edge_dt)
893+
edges[i] = np.linspace(smin, smax, bins[i] + 1)
900894
elif np.ndim(bins[i]) == 1:
901-
edges[i] = np.asarray(bins[i], edge_dt)
902-
# not just monotonic, due to the use of mindiff below
903-
if np.any(edges[i][:-1] >= edges[i][1:]):
895+
edges[i] = np.asarray(bins[i])
896+
if np.any(edges[i][:-1] > edges[i][1:]):
904897
raise ValueError(
905-
'`bins[{}]` must be strictly increasing, when an array'
898+
'`bins[{}]` must be monotonically increasing, when an array'
906899
.format(i))
907900
else:
908901
raise ValueError(
@@ -913,24 +906,19 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
913906

914907
# Compute the bin number each sample falls into.
915908
Ncount = tuple(
916-
np.digitize(sample[:, i], edges[i])
909+
# avoid np.digitize to work around gh-11022
910+
np.searchsorted(edges[i], sample[:, i], side='right')
917911
for i in _range(D)
918912
)
919913

920914
# Using digitize, values that fall on an edge are put in the right bin.
921915
# For the rightmost bin, we want values equal to the right edge to be
922916
# counted in the last bin, and not as an outlier.
923917
for i in _range(D):
924-
# Rounding precision
925-
mindiff = dedges[i].min()
926-
if not np.isinf(mindiff):
927-
decimal = int(-np.log10(mindiff)) + 6
928-
# Find which points are on the rightmost edge.
929-
not_smaller_than_edge = (sample[:, i] >= edges[i][-1])
930-
on_edge = (np.around(sample[:, i], decimal) ==
931-
np.around(edges[i][-1], decimal))
932-
# Shift these points one bin to the left.
933-
Ncount[i][on_edge & not_smaller_than_edge] -= 1
918+
# Find which points are on the rightmost edge.
919+
on_edge = (sample[:, i] == edges[i][-1])
920+
# Shift these points one bin to the left.
921+
Ncount[i][on_edge] -= 1
934922

935923
# Compute the sample indices in the flattened histogram matrix.
936924
# This raises an error if the array is too large.

numpy/lib/tests/test_histograms.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -612,8 +612,6 @@ def test_bins_errors(self):
612612
x = np.arange(8).reshape(2, 4)
613613
assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
614614
assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
615-
assert_raises(
616-
ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
617615
assert_raises(
618616
ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
619617
assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
@@ -646,7 +644,7 @@ def test_rightmost_binedge(self):
646644
bins = [[0., 0.5, 1.0]]
647645
hist, _ = histogramdd(x, bins=bins)
648646
assert_(hist[0] == 0.0)
649-
assert_(hist[1] == 1.)
647+
assert_(hist[1] == 0.0)
650648
x = [1.0001]
651649
bins = [[0., 0.5, 1.0]]
652650
hist, _ = histogramdd(x, bins=bins)
@@ -660,3 +658,40 @@ def test_finite_range(self):
660658
range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
661659
assert_raises(ValueError, histogramdd, vals,
662660
range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
661+
662+
def test_equal_edges(self):
663+
""" Test that adjacent entries in an edge array can be equal """
664+
x = np.array([0, 1, 2])
665+
y = np.array([0, 1, 2])
666+
x_edges = np.array([0, 2, 2])
667+
y_edges = 1
668+
hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
669+
670+
hist_expected = np.array([
671+
[2.],
672+
[1.], # x == 2 falls in the final bin
673+
])
674+
assert_equal(hist, hist_expected)
675+
676+
def test_edge_dtype(self):
677+
""" Test that if an edge array is input, its type is preserved """
678+
x = np.array([0, 10, 20])
679+
y = x / 10
680+
x_edges = np.array([0, 5, 15, 20])
681+
y_edges = x_edges / 10
682+
hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
683+
684+
assert_equal(edges[0].dtype, x_edges.dtype)
685+
assert_equal(edges[1].dtype, y_edges.dtype)
686+
687+
def test_large_integers(self):
688+
big = 2**60 # Too large to represent with a full precision float
689+
690+
x = np.array([0], np.int64)
691+
x_edges = np.array([-1, +1], np.int64)
692+
y = big + x
693+
y_edges = big + x_edges
694+
695+
hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
696+
697+
assert_equal(hist[0, 0], 1)

numpy/lib/twodim_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -650,7 +650,7 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None):
650650
N = 1
651651

652652
if N != 1 and N != 2:
653-
xedges = yedges = asarray(bins, float)
653+
xedges = yedges = asarray(bins)
654654
bins = [xedges, yedges]
655655
hist, edges = histogramdd([x, y], bins, range, normed, weights)
656656
return hist, edges[0], edges[1]

0 commit comments

Comments
 (0)
0