ENH: When histogramming data with integer dtype, force bin width >= 1. · numpy/numpy@36596ed · GitHub
[go: up one dir, main page]

Skip to content

Commit 36596ed

Browse files
committed
ENH: When histogramming data with integer dtype, force bin width >= 1.
Bins of width < 1 don't make sense for integer data; they just add a bunch of spurious, unpopulated bins. (Perhaps an even better improvement would be to make sure that, when using integer data, the bin width is also an integer, so that each bin always covers the same number of possible values, but I guess that's possibly a more domain-specific issue.)

Before the PR:

    In [1]: np.histogram_bin_edges(np.tile(np.arange(10), 1000), "auto")
    Out[1]:
    array([0.  , 0.45, 0.9 , 1.35, 1.8 , 2.25, 2.7 , 3.15, 3.6 , 4.05, 4.5 ,
           4.95, 5.4 , 5.85, 6.3 , 6.75, 7.2 , 7.65, 8.1 , 8.55, 9.  ])

After:

    In [1]: np.histogram_bin_edges(np.tile(np.arange(10), 1000), "auto")
    Out[1]: array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
1 parent 148db64 commit 36596ed

File tree

3 files changed

+32
-1
lines changed

3 files changed

+32
-1
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
``histogram`` auto-binning now returns bin sizes >=1 for integer input data
2+
---------------------------------------------------------------------------
3+
For integer input data, bin sizes smaller than 1 result in spurious empty
4+
bins. This is now avoided when the number of bins is computed using one of the
5+
algorithms provided by `histogram_bin_edges`.

numpy/lib/_histograms_impl.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,8 @@ def _get_bin_edges(a, bins, range, weights):
410410
# Do not call selectors on empty arrays
411411
width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
412412
if width:
413+
if np.issubdtype(a.dtype, np.integer) and width < 1:
414+
width = 1
413415
n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
414416
else:
415417
# Width can be zero for some estimators, e.g. FD when
@@ -625,6 +627,9 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
625627
The simplest and fastest estimator. Only takes into account the
626628
data size.
627629
630+
Additionally, if the data is of integer dtype, then the bin width will never
631+
be less than 1.
632+
628633
Examples
629634
--------
630635
>>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])

numpy/lib/tests/test_histograms.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ def test_small(self):
469469
'doane': 3, 'sqrt': 2, 'stone': 1}}
470470

471471
for testlen, expectedResults in small_dat.items():
472-
testdat = np.arange(testlen)
472+
testdat = np.arange(testlen).astype(float)
473473
for estimator, expbins in expectedResults.items():
474474
a, b = np.histogram(testdat, estimator)
475475
assert_equal(len(a), expbins, err_msg="For the {0} estimator "
@@ -592,6 +592,27 @@ def test_signed_integer_data(self, bins):
592592
assert_array_equal(hist, hist32)
593593
assert_array_equal(edges, edges32)
594594

595+
@pytest.mark.parametrize("bins", ['auto', 'fd', 'doane', 'scott',
596+
'stone', 'rice', 'sturges'])
597+
def test_integer(self, bins):
598+
"""
599+
Test that bin width for integer data is at least 1.
600+
"""
601+
assert_equal(
602+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), bins),
603+
np.arange(9))
604+
605+
def test_integer_non_auto(self):
606+
"""
607+
Test that the bin-width>=1 requirement *only* applies to auto binning.
608+
"""
609+
assert_equal(
610+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), 16),
611+
np.arange(17) / 2)
612+
assert_equal(
613+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), [.1, .2]),
614+
[.1, .2])
615+
595616
def test_simple_weighted(self):
596617
"""
597618
Check that weighted data raises a TypeError

0 commit comments

Comments
 (0)
0