ENH: When histogramming data with integer dtype, force bin width >= 1. · numpy/numpy@7b38ee4 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7b38ee4

Browse files
committed
ENH: When histogramming data with integer dtype, force bin width >= 1.
Bins of width < 1 don't make sense for integer data; they just add a bunch of spurious, unpopulated bins. (Perhaps an even better improvement would be to make sure that, when using integer data, the bin width is also an integer, so that each bin always covers the same number of possible values, but I guess that's possibly a more domain-specific issue.)

Before the PR:

    In [1]: np.histogram_bin_edges(np.tile(np.arange(10), 1000), "auto")
    Out[1]:
    array([0.  , 0.45, 0.9 , 1.35, 1.8 , 2.25, 2.7 , 3.15, 3.6 , 4.05, 4.5 ,
           4.95, 5.4 , 5.85, 6.3 , 6.75, 7.2 , 7.65, 8.1 , 8.55, 9.  ])

After:

    In [1]: np.histogram_bin_edges(np.tile(np.arange(10), 1000), "auto")
    Out[1]: array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
1 parent 1efe554 commit 7b38ee4

File tree

3 files changed

+33
-1
lines changed

3 files changed

+33
-1
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
``histogram`` auto-binning now returns bin sizes >=1 for integer input data
2+
---------------------------------------------------------------------------
3+
For integer input data, bin sizes smaller than 1 result in spurious empty
4+
bins. This is now avoided when the number of bins is computed using one of the
5+
algorithms provided by `histogram_bin_edges`.

numpy/lib/_histograms_impl.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,8 @@ def _get_bin_edges(a, bins, range, weights):
408408
# Do not call selectors on empty arrays
409409
width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
410410
if width:
411+
if np.issubdtype(a.dtype, np.integer) and width < 1:
412+
width = 1
411413
n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
412414
else:
413415
# Width can be zero for some estimators, e.g. FD when
@@ -623,6 +625,9 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
623625
The simplest and fastest estimator. Only takes into account the
624626
data size.
625627
628+
Additionally, if the data is of integer dtype, then the bin width will never
629+
be less than 1.
630+
626631
Examples
627632
--------
628633
>>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])

numpy/lib/tests/test_histograms.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ def test_small(self):
471471
'doane': 3, 'sqrt': 2, 'stone': 1}}
472472

473473
for testlen, expectedResults in small_dat.items():
474-
testdat = np.arange(testlen)
474+
testdat = np.arange(testlen).astype(float)
475475
for estimator, expbins in expectedResults.items():
476476
a, b = np.histogram(testdat, estimator)
477477
assert_equal(len(a), expbins, err_msg="For the {0} estimator "
@@ -594,6 +594,28 @@ def test_signed_integer_data(self, bins):
594594
assert_array_equal(hist, hist32)
595595
assert_array_equal(edges, edges32)
596596

597+
@pytest.mark.parametrize("bins", ['auto', 'fd', 'doane', 'scott',
598+
'stone', 'rice', 'sturges'])
599+
def test_integer(self, bins):
600+
"""
601+
Test that bin width for integer data is at least 1.
602+
"""
603+
for bins in estimator_list:
604+
assert_equal(
605+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), bins),
606+
np.arange(9))
607+
608+
def test_integer_non_auto(self):
609+
"""
610+
Test that the bin-width>=1 requirement *only* applies to auto binning.
611+
"""
612+
assert_equal(
613+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), 16),
614+
np.arange(17) / 2)
615+
assert_equal(
616+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), [.1, .2]),
617+
[.1, .2])
618+
597619
def test_simple_weighted(self):
598620
"""
599621
Check that weighted data raises a TypeError

0 commit comments

Comments
 (0)
0