ENH: When histogramming data with integer dtype, force bin width >= 1. · numpy/numpy@e53f683 · GitHub
[go: up one dir, main page]

Skip to content

Commit e53f683

Browse files
committed
ENH: When histogramming data with integer dtype, force bin width >= 1.
Bins of width < 1 don't make sense for integer data; they just add a bunch of spurious, unpopulated bins. (Perhaps an even better improvement would be to make sure that, when using integer data, the binwidth is also integer, so that each bin always covers the same number of possible values, but I guess that's possibly a more domain-specific issue.)

Before the PR:

    In [1]: np.histogram_bin_edges(np.tile(np.arange(10), 1000), "auto")
    Out[1]:
    array([0.  , 0.45, 0.9 , 1.35, 1.8 , 2.25, 2.7 , 3.15, 3.6 , 4.05, 4.5 ,
           4.95, 5.4 , 5.85, 6.3 , 6.75, 7.2 , 7.65, 8.1 , 8.55, 9.  ])

After:

    In [1]: np.histogram_bin_edges(np.tile(np.arange(10), 1000), "auto")
    Out[1]: array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
1 parent d2476ff commit e53f683

File tree

2 files changed

+27
-1
lines changed

2 files changed

+27
-1
lines changed

numpy/lib/histograms.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,8 @@ def _get_bin_edges(a, bins, range, weights):
346346
# Do not call selectors on empty arrays
347347
width = _hist_bin_selectors[bin_name](a)
348348
if width:
349+
if np.issubdtype(a.dtype, np.integer) and width < 1:
350+
width = 1
349351
n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
350352
else:
351353
# Width can be zero for some estimators, e.g. FD when
@@ -553,6 +555,9 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
553555
The simplest and fastest estimator. Only takes into account the
554556
data size.
555557
558+
Additionally, if the data is of integer dtype, then the binwidth will never
559+
be less than 1.
560+
556561
Examples
557562
--------
558563
>>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])

numpy/lib/tests/test_histograms.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,7 @@ def test_small(self):
478478
'doane': 3, 'sqrt': 2}}
479479

480480
for testlen, expectedResults in small_dat.items():
481-
testdat = np.arange(testlen)
481+
testdat = np.arange(testlen).astype(float)
482482
for estimator, expbins in expectedResults.items():
483483
a, b = np.histogram(testdat, estimator)
484484
assert_equal(len(a), expbins, err_msg="For the {0} estimator "
@@ -575,6 +575,27 @@ def test_simple_range(self):
575575
msg += " with datasize of {0}".format(testlen)
576576
assert_equal(len(a), numbins, err_msg=msg)
577577

578+
def test_integer(self):
579+
"""
580+
Test that bin width for integer data is at least 1.
581+
"""
582+
estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
583+
for estimator in estimator_list:
584+
assert_equal(
585+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), estimator),
586+
np.arange(9))
587+
588+
def test_integer_non_auto(self):
589+
"""
590+
Test that the bin-width>=1 requirement *only* applies to auto binning.
591+
"""
592+
assert_equal(
593+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), 16),
594+
np.arange(17) / 2)
595+
assert_equal(
596+
np.histogram_bin_edges(np.tile(np.arange(9), 1000), [.1, .2]),
597+
[.1, .2])
598+
578599
def test_simple_weighted(self):
579600
"""
580601
Check that weighted data raises a TypeError

0 commit comments

Comments
 (0)
0