8000 BUG: Incorrect handling of range in `histogram` with automatic bins. · numpy/numpy@127eb9e · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 127eb9e

Browse files
committed
BUG: Incorrect handling of range in histogram with automatic bins.
Fixes #7411. Tests and documentation updated. Fixes other small issues with range and bin count computations.
1 parent 1429c60 commit 127eb9e

File tree

2 files changed

+82
-77
lines changed

2 files changed

+82
-77
lines changed

numpy/lib/function_base.py

Lines changed: 70 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -157,13 +157,14 @@ def _hist_bin_sqrt(x):
157157
Parameters
158158
----------
159159
x : array_like
160-
Input data that is to be histogrammed.
160+
Input data that is to be histogrammed, trimmed to range. May not
161+
be empty.
161162
162163
Returns
163164
-------
164-
n : An estimate of the optimal bin count for the given data.
165+
w : An estimate of the optimal bin width for the given data.
165166
"""
166-
return int(np.ceil(np.sqrt(x.size)))
167+
return x.ptp() / np.sqrt(x.size)
167168

168169

169170
def _hist_bin_sturges(x):
@@ -178,13 +179,14 @@ def _hist_bin_sturges(x):
178179
Parameters
179180
----------
180181
x : array_like
181-
Input data that is to be histogrammed.
182+
Input data that is to be histogrammed, trimmed to range. May not
183+
be empty.
182184
183185
Returns
184186
-------
185-
n : An estimate of the optimal bin count for the given data.
187+
w : An estimate of the optimal bin width for the given data.
186188
"""
187-
return int(np.ceil(np.log2(x.size))) + 1
189+
return x.ptp() / np.ceil(np.log2(x.size) + 1.0)
188190

189191

190192
def _hist_bin_rice(x):
@@ -200,13 +202,14 @@ def _hist_bin_rice(x):
200202
Parameters
201203
----------
202204
x : array_like
203-
Input data that is to be histogrammed.
205+
Input data that is to be histogrammed, trimmed to range. May not
206+
be empty.
204207
205208
Returns
206209
-------
207-
n : An estimate of the optimal bin count for the given data.
210+
w : An estimate of the optimal bin width for the given data.
208211
"""
209-
return int(np.ceil(2 * x.size ** (1.0 / 3)))
212+
return x.ptp() / (2.0 * x.size ** (1.0 / 3))
210213

211214

212215
def _hist_bin_scott(x):
@@ -220,16 +223,14 @@ def _hist_bin_scott(x):
220223
Parameters
221224
----------
222225
x : array_like
223-
Input data that is to be histogrammed.
226+
Input data that is to be histogrammed, trimmed to range. May not
227+
be empty.
224228
225229
Returns
226230
-------
227-
n : An estimate of the optimal bin count for the given data.
231+
w : An estimate of the optimal bin width for the given data.
228232
"""
229-
h = (24 * np.pi**0.5 / x.size)**(1.0 / 3) * np.std(x)
230-
if h > 0:
231-
return int(np.ceil(x.ptp() / h))
232-
return 1
233+
return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
233234

234235

235236
def _hist_bin_doane(x):
@@ -243,38 +244,39 @@ def _hist_bin_doane(x):
243244
Parameters
244245
----------
245246
x : array_like
246-
Input data that is to be histogrammed.
247+
Input data that is to be histogrammed, trimmed to range. May not
248+
be empty.
247249
248250
Returns
249251
-------
250-
n : An estimate of the optimal bin count for the given data.
252+
w : An estimate of the optimal bin width for the given data.
251253
"""
252254
if x.size > 2:
253255
sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
254256
sigma = np.std(x)
255-
if sigma > 0:
257+
if sigma > 0.0:
256258
# These three operations add up to
257259
# g1 = np.mean(((x - np.mean(x)) / sigma)**3)
258260
# but use only one temp array instead of three
259261
temp = x - np.mean(x)
260262
np.true_divide(temp, sigma, temp)
261263
np.power(temp, 3, temp)
262264
g1 = np.mean(temp)
263-
return int(np.ceil(1.0 + np.log2(x.size) +
264-
np.log2(1.0 + np.absolute(g1) / sg1)))
265-
return 1
265+
return x.ptp() / (1.0 + np.log2(x.size) +
266+
np.log2(1.0 + np.absolute(g1) / sg1))
267+
return 0.0
266268

267269

268270
def _hist_bin_fd(x):
269271
"""
270272
The Freedman-Diaconis histogram bin estimator.
271273
272-
The Freedman-Diaconis rule uses interquartile range (IQR)
273-
binwidth. It is considered a variation of the Scott rule with more
274-
robustness as the IQR is less affected by outliers than the standard
275-
deviation. However, the IQR depends on fewer points than the
276-
standard deviation, so it is less accurate, especially for long
277-
tailed distributions.
274+
The Freedman-Diaconis rule uses interquartile range (IQR) to
275+
estimate binwidth. It is considered a variation of the Scott rule
276+
with more robustness as the IQR is less affected by outliers than
277+
the standard deviation. However, the IQR depends on fewer points
278+
than the standard deviation, so it is less accurate, especially for
279+
long tailed distributions.
278280
279281
If the IQR is 0, this function returns 1 for the number of bins.
280282
Binwidth is inversely proportional to the cube root of data size
@@ -283,46 +285,42 @@ def _hist_bin_fd(x):
283285
Parameters
284286
----------
285287
x : array_like
286-
Input data that is to be histogrammed.
288+
Input data that is to be histogrammed, trimmed to range. May not
289+
be empty.
287290
288291
Returns
289292
-------
290-
n : An estimate of the optimal bin count for the given data.
293+
w : An estimate of the optimal bin width for the given data.
291294
"""
292295
iqr = np.subtract(*np.percentile(x, [75, 25]))
293-
294-
if iqr > 0:
295-
h = (2 * iqr * x.size ** (-1.0 / 3))
296-
return int(np.ceil(x.ptp() / h))
297-
298-
# If iqr is 0, default number of bins is 1
299-
return 1
296+
return 2.0 * iqr * x.size ** (-1.0 / 3.0)
300297

301298

302299
def _hist_bin_auto(x):
303300
"""
304-
Histogram bin estimator that uses the maximum of the
301+
Histogram bin estimator that uses the minimum width of the
305302
Freedman-Diaconis and Sturges estimators.
306303
307-
The FD estimator is usually the most robust method, but it tends to
308-
be too small for small `x`. The Sturges estimator is quite good for
309-
small (<1000) datasets and is the default in the R language. This
310-
method gives good off the shelf behaviour.
304+
The FD estimator is usually the most robust method, but its width
305+
estimate tends to be too large for small `x`. The Sturges estimator
306+
is quite good for small (<1000) datasets and is the default in the R
307+
language. This method gives good off the shelf behaviour.
311308
312309
Parameters
313310
----------
314311
x : array_like
315-
Input data that is to be histogrammed.
312+
Input data that is to be histogrammed, trimmed to range. May not
313+
be empty.
316314
317315
Returns
318316
-------
319-
n : An estimate of the optimal bin count for the given data.
317+
w : An estimate of the optimal bin width for the given data.
320318
321319
See Also
322320
--------
323321
_hist_bin_fd, _hist_bin_sturges
324322
"""
325-
return max(_hist_bin_fd(x), _hist_bin_sturges(x))
323+
return min(_hist_bin_fd(x), _hist_bin_sturges(x))
326324

327325

328326
# Private dict initialized at module load time
@@ -548,36 +546,53 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
548546
weights = weights.ravel()
549547
a = a.ravel()
550548

551-
if (range is not None):
552-
mn, mx = range
553-
if (mn > mx):
554-
raise ValueError(
555-
'max must be larger than min in range parameter.')
556-
if not np.all(np.isfinite([mn, mx])):
557-
raise ValueError(
558-
'range parameter must be finite.')
549+
# Do not modify the original value of range so we can check for `None`
550+
if range is None:
551+
if a.size == 0:
552+
# handle empty arrays. Can't determine range, so use 0-1.
553+
mn, mx = 0.0, 1.0
554+
else:
555+
mn, mx = a.min() + 0.0, a.max() + 0.0
556+
else:
557+
mn, mx = [mi + 0.0 for mi in range]
558+
if mn > mx:
559+
raise ValueError(
560+
'max must be larger than min in range parameter.')
561+
if not np.all(np.isfinite([mn, mx])):
562+
raise ValueError(
563+
'range parameter must be finite.')
564+
if mn == mx:
565+
mn -= 0.5
566+
mx += 0.5
559567

560568
if isinstance(bins, basestring):
561569
# if `bins` is a string for an automatic method,
562570
# this will replace it with the number of bins calculated
563571
if bins not in _hist_bin_selectors:
564-
raise ValueError("{0} not a valid estimator for `bins`".format(bins))
572+
raise ValueError("{} not a valid estimator for bins".format(bins))
565573
if weights is not None:
566574
raise TypeError("Automated estimation of the number of "
567575
"bins is not supported for weighted data")
568576
# Make a reference to `a`
569577
b = a
570578
# Update the reference if the range needs truncation
571579
if range is not None:
572-
mn, mx = range
573580
keep = (a >= mn)
574581
keep &= (a <= mx)
575582
if not np.logical_and.reduce(keep):
576583
b = a[keep]
584+
577585
if b.size == 0:
578586
bins = 1
579587
else:
580-
bins = _hist_bin_selectors[bins](b)
588+
# Do not call selectors on empty arrays
589+
width = _hist_bin_selectors[bins](b)
590+
if width:
591+
bins = int(np.ceil((mx - mn) / width))
592+
else:
593+
# Width can be zero for some estimators, e.g. FD when
594+
# the IQR of the data is zero.
595+
bins = 1
581596

582597
# Histogram is an integer or a float array depending on the weights.
583598
if weights is None:
@@ -593,16 +608,6 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
593608
if np.isscalar(bins) and bins < 1:
594609
raise ValueError(
595610
'`bins` should be a positive integer.')
596-
if range is None:
597-
if a.size == 0:
598-
# handle empty arrays. Can't determine range, so use 0-1.
599-
range = (0, 1)
600-
else:
601-
range = (a.min(), a.max())
602-
mn, mx = [mi + 0.0 for mi in range]
603-
if mn == mx:
604-
mn -= 0.5
605-
mx += 0.5
606611
# At this point, if the weights are not integer, floating point, or
607612
# complex, we have to use the slow algorithm.
608613
if weights is not None and not (np.can_cast(weights.dtype, np.double) or

numpy/lib/tests/test_function_base.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,9 +1432,9 @@ def test_simple(self):
14321432
for testlen, expectedResults in basic_test.items():
14331433
# Create some sort of non uniform data to test with
14341434
# (2 peak uniform mixture)
1435-
x1 = np.linspace(-10, -1, testlen/5 * 2)
1436-
x2 = np.linspace(1,10, testlen/5 * 3)
1437-
x = np.hstack((x1, x2))
1435+
x1 = np.linspace(-10, -1, testlen // 5 * 2)
1436+
x2 = np.linspace(1, 10, testlen // 5 * 3)
1437+
x = np.concatenate((x1, x2))
14381438
for estimator, numbins in expectedResults.items():
14391439
a, b = np.histogram(x, estimator)
14401440
assert_equal(len(a), numbins, err_msg="For the {0} estimator "
@@ -1446,7 +1446,7 @@ def test_small(self):
14461446
adaptive methods, especially the FD method. All bin numbers have been
14471447
precalculated.
14481448
"""
1449-
small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1,
1449+
small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
14501450
'doane': 1, 'sqrt': 1},
14511451
2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
14521452
'doane': 1, 'sqrt': 2},
@@ -1474,8 +1474,8 @@ def test_novariance(self):
14741474
Primarily for Scott and FD as the SD and IQR are both 0 in this case
14751475
"""
14761476
novar_dataset = np.ones(100)
1477-
novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8,
1478-
'doane': 1, 'sqrt': 10, 'auto': 8}
1477+
novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
1478+
'doane': 1, 'sqrt': 1, 'auto': 1}
14791479

14801480
for estimator, numbins in novar_resultdict.items():
14811481
a, b = np.histogram(novar_dataset, estimator)
@@ -1510,14 +1510,14 @@ def test_simple_range(self):
15101510
the shouldn't change.
15111511
"""
15121512
# some basic sanity checking, with some fixed data. Checking for the correct number of bins
1513-
basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, 'auto': 7},
1514-
500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, 'auto': 10},
1515-
5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}}
1513+
basic_test = {50: {'fd': 8, 'scott': 8, 'rice': 15, 'sturges': 14, 'auto': 14},
1514+
500: {'fd': 15, 'scott': 16, 'rice': 32, 'sturges': 20, 'auto': 20},
1515+
5000: {'fd': 33, 'scott': 33, 'rice': 69, 'sturges': 28, 'auto': 33}}
15161516

15171517
for testlen, expectedResults in basic_test.items():
1518-
# create some sort of non uniform data to test with (2 peak uniform mixture)
1519-
x1 = np.linspace(-10, -1, testlen/5 * 2)
1520-
x2 = np.linspace(1, 10, testlen/5 * 3)
1518+
# create some sort of non uniform data to test with (3 peak uniform mixture)
1519+
x1 = np.linspace(-10, -1, testlen // 5 * 2)
1520+
x2 = np.linspace(1, 10, testlen // 5 * 3)
15211521
x3 = np.linspace(-100, -50, testlen)
15221522
x = np.hstack((x1, x2, x3))
15231523
for estimator, numbins in expectedResults.items():

0 commit comments

Comments
 (0)
0