@@ -157,13 +157,14 @@ def _hist_bin_sqrt(x):
157
157
Parameters
158
158
----------
159
159
x : array_like
160
- Input data that is to be histogrammed.
160
+ Input data that is to be histogrammed, trimmed to range. May not
161
+ be empty.
161
162
162
163
Returns
163
164
-------
164
- n : An estimate of the optimal bin count for the given data.
165
+ w : An estimate of the optimal bin width for the given data.
165
166
"""
166
- return int ( np . ceil ( np .sqrt (x .size )) )
167
+ return x . ptp () / np .sqrt (x .size )
167
168
168
169
169
170
def _hist_bin_sturges (x ):
@@ -178,13 +179,14 @@ def _hist_bin_sturges(x):
178
179
Parameters
179
180
----------
180
181
x : array_like
181
- Input data that is to be histogrammed.
182
+ Input data that is to be histogrammed, trimmed to range. May not
183
+ be empty.
182
184
183
185
Returns
184
186
-------
185
- n : An estimate of the optimal bin count for the given data.
187
+ w : An estimate of the optimal bin width for the given data.
186
188
"""
187
- return int ( np .ceil (np .log2 (x .size ))) + 1
189
+ return x . ptp () / np .ceil (np .log2 (x .size ) + 1.0 )
188
190
189
191
190
192
def _hist_bin_rice (x ):
@@ -200,13 +202,14 @@ def _hist_bin_rice(x):
200
202
Parameters
201
203
----------
202
204
x : array_like
203
- Input data that is to be histogrammed.
205
+ Input data that is to be histogrammed, trimmed to range. May not
206
+ be empty.
204
207
205
208
Returns
206
209
-------
207
- n : An estimate of the optimal bin count for the given data.
210
+ w : An estimate of the optimal bin width for the given data.
208
211
"""
209
- return int ( np . ceil ( 2 * x .size ** (1.0 / 3 ) ))
212
+ return x . ptp () / ( 2.0 * x .size ** (1.0 / 3 ))
210
213
211
214
212
215
def _hist_bin_scott (x ):
@@ -220,16 +223,14 @@ def _hist_bin_scott(x):
220
223
Parameters
221
224
----------
222
225
x : array_like
223
- Input data that is to be histogrammed.
226
+ Input data that is to be histogrammed, trimmed to range. May not
227
+ be empty.
224
228
225
229
Returns
226
230
-------
227
- n : An estimate of the optimal bin count for the given data.
231
+ w : An estimate of the optimal bin width for the given data.
228
232
"""
229
- h = (24 * np .pi ** 0.5 / x .size )** (1.0 / 3 ) * np .std (x )
230
- if h > 0 :
231
- return int (np .ceil (x .ptp () / h ))
232
- return 1
233
+ return (24.0 * np .pi ** 0.5 / x .size )** (1.0 / 3.0 ) * np .std (x )
233
234
234
235
235
236
def _hist_bin_doane (x ):
@@ -243,38 +244,39 @@ def _hist_bin_doane(x):
243
244
Parameters
244
245
----------
245
246
x : array_like
246
- Input data that is to be histogrammed.
247
+ Input data that is to be histogrammed, trimmed to range. May not
248
+ be empty.
247
249
248
250
Returns
249
251
-------
250
- n : An estimate of the optimal bin count for the given data.
252
+ w : An estimate of the optimal bin width for the given data.
251
253
"""
252
254
if x .size > 2 :
253
255
sg1 = np .sqrt (6.0 * (x .size - 2 ) / ((x .size + 1.0 ) * (x .size + 3 )))
254
256
sigma = np .std (x )
255
- if sigma > 0 :
257
+ if sigma > 0.0 :
256
258
# These three operations add up to
257
259
# g1 = np.mean(((x - np.mean(x)) / sigma)**3)
258
260
# but use only one temp array instead of three
259
261
temp = x - np .mean (x )
260
262
np .true_divide (temp , sigma , temp )
261
263
np .power (temp , 3 , temp )
262
264
g1 = np .mean (temp )
263
- return int ( np . ceil (1.0 + np .log2 (x .size ) +
264
- np .log2 (1.0 + np .absolute (g1 ) / sg1 ) ))
265
- return 1
265
+ return x . ptp () / (1.0 + np .log2 (x .size ) +
266
+ np .log2 (1.0 + np .absolute (g1 ) / sg1 ))
267
+ return 0.0
266
268
267
269
268
270
def _hist_bin_fd (x ):
269
271
"""
270
272
The Freedman-Diaconis histogram bin estimator.
271
273
272
- The Freedman-Diaconis rule uses interquartile range (IQR)
273
- binwidth. It is considered a variation of the Scott rule with more
274
- robustness as the IQR is less affected by outliers than the standard
275
- deviation. However, the IQR depends on fewer points than the
276
- standard deviation, so it is less accurate, especially for long
277
- tailed distributions.
274
+ The Freedman-Diaconis rule uses interquartile range (IQR) to
275
+ estimate binwidth. It is considered a variation of the Scott rule
276
+ with more robustness as the IQR is less affected by outliers than
277
+ the standard deviation. However, the IQR depends on fewer points
278
+ than the standard deviation, so it is less accurate, especially for
279
+ long tailed distributions.
278
280
279
281
If the IQR is 0, this function returns 0 for the bin width.
280
282
Binwidth is inversely proportional to the cube root of data size
@@ -283,46 +285,42 @@ def _hist_bin_fd(x):
283
285
Parameters
284
286
----------
285
287
x : array_like
286
- Input data that is to be histogrammed.
288
+ Input data that is to be histogrammed, trimmed to range. May not
289
+ be empty.
287
290
288
291
Returns
289
292
-------
290
- n : An estimate of the optimal bin count for the given data.
293
+ w : An estimate of the optimal bin width for the given data.
291
294
"""
292
295
iqr = np .subtract (* np .percentile (x , [75 , 25 ]))
293
-
294
- if iqr > 0 :
295
- h = (2 * iqr * x .size ** (- 1.0 / 3 ))
296
- return int (np .ceil (x .ptp () / h ))
297
-
298
- # If iqr is 0, default number of bins is 1
299
- return 1
296
+ return 2.0 * iqr * x .size ** (- 1.0 / 3.0 )
300
297
301
298
302
299
def _hist_bin_auto (x ):
303
300
"""
304
- Histogram bin estimator that uses the maximum of the
301
+ Histogram bin estimator that uses the minimum width of the
305
302
Freedman-Diaconis and Sturges estimators.
306
303
307
- The FD estimator is usually the most robust method, but it tends to
308
- be too small for small `x`. The Sturges estimator is quite good for
309
- small (<1000) datasets and is the default in the R language. This
310
- method gives good off the shelf behaviour.
304
+ The FD estimator is usually the most robust method, but its width
305
+ estimate tends to be too large for small `x`. The Sturges estimator
306
+ is quite good for small (<1000) datasets and is the default in the R
307
+ language. This method gives good off the shelf behaviour.
311
308
312
309
Parameters
313
310
----------
314
311
x : array_like
315
- Input data that is to be histogrammed.
312
+ Input data that is to be histogrammed, trimmed to range. May not
313
+ be empty.
316
314
317
315
Returns
318
316
-------
319
- n : An estimate of the optimal bin count for the given data.
317
+ w : An estimate of the optimal bin width for the given data.
320
318
321
319
See Also
322
320
--------
323
321
_hist_bin_fd, _hist_bin_sturges
324
322
"""
325
- return max (_hist_bin_fd (x ), _hist_bin_sturges (x ))
323
+ return min (_hist_bin_fd (x ), _hist_bin_sturges (x ))
326
324
327
325
328
326
# Private dict initialized at module load time
@@ -548,36 +546,53 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
548
546
weights = weights .ravel ()
549
547
a = a .ravel ()
550
548
551
- if (range is not None ):
552
- mn , mx = range
553
- if (mn > mx ):
554
- raise ValueError (
555
- 'max must be larger than min in range parameter.' )
556
- if not np .all (np .isfinite ([mn , mx ])):
557
- raise ValueError (
558
- 'range parameter must be finite.' )
549
+ # Do not modify the original value of range so we can check for `None`
550
+ if range is None :
551
+ if a .size == 0 :
552
+ # handle empty arrays. Can't determine range, so use 0-1.
553
+ mn , mx = 0.0 , 1.0
554
+ else :
555
+ mn , mx = a .min () + 0.0 , a .max () + 0.0
556
+ else :
557
+ mn , mx = [mi + 0.0 for mi in range ]
558
+ if mn > mx :
559
+ raise ValueError (
560
+ 'max must be larger than min in range parameter.' )
561
+ if not np .all (np .isfinite ([mn , mx ])):
562
+ raise ValueError (
563
+ 'range parameter must be finite.' )
564
+ if mn == mx :
565
+ mn -= 0.5
566
+ mx += 0.5
559
567
560
568
if isinstance (bins , basestring ):
561
569
# if `bins` is a string for an automatic method,
562
570
# this will replace it with the number of bins calculated
563
571
if bins not in _hist_bin_selectors :
564
- raise ValueError ("{0 } not a valid estimator for ` bins` " .format (bins ))
572
+ raise ValueError ("{} not a valid estimator for bins" .format (bins ))
565
573
if weights is not None :
566
574
raise TypeError ("Automated estimation of the number of "
567
575
"bins is not supported for weighted data" )
568
576
# Make a reference to `a`
569
577
b = a
570
578
# Update the reference if the range needs truncation
571
579
if range is not None :
572
- mn , mx = range
573
580
keep = (a >= mn )
574
581
keep &= (a <= mx )
575
582
if not np .logical_and .reduce (keep ):
576
583
b = a [keep ]
584
+
577
585
if b .size == 0 :
578
586
bins = 1
579
587
else :
580
- bins = _hist_bin_selectors [bins ](b )
588
+ # Do not call selectors on empty arrays
589
+ width = _hist_bin_selectors [bins ](b )
590
+ if width :
591
+ bins = int (np .ceil ((mx - mn ) / width ))
592
+ else :
593
+ # Width can be zero for some estimators, e.g. FD when
594
+ # the IQR of the data is zero.
595
+ bins = 1
581
596
582
597
# Histogram is an integer or a float array depending on the weights.
583
598
if weights is None :
@@ -593,16 +608,6 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
593
608
if np .isscalar (bins ) and bins < 1 :
594
609
raise ValueError (
595
610
'`bins` should be a positive integer.' )
596
- if range is None :
597
- if a .size == 0 :
598
- # handle empty arrays. Can't determine range, so use 0-1.
599
- range = (0 , 1 )
600
- else :
601
- range = (a .min (), a .max ())
602
- mn , mx = [mi + 0.0 for mi in range ]
603
- if mn == mx :
604
- mn -= 0.5
605
- mx += 0.5
606
611
# At this point, if the weights are not integer, floating point, or
607
612
# complex, we have to use the slow algorithm.
608
613
if weights is not None and not (np .can_cast (weights .dtype , np .double ) or
0 commit comments