@@ -76,150 +76,191 @@ def iterable(y):
     return True


-def _hist_optim_numbins_estimator(a, estimator, data_range=None, data_weights=None):
+def _hist_bin_sqrt(x):
     """
-    A helper function to be called from ``histogram`` to deal with
-    estimating optimal number of bins.
+    Square root histogram bin estimator.

-    A description of the estimators can be found at
-    https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
+    Used by many programs for its simplicity.

     Parameters
     ----------
-    a : array_like
-        The data with which to estimate the number of bins
-    estimator: str
-        If ``estimator`` is one of ['auto', 'fd', 'scott', 'doane',
-        'rice', 'sturges', 'sqrt'], this function will choose the
-        appropriate estimation method and return the optimal number of
-        bins it calculates.
-    data_range: tuple (min, max)
-        The range that the data to be binned should be restricted to.
-    data_weights:
-        weights are not supported, so this field must be empty or None.
+    x : array_like
+        Input data that is to be histogrammed.
+
+    Returns
+    -------
+    n : An estimate of the optimal bin count for the given data.
     """
-    if a.size == 0:
-        return 1
+    return int(np.ceil(np.sqrt(x.size)))

-    if data_weights is not None:
-        raise TypeError("Automated estimation of the number of "
-                        "bins is not supported for weighted data")

-    if data_range is not None:
-        mn, mx = data_range
-        keep = (a >= mn)
-        keep &= (a <= mx)
-        if not np.logical_and.reduce(keep):
-            a = a[keep]
+def _hist_bin_sturges(x):
+    """
+    Sturges histogram bin estimator.

-    def sqrt(x):
-        """
-        Square Root Estimator
+    A very simplistic estimator based on the assumption of normality of
+    the data. This estimator has poor performance for non-normal data,
+    which becomes especially obvious for large data sets. The estimate
+    depends only on size of the data.

-        Used by many programs for its simplicity.
-        """
-        return np.ceil(np.sqrt(x.size))
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed.

-    def sturges(x):
-        """
-        Sturges Estimator
+    Returns
+    -------
+    n : An estimate of the optimal bin count for the given data.
+    """
+    return int(np.ceil(np.log2(x.size))) + 1

-        A very simplistic estimator based on the assumption of normality
-        of the data. Poor performance for non-normal data, especially
-        obvious for large ``x``. Depends only on size of the data.
-        """
-        return np.ceil(np.log2(x.size)) + 1

-    def rice(x):
-        """
-        Rice Estimator
+def _hist_bin_rice(x):
+    """
+    Rice histogram bin estimator.

-        Another simple estimator, with no normality assumption. It has
-        better performance for large data, but tends to overestimate
-        number of bins. The number of bins is proportional to the cube
-        root of data size (asymptotically optimal). Depends only on size
-        of the data.
-        """
-        return np.ceil(2 * x.size ** (1.0 / 3))
+    Another simple estimator with no normality assumption. It has better
+    performance for large data than Sturges, but tends to overestimate
+    the number of bins. The number of bins is proportional to the cube
+    root of data size (asymptotically optimal). The estimate depends
+    only on size of the data.

-    def scott(x):
-        """
-        Scott Estimator
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed.

-        The binwidth is proportional to the standard deviation of the
-        data and inversely proportional to the cube root of data size
-        (asymptotically optimal).
-        """
-        h = (24 * np.pi**0.5 / x.size)**(1.0 / 3) * np.std(x)
-        if h > 0:
-            return np.ceil(x.ptp() / h)
-        return 1
+    Returns
+    -------
+    n : An estimate of the optimal bin count for the given data.
+    """
+    return int(np.ceil(2 * x.size ** (1.0 / 3)))

-    def doane(x):
-        """
-        Doane's Estimator

-        Improved version of Sturges' formula which works better for
-        non-normal data. See
-        http://stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
-        """
-        if x.size > 2:
-            sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
-            sigma = np.std(x)
-            if sigma > 0:
-                # These three operations add up to
-                # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
-                # but use only one temp array instead of three
-                temp = x - np.mean(x)
-                np.true_divide(temp, sigma, temp)
-                np.power(temp, 3, temp)
-                g1 = np.mean(temp)
-                return np.ceil(1.0 + np.log2(x.size) +
-                               np.log2(1.0 + np.absolute(g1) / sg1))
-        return 1
-
-    def fd(x):
-        """
-        Freedman Diaconis Estimator
+def _hist_bin_scott(x):
+    """
+    Scott histogram bin estimator.

-        The interquartile range (IQR) is used for binwidth, making this
-        variation of the Scott rule more robust, as the IQR is less
-        affected by outliers than the standard deviation. However, the
-        IQR depends on fewer points than the standard deviation, so it
-        is less accurate, especially for long tailed distributions.
+    The binwidth is proportional to the standard deviation of the data
+    and inversely proportional to the cube root of data size
+    (asymptotically optimal).

-        If the IQR is 0, we return 1 for the number of bins. Binwidth is
-        inversely proportional to the cube root of data size
-        (asymptotically optimal).
-        """
-        iqr = np.subtract(*np.percentile(x, [75, 25]))
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed.

-        if iqr > 0:
-            h = (2 * iqr * x.size ** (-1.0 / 3))
-            return np.ceil(x.ptp() / h)
+    Returns
+    -------
+    n : An estimate of the optimal bin count for the given data.
+    """
+    h = (24 * np.pi**0.5 / x.size)**(1.0 / 3) * np.std(x)
+    if h > 0:
+        return int(np.ceil(x.ptp() / h))
+    return 1

-        # If iqr is 0, default number of bins is 1
-        return 1

-    def auto(x):
-        """
-        The FD estimator is usually the most robust method, but it tends
-        to be too small for small ``x``. The Sturges estimator is quite
-        good for small (<1000) datasets and is the default in R. This
-        method gives good off-the-shelf behaviour.
-        """
-        return max(fd(x), sturges(x))
+def _hist_bin_doane(x):
+    """
+    Doane's histogram bin estimator.

-    optimal_numbins_methods = {'sqrt': sqrt, 'sturges': sturges,
-                               'rice': rice, 'scott': scott, 'doane': doane,
-                               'fd': fd, 'auto': auto}
-    try:
-        estimator_func = optimal_numbins_methods[estimator.lower()]
-    except KeyError:
-        raise ValueError("{0} not a valid method for `bins`".format(estimator))
-    else:
-        # these methods return floats, np.histogram requires an int
-        return int(estimator_func(a))
+    Improved version of Sturges' formula which works better for
+    non-normal data. See
+    http://stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
+
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed.
+
+    Returns
+    -------
+    n : An estimate of the optimal bin count for the given data.
+    """
+    if x.size > 2:
+        sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
+        sigma = np.std(x)
+        if sigma > 0:
+            # These three operations add up to
+            # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
+            # but use only one temp array instead of three
+            temp = x - np.mean(x)
+            np.true_divide(temp, sigma, temp)
+            np.power(temp, 3, temp)
+            g1 = np.mean(temp)
+            return int(np.ceil(1.0 + np.log2(x.size) +
+                               np.log2(1.0 + np.absolute(g1) / sg1)))
+    return 1
+
+
+def _hist_bin_fd(x):
+    """
+    The Freedman-Diaconis histogram bin estimator.
+
+    The Freedman-Diaconis rule uses interquartile range (IQR)
+    binwidth. It is considered a variation of the Scott rule with more
+    robustness as the IQR is less affected by outliers than the standard
+    deviation. However, the IQR depends on fewer points than the
+    standard deviation, so it is less accurate, especially for long
+    tailed distributions.
+
+    If the IQR is 0, this function returns 1 for the number of bins.
+    Binwidth is inversely proportional to the cube root of data size
+    (asymptotically optimal).
+
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed.
+
+    Returns
+    -------
+    n : An estimate of the optimal bin count for the given data.
+    """
+    iqr = np.subtract(*np.percentile(x, [75, 25]))
+
+    if iqr > 0:
+        h = (2 * iqr * x.size ** (-1.0 / 3))
+        return int(np.ceil(x.ptp() / h))
+
+    # If iqr is 0, default number of bins is 1
+    return 1
+
+
+def _hist_bin_auto(x):
+    """
+    Histogram bin estimator that uses the maximum of the
+    Freedman-Diaconis and Sturges estimators.
+
+    The FD estimator is usually the most robust method, but it tends to
+    be too small for small `x`. The Sturges estimator is quite good for
+    small (<1000) datasets and is the default in the R language. This
+    method gives good off the shelf behaviour.
+
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed.
+
+    Returns
+    -------
+    n : An estimate of the optimal bin count for the given data.
+
+    See Also
+    --------
+    _hist_bin_fd, _hist_bin_sturges
+    """
+    return max(_hist_bin_fd(x), _hist_bin_sturges(x))
+
+
+# Private dict initialized at module load time
+_hist_bin_selectors = {'auto': _hist_bin_auto,
+                       'doane': _hist_bin_doane,
+                       'fd': _hist_bin_fd,
+                       'rice': _hist_bin_rice,
+                       'scott': _hist_bin_scott,
+                       'sqrt': _hist_bin_sqrt,
+                       'sturges': _hist_bin_sturges}


 def histogram(a, bins=10, range=None, normed=False, weights=None,
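For context, and not part of the diff itself: a minimal sketch of how these selectors surface through the public API, assuming a NumPy build that includes this change (`np.histogram` accepting estimator names for `bins`). The estimator names mirror the keys of `_hist_bin_selectors` above.

```python
import numpy as np

# Hypothetical usage sketch; assumes np.histogram accepts string bin
# estimators as introduced in this patch.
rng = np.random.RandomState(0)
data = rng.standard_normal(1000)

for estimator in ('auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt'):
    hist, edges = np.histogram(data, bins=estimator)
    print(estimator, len(edges) - 1)  # number of bins chosen by each rule

# Automated bin selection rejects weighted data, per the check added below.
try:
    np.histogram(data, bins='auto', weights=np.ones_like(data))
except TypeError as exc:
    print('TypeError:', exc)
```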
@@ -241,9 +282,9 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         If `bins` is a string from the list below, `histogram` will use
         the method chosen to calculate the optimal number of bins (see
-        Notes for more detail on the estimators). For visualisation, we
-        suggest using the 'auto' option. Weighted data is not supported
-        for automated bin size selection.
+        `Notes` for more detail on the estimators). For visualisation,
+        using the 'auto' option is suggested. Weighted data is not
+        supported for automated bin size selection.

         'auto'
             Maximum of the 'sturges' and 'fd' estimators. Provides good
@@ -342,7 +383,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         value will usually be chosen, while larger datasets will usually
         default to FD.  Avoids the overly conservative behaviour of FD
         and Sturges for small and large datasets respectively.
-        Switchover point usually happens when ``x.size`` is around 1000.
+        Switchover point is usually :math:`a.size \approx 1000`.

     'FD' (Freedman Diaconis Estimator)
         .. math:: h = 2 \frac{IQR}{n^{1/3}}
@@ -444,11 +485,27 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
             raise ValueError(
                 'range parameter must be finite.')

-
     if isinstance(bins, basestring):
-        bins = _hist_optim_numbins_estimator(a, bins, range, weights)
         # if `bins` is a string for an automatic method,
         # this will replace it with the number of bins calculated
+        if bins not in _hist_bin_selectors:
+            raise ValueError("{0} not a valid estimator for `bins`".format(bins))
+        if weights is not None:
+            raise TypeError("Automated estimation of the number of "
+                            "bins is not supported for weighted data")
+        # Make a reference to `a`
+        b = a
+        # Update the reference if the range needs truncation
+        if range is not None:
+            mn, mx = range
+            keep = (a >= mn)
+            keep &= (a <= mx)
+            if not np.logical_and.reduce(keep):
+                b = a[keep]
+        if b.size == 0:
+            bins = 1
+        else:
+            bins = _hist_bin_selectors[bins](b)

     # Histogram is an integer or a float array depending on the weights.
     if weights is None:
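To make the new control flow easier to follow, here is a standalone sketch of the same bin-count selection path, for illustration only: the real logic lives inline in `np.histogram`, and `selectors` below stands in for the private `_hist_bin_selectors` dict defined in the first hunk.

```python
import numpy as np

def select_bin_count(a, estimator, range=None, weights=None, selectors=None):
    # Hypothetical helper mirroring the inline logic above; not part of NumPy.
    if estimator not in selectors:
        raise ValueError("{0} not a valid estimator for `bins`".format(estimator))
    if weights is not None:
        raise TypeError("Automated estimation of the number of "
                        "bins is not supported for weighted data")
    b = a  # keep a reference; truncate only if a range is given
    if range is not None:
        mn, mx = range
        keep = (a >= mn) & (a <= mx)
        if not np.logical_and.reduce(keep):
            b = a[keep]
    # Empty (or fully excluded) data falls back to a single bin
    return 1 if b.size == 0 else int(selectors[estimator](b))

# Example with just the square-root rule from the first hunk:
sqrt_rule = {'sqrt': lambda x: int(np.ceil(np.sqrt(x.size)))}
print(select_bin_count(np.arange(100.0), 'sqrt', range=(0, 49), selectors=sqrt_rule))  # 8
```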