@@ -78,56 +78,100 @@ def iterable(y):
 def _hist_optim_numbins_estimator(a, estimator):
     """
-    A helper function to be called from histogram to deal with estimating optimal number of bins
+    A helper function to be called from ``histogram`` to deal with
+    estimating optimal number of bins.
+
+    A description of the estimators can be found at
+    https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
 
 
     estimator: str
-        If estimator is one of ['auto', 'fd', 'scott', 'rice', 'sturges'] this function
-        will choose the appropriate estimator and return it's estimate for the optimal
-        number of bins.
+        If ``estimator`` is one of ['auto', 'fd', 'scott', 'doane',
+        'rice', 'sturges', 'sqrt'], this function will choose the
+        appropriate estimation method and return the optimal number of
+        bins it calculates.
     """
     if a.size == 0:
         return 1
 
+    def sqrt(x):
+        """
+        Square Root Estimator
+
+        Used by many programs for its simplicity.
+        """
+        return np.ceil(np.sqrt(x.size))
+
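As a quick illustration, the same rule standalone (the nested helper above is not importable, so the function name here is ours):

import numpy as np

def sqrt_bins(x):
    return np.ceil(np.sqrt(x.size))

# 100 samples -> 10 bins, 5000 samples -> 71 bins
print(sqrt_bins(np.empty(100)), sqrt_bins(np.empty(5000)))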
     def sturges(x):
         """
         Sturges Estimator
-        A very simplistic estimator based on the assumption of normality of the data
-        Poor performance for non-normal data, especially obvious for large X.
-        Depends only on size of the data.
+
+        A very simplistic estimator based on the assumption of normality
+        of the data. Poor performance for non-normal data, especially
+        obvious for large ``x``. Depends only on size of the data.
         """
         return np.ceil(np.log2(x.size)) + 1
 
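Standalone, the rule is one line; its logarithmic growth is exactly what makes it too coarse for large non-normal samples (function name ours):

import numpy as np

def sturges_bins(x):
    return np.ceil(np.log2(x.size)) + 1

# 1000 samples -> 11 bins; a million samples -> only 21
print(sturges_bins(np.empty(1000)), sturges_bins(np.empty(10 ** 6)))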
     def rice(x):
         """
         Rice Estimator
-        Another simple estimator, with no normality assumption.
-        It has better performance for large data, but tends to overestimate number of bins.
-        The number of bins is proportional to the cube root of data size (asymptotically optimal)
-        Depends only on size of the data
+
+        Another simple estimator, with no normality assumption. It has
+        better performance for large data, but tends to overestimate
+        number of bins. The number of bins is proportional to the cube
+        root of data size (asymptotically optimal). Depends only on size
+        of the data.
         """
         return np.ceil(2 * x.size ** (1.0 / 3))
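For comparison with Sturges, a one-line standalone version; the cube-root growth yields noticeably more bins (name ours):

import numpy as np

def rice_bins(x):
    return np.ceil(2 * x.size ** (1.0 / 3))

# 1000 samples -> 20 bins, versus 11 from Sturges
print(rice_bins(np.empty(1000)))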
 
     def scott(x):
         """
         Scott Estimator
-        The binwidth is proportional to the standard deviation of the data and
-        inversely proportional to the cube root of data size (asymptotically optimal)
 
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to the cube root of data size
+        (asymptotically optimal).
         """
-        h = 3.5 * x.std() * x.size ** (-1.0 / 3)
+        h = (24 * np.pi ** 0.5 / x.size) ** (1.0 / 3) * np.std(x)
         if h > 0:
             return np.ceil(x.ptp() / h)
         return 1
 
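The coefficient change is easy to sanity-check: ``(24 * sqrt(pi))**(1/3)`` is about 3.49, so the new expression computes the same normal-reference binwidth as the old ``3.5 * std * n**(-1/3)``, just with the exact constant instead of the rounded one. A minimal sketch:

import numpy as np

print((24 * np.pi ** 0.5) ** (1.0 / 3))  # ~3.4908, vs. the old rounded 3.5

rng = np.random.RandomState(0)
x = rng.normal(size=1000)
h = (24 * np.pi ** 0.5 / x.size) ** (1.0 / 3) * np.std(x)  # Scott binwidth
print(np.ceil(x.ptp() / h))  # resulting number of bins for this sample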
+    def doane(x):
+        """
+        Doane's Estimator
+
+        Improved version of Sturges' formula which works better for
+        non-normal data. See
+        http://stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
+        """
+        if x.size > 2:
+            sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
+            sigma = np.std(x)
+            if sigma > 0:
+                # These three operations add up to
+                # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
+                # but use only one temp array instead of three
+                temp = x - np.mean(x)
+                np.true_divide(temp, sigma, temp)
+                np.power(temp, 3, temp)
+                g1 = np.mean(temp)
+                return np.ceil(1.0 + np.log2(x.size) +
+                               np.log2(1.0 + np.absolute(g1) / sg1))
+        return 1
+
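A standalone sketch of what the skewness correction buys (function name ours; the one-temp-array trick above is replaced by the plain expression it documents):

import numpy as np

def doane_bins(x):
    if x.size > 2:
        sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
        sigma = np.std(x)
        if sigma > 0:
            # plain form of the in-place computation above
            g1 = np.mean(((x - np.mean(x)) / sigma) ** 3)
            return np.ceil(1.0 + np.log2(x.size) +
                           np.log2(1.0 + np.absolute(g1) / sg1))
    return 1

rng = np.random.RandomState(0)
print(doane_bins(rng.normal(size=1000)))     # near the Sturges count of 11
print(doane_bins(rng.lognormal(size=1000)))  # extra bins for skewed data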
 
     def fd(x):
         """
-        Freedman Diaconis rule using interquartile range (IQR) for binwidth
-        Considered a variation of the Scott rule with more robustness as the IQR
-        is less affected by outliers than the standard deviation. However the IQR depends on
-        fewer points than the sd so it is less accurate, especially for long tailed distributions.
+        Freedman Diaconis Estimator
 
-        If the IQR is 0, we return 1 for the number of bins.
-        Binwidth is inversely proportional to the cube root of data size (asymptotically optimal)
+        The interquartile range (IQR) is used for binwidth, making this
+        variation of the Scott rule more robust, as the IQR is less
+        affected by outliers than the standard deviation. However, the
+        IQR depends on fewer points than the standard deviation, so it
+        is less accurate, especially for long tailed distributions.
+
+        If the IQR is 0, we return 1 for the number of bins. Binwidth is
+        inversely proportional to the cube root of data size
+        (asymptotically optimal).
         """
         iqr = np.subtract(*np.percentile(x, [75, 25]))
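The rest of ``fd`` falls outside this hunk; per the docstring, the rule it implements uses binwidth ``2 * IQR / n**(1/3)``. A standalone sketch under that assumption (name and fallback ours):

import numpy as np

def fd_bins(x):
    # binwidth h = 2 * IQR / n**(1/3); the IQR ignores the tails,
    # so the width is robust to outliers
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    if iqr > 0:
        h = 2 * iqr * x.size ** (-1.0 / 3)
        return np.ceil(x.ptp() / h)
    return 1  # degenerate case: zero IQR

rng = np.random.RandomState(0)
print(fd_bins(rng.normal(size=1000)))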
@@ -140,14 +184,15 @@ def fd(x):
 
     def auto(x):
         """
-        The FD estimator is usually the most robust method, but it tends to be too small
-        for small X. The Sturges estimator is quite good for small (<1000) datasets and is
-        the default in R.
-        This method gives good off the shelf behaviour.
+        The FD estimator is usually the most robust method, but it tends
+        to be too small for small ``x``. The Sturges estimator is quite
+        good for small (<1000) datasets and is the default in R. This
+        method gives good off-the-shelf behaviour.
         """
         return max(fd(x), sturges(x))
 
-    optimal_numbins_methods = {'sturges': sturges, 'rice': rice, 'scott': scott,
+    optimal_numbins_methods = {'sqrt': sqrt, 'sturges': sturges,
+                               'rice': rice, 'scott': scott, 'doane': doane,
                                'fd': fd, 'auto': auto}
     try:
         estimator_func = optimal_numbins_methods[estimator.lower()]
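The dict-plus-``try`` pattern keeps the dispatch table and the error handling separate, and makes adding an estimator a one-line change. A stripped-down sketch of the same idea (names and error message ours):

import numpy as np

def choose_bins(a, estimator):
    # mirror of the dispatch above: look the name up, turn unknown
    # names into an informative error
    methods = {'sqrt': lambda x: np.ceil(np.sqrt(x.size)),
               'sturges': lambda x: np.ceil(np.log2(x.size)) + 1}
    try:
        return methods[estimator.lower()](a)
    except KeyError:
        raise ValueError("{0} not a valid method for `bins`".format(estimator))

print(choose_bins(np.arange(100), 'SQRT'))  # case-insensitive -> 10.0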
@@ -169,66 +214,79 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         Input data. The histogram is computed over the flattened array.
     bins : int or sequence of scalars or str, optional
         If `bins` is an int, it defines the number of equal-width
-        bins in the given range (10, by default). If `bins` is a sequence,
-        it defines the bin edges, including the rightmost edge, allowing
-        for non-uniform bin widths.
+        bins in the given range (10, by default). If `bins` is a
+        sequence, it defines the bin edges, including the rightmost
+        edge, allowing for non-uniform bin widths.
 
         .. versionadded:: 1.11.0
 
-        If `bins` is a string from the list below, `histogram` will use the method
-        chosen to calculate the optimal number of bins (see Notes for more detail
-        on the estimators). For visualisation, we suggest using the 'auto' option.
+        If `bins` is a string from the list below, `histogram` will use
+        the method chosen to calculate the optimal number of bins (see
+        Notes for more detail on the estimators). For visualisation, we
+        suggest using the 'auto' option.
 
         'auto'
-            Maximum of the 'sturges' and 'fd' estimators. Provides good all round performance
+            Maximum of the 'sturges' and 'fd' estimators. Provides good
+            all round performance
 
         'fd' (Freedman Diaconis Estimator)
-            Robust (resilient to outliers) estimator that takes into account data
-            variability and data size.
+            Robust (resilient to outliers) estimator that takes into
+            account data variability and data size.
+
+        'doane'
+            An improved version of Sturges' estimator that works better
+            with non-normal datasets.
 
         'scott'
             Less robust estimator that takes into account data
             variability and data size.
 
         'rice'
-            Estimator does not take variability into account, only data size.
-            Commonly overestimates number of bins required.
+            Estimator does not take variability into account, only data
+            size. Commonly overestimates number of bins required.
 
         'sturges'
-            R's default method, only accounts for data size. Only optimal for
-            gaussian data and underestimates number of bins for large non-gaussian datasets.
+            R's default method, only accounts for data size. Only
+            optimal for gaussian data and underestimates number of bins
+            for large non-gaussian datasets.
+
+        'sqrt'
+            Square root (of data size) estimator, used by Excel and
+            other programs for its speed and simplicity.
 
     range : (float, float), optional
         The lower and upper range of the bins. If not provided, range
         is simply ``(a.min(), a.max())``. Values outside the range are
         ignored.
     normed : bool, optional
         This keyword is deprecated in Numpy 1.6 due to confusing/buggy
-        behavior. It will be removed in Numpy 2.0. Use the density keyword
-        instead.
-        If False, the result will contain the number of samples
-        in each bin. If True, the result is the value of the
-        probability *density* function at the bin, normalized such that
-        the *integral* over the range is 1. Note that this latter behavior is
-        known to be buggy with unequal bin widths; use `density` instead.
+        behavior. It will be removed in Numpy 2.0. Use the ``density``
+        keyword instead. If ``False``, the result will contain the
+        number of samples in each bin. If ``True``, the result is the
+        value of the probability *density* function at the bin,
+        normalized such that the *integral* over the range is 1. Note
+        that this latter behavior is known to be buggy with unequal bin
+        widths; use ``density`` instead.
     weights : array_like, optional
-        An array of weights, of the same shape as `a`. Each value in `a`
-        only contributes its associated weight towards the bin count
-        (instead of 1). If `normed` is True, the weights are normalized,
-        so that the integral of the density over the range remains 1
+        An array of weights, of the same shape as `a`. Each value in
+        `a` only contributes its associated weight towards the bin count
+        (instead of 1). If `density` is True, the weights are
+        normalized, so that the integral of the density over the range
+        remains 1.
     density : bool, optional
-        If False, the result will contain the number of samples
-        in each bin. If True, the result is the value of the
+        If ``False``, the result will contain the number of samples in
+        each bin. If ``True``, the result is the value of the
         probability *density* function at the bin, normalized such that
         the *integral* over the range is 1. Note that the sum of the
         histogram values will not be equal to 1 unless bins of unity
         width are chosen; it is not a probability *mass* function.
-        Overrides the `normed` keyword if given.
+
+        Overrides the ``normed`` keyword if given.
 
     Returns
     -------
     hist : array
-        The values of the histogram. See `normed` and `weights` for a
+        The values of the histogram. See `density` and `weights` for a
         description of the possible semantics.
     bin_edges : array of dtype float
         Return the bin edges ``(length(hist)+1)``.
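The return semantics are easy to check interactively; a small sketch of ``hist``/``bin_edges`` and the ``density`` normalization:

import numpy as np

hist, bin_edges = np.histogram(np.array([1, 2, 1]), bins=3)
print(hist, bin_edges)        # len(bin_edges) == len(hist) + 1

dens, edges = np.histogram(np.array([1, 2, 1]), bins=3, density=True)
print(np.sum(dens * np.diff(edges)))  # the density integrates to 1.0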
@@ -240,56 +298,77 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
 
     Notes
     -----
-    All but the last (righthand-most) bin is half-open. In other words, if
-    `bins` is::
+    All but the last (righthand-most) bin is half-open. In other words,
+    if `bins` is::
 
       [1, 2, 3, 4]
 
-    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the
-    second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes*
-    4.
+    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
+    the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which
+    *includes* 4.
 
     .. versionadded:: 1.11.0
 
-    The methods to estimate the optimal number of bins are well found in literature,
-    and are inspired by the choices R provides for histogram visualisation.
-    Note that having the number of bins proportional to :math:`n^{1/3}` is asymptotically optimal,
-    which is why it appears in most estimators.
-    These are simply plug-in methods that give good starting points for number of bins.
-    In the equations below, :math:`h` is the binwidth and :math:`n_h` is the number of bins
+    The methods to estimate the optimal number of bins are well found in
+    literature, and are inspired by the choices R provides for histogram
+    visualisation. Note that having the number of bins proportional to
+    :math:`n^{1/3}` is asymptotically optimal, which is why it appears
+    in most estimators. These are simply plug-in methods that give good
+    starting points for number of bins. In the equations below,
+    :math:`h` is the binwidth and :math:`n_h` is the number of bins.
 
     'Auto' (maximum of the 'Sturges' and 'FD' estimators)
-        A compromise to get a good value. For small datasets the sturges
-        value will usually be chosen, while larger datasets will usually default to FD.
-        Avoids the overly conservative behaviour of FD and Sturges for small and
-        large datasets respectively. Switchover point is usually x.size~1000.
+        A compromise to get a good value. For small datasets the Sturges
+        value will usually be chosen, while larger datasets will usually
+        default to FD. Avoids the overly conservative behaviour of FD
+        and Sturges for small and large datasets respectively.
+        Switchover point usually happens when ``x.size`` is around 1000.
 
     'FD' (Freedman Diaconis Estimator)
         .. math:: h = 2 \\frac{IQR}{n^{1/3}}
         The binwidth is proportional to the interquartile range (IQR)
         and inversely proportional to cube root of a.size. Can be too
-        conservative for small datasets, but is quite good
-        for large datasets. The IQR is very robust to outliers.
+        conservative for small datasets, but is quite good for large
+        datasets. The IQR is very robust to outliers.
 
     'Scott'
-        .. math:: h = \\frac{3.5\\sigma}{n^{1/3}}
-        The binwidth is proportional to the standard deviation (sd) of the data
-        and inversely proportional to cube root of a.size. Can be too
-        conservative for small datasets, but is quite good
-        for large datasets. The sd is not very robust to outliers. Values
-        are very similar to the Freedman Diaconis Estimator in the absence of outliers.
+        .. math:: h = \\sigma \\sqrt[3]{\\frac{24 * \\sqrt{\\pi}}{n}}
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to cube root of ``x.size``. Can
+        be too conservative for small datasets, but is quite good for
+        large datasets. The standard deviation is not very robust to
+        outliers. Values are very similar to the Freedman-Diaconis
+        estimator in the absence of outliers.
 
     'Rice'
         .. math:: n_h = \\left\\lceil 2n^{1/3} \\right\\rceil
-        The number of bins is only proportional to cube root of a.size.
-        It tends to overestimate the number of bins
-        and it does not take into account data variability.
+        The number of bins is only proportional to cube root of
+        ``a.size``. It tends to overestimate the number of bins and it
+        does not take into account data variability.
 
     'Sturges'
         .. math:: n_h = \\left\\lceil \\log_{2}n+1 \\right\\rceil
-        The number of bins is the base2 log of a.size.
-        This estimator assumes normality of data and is too conservative for larger,
-        non-normal datasets. This is the default method in R's `hist` method.
+        The number of bins is the base 2 log of ``a.size``. This
+        estimator assumes normality of data and is too conservative for
+        larger, non-normal datasets. This is the default method in R's
+        ``hist`` method.
+
+    'Doane'
+        .. math:: n_h = \\left\\lceil 1 + \\log_{2}(n) +
+                        \\log_{2}\\left(1 + \\frac{\\left|g_1\\right|}{\\sigma_{g_1}}\\right)
+                  \\right\\rceil
+
+            g_1 = mean[(\\frac{x - \\mu}{\\sigma})^3]
+
+            \\sigma_{g_1} = \\sqrt{\\frac{6(n - 2)}{(n + 1)(n + 3)}}
+
+        An improved version of Sturges' formula that produces better
+        estimates for non-normal datasets.
+
+    'Sqrt'
+        .. math:: n_h = \\left\\lceil \\sqrt n \\right\\rceil
+        The simplest and fastest estimator. Only takes into account the
+        data size.
 
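The 'Auto' switchover described above can be seen numerically; a rough sketch, assuming a NumPy with string-valued ``bins`` (exact counts depend on the draw):

import numpy as np

rng = np.random.RandomState(1)
for n in (100, 1000, 10000):
    x = rng.normal(size=n)
    n_sturges = len(np.histogram(x, bins='sturges')[0])
    n_fd = len(np.histogram(x, bins='fd')[0])
    # 'auto' is the max of the two; the docstring puts the
    # switchover near n ~ 1000 for typical data
    print(n, n_sturges, n_fd)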
     Examples
     --------
@@ -311,7 +390,8 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
 
     .. versionadded:: 1.11.0
 
-    Automated Bin Selection Methods example, using 2 peak random data with 2000 points
+    Automated Bin Selection Methods example, using 2 peak random data
+    with 2000 points:
 
     >>> import matplotlib.pyplot as plt
     >>> rng = np.random.RandomState(10)  # deterministic random data
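A plausible continuation of this example (two normal peaks totalling 2000 points; illustrative, not necessarily the commit's exact lines):

>>> a = np.hstack((rng.normal(size=1000),
...                rng.normal(loc=5, scale=2, size=1000)))
>>> plt.hist(a, bins='auto')  # plt.hist forwards bins to np.histogram
>>> plt.title("Histogram with 'auto' bins")
>>> plt.show()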