10000 Added 'doane' and 'sqrt' estimators to np.histogram in numpy.function… · numpy/numpy@b8b5561 · GitHub
[go: up one dir, main page]

Skip to content

Commit b8b5561

Browse files
committed
Added 'doane' and 'sqrt' estimators to np.histogram in numpy.function_base
1 parent 47b6c2b commit b8b5561

File tree

4 files changed

+209
-118
lines changed

4 files changed

+209
-118
lines changed

doc/release/1.12.0-notes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,9 @@ Instead of using ``usecol=(n,)`` to read the nth column of a file
9191
it is now allowed to use ``usecol=n``. Also the error message is
9292
more user friendly when a non-integer is passed as a column index.
9393

94+
Additional estimators for ``histogram``
95+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
96+
Added 'doane' and 'sqrt' estimators to ``histogram`` via the ``bins`` argument.
9497

9598
Changes
9699
=======

numpy/lib/function_base.py

Lines changed: 163 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -78,56 +78,100 @@ def iterable(y):
7878

7979
def _hist_optim_numbins_estimator(a, estimator):
8080
"""
81-
A helper function to be called from histogram to deal with estimating optimal number of bins
81+
A helper function to be called from ``histogram`` to deal with
82+
estimating optimal number of bins.
83+
84+
A description of the estimators can be found at
85+
https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
8286
8387
estimator: str
84-
If estimator is one of ['auto', 'fd', 'scott', 'rice', 'sturges'] this function
85-
will choose the appropriate estimator and return it's estimate for the optimal
86-
number of bins.
88+
If ``estimator`` is one of ['auto', 'fd', 'scott', 'doane',
89+
'rice', 'sturges', 'sqrt'], this function will choose the
90+
appropriate estimation method and return the optimal number of
91+
bins it calculates.
8792
"""
8893
if a.size == 0:
8994
return 1
9095

96+
def sqrt(x):
97+
"""
98+
Square Root Estimator
99+
100+
Used by many programs for its simplicity.
101+
"""
102+
return np.ceil(np.sqrt(x.size))
103+
91104
def sturges(x):
92105
"""
93106
Sturges Estimator
94-
A very simplistic estimator based on the assumption of normality of the data
95-
Poor performance for non-normal data, especially obvious for large X.
96-
Depends only on size of the data.
107+
108+
A very simplistic estimator based on the assumption of normality
109+
of the data. Poor performance for non-normal data, especially
110+
obvious for large ``x``. Depends only on size of the data.
97111
"""
98112
return np.ceil(np.log2(x.size)) + 1
99113

100114
def rice(x):
101115
"""
102116
Rice Estimator
103-
Another simple estimator, with no normality assumption.
104-
It has better performance for large data, but tends to overestimate number of bins.
105-
The number of bins is proportional to the cube root of data size (asymptotically optimal)
106-
Depends only on size of the data
117+
118+
Another simple estimator, with no normality assumption. It has
119+
better performance for large data, but tends to overestimate
120+
number of bins. The number of bins is proportional to the cube
121+
root of data size (asymptotically optimal). Depends only on size
122+
of the data.
107123
"""
108124
return np.ceil(2 * x.size ** (1.0 / 3))
109125

110126
def scott(x):
111127
"""
112128
Scott Estimator
113-
The binwidth is proportional to the standard deviation of the data and
114-
inversely proportional to the cube root of data size (asymptotically optimal)
115129
130+
The binwidth is proportional to the standard deviation of the
131+
data and inversely proportional to the cube root of data size
132+
(asymptotically optimal).
116133
"""
117-
h = 3.5 * x.std() * x.size ** (-1.0 / 3)
134+
h = (24 * np.pi**0.5 / x.size)**(1.0 / 3) * np.std(x)
118135
if h > 0:
119136
return np.ceil(x.ptp() / h)
120137
return 1
121138

139+
def doane(x):
140+
"""
141+
Doane's Estimator
142+
143+
Improved version of Sturges' formula which works better for
144+
non-normal data. See
145+
http://stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
146+
"""
147+
if x.size > 2:
148+
sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
149+
sigma = np.std(x)
150+
if sigma > 0:
151+
# These three operations add up to
152+
# g1 = np.mean(((x - np.mean(x)) / sigma)**3)
153+
# but use only one temp array instead of three
154+
temp = x - np.mean(x)
155+
np.true_divide(temp, sigma, temp)
156+
np.power(temp, 3, temp)
157+
g1 = np.mean(temp)
158+
return np.ceil(1.0 + np.log2(x.size) +
159+
np.log2(1.0 + np.absolute(g1) / sg1))
160+
return 1
161+
122162
def fd(x):
123163
"""
124-
Freedman Diaconis rule using interquartile range (IQR) for binwidth
125-
Considered a variation of the Scott rule with more robustness as the IQR
126-
is less affected by outliers than the standard deviation. However the IQR depends on
127-
fewer points than the sd so it is less accurate, especially for long tailed distributions.
164+
Freedman Diaconis Estimator
128165
129-
If the IQR is 0, we return 1 for the number of bins.
130-
Binwidth is inversely proportional to the cube root of data size (asymptotically optimal)
166+
The interquartile range (IQR) is used for binwidth, making this
167+
variation of the Scott rule more robust, as the IQR is less
168+
affected by outliers than the standard deviation. However, the
169+
IQR depends on fewer points than the standard deviation, so it
170+
is less accurate, especially for long tailed distributions.
171+
172+
If the IQR is 0, we return 1 for the number of bins. Binwidth is
173+
inversely proportional to the cube root of data size
174+
(asymptotically optimal).
131175
"""
132176
iqr = np.subtract(*np.percentile(x, [75, 25]))
133177

@@ -140,14 +184,15 @@ def fd(x):
140184

141185
def auto(x):
142186
"""
143-
The FD estimator is usually the most robust method, but it tends to be too small
144-
for small X. The Sturges estimator is quite good for small (<1000) datasets and is
145-
the default in R.
146-
This method gives good off the shelf behaviour.
187+
The FD estimator is usually the most robust method, but it tends
188+
to be too small for small ``x``. The Sturges estimator is quite
189+
good for small (<1000) datasets and is the default in R. This
190+
method gives good off-the-shelf behaviour.
147191
"""
148192
return max(fd(x), sturges(x))
149193

150-
optimal_numbins_methods = {'sturges': sturges, 'rice': rice, 'scott': scott,
194+
optimal_numbins_methods = {'sqrt': sqrt, 'sturges': sturges,
195+
'rice': rice, 'scott': scott, 'doane': doane,
151196
'fd': fd, 'auto': auto}
152197
try:
153198
estimator_func = optimal_numbins_methods[estimator.lower()]
@@ -169,66 +214,79 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
169214
Input data. The histogram is computed over the flattened array.
170215
bins : int or sequence of scalars or str, optional
171216
If `bins` is an int, it defines the number of equal-width
172-
bins in the given range (10, by default). If `bins` is a sequence,
173-
it defines the bin edges, including the rightmost edge, allowing
174-
for non-uniform bin widths.
217+
bins in the given range (10, by default). If `bins` is a
218+
sequence, it defines the bin edges, including the rightmost
219+
edge, allowing for non-uniform bin widths.
175220
176221
.. versionadded:: 1.11.0
177222
178-
If `bins` is a string from the list below, `histogram` will use the method
179-
chosen to calculate the optimal number of bins (see Notes for more detail
180-
on the estimators). For visualisation, we suggest using the 'auto' option.
223+
If `bins` is a string from the list below, `histogram` will use
224+
the method chosen to calculate the optimal number of bins (see
225+
Notes for more detail on the estimators). For visualisation, we
226+
suggest using the 'auto' option.
181227
182228
'auto'
183-
Maximum of the 'sturges' and 'fd' estimators. Provides good all round performance
229+
Maximum of the 'sturges' and 'fd' estimators. Provides good
230+
all round performance
184231
185232
'fd' (Freedman Diaconis Estimator)
186-
Robust (resilient to outliers) estimator that takes into account data
187-
variability and data size .
233+
Robust (resilient to outliers) estimator that takes into
234+
account data variability and data size.
235+
236+
'doane'
237+
An improved version of Sturges' estimator that works better
238+
with non-normal datasets.
188239
189240
'scott'
190241
Less robust estimator that takes into account data
191242
variability and data size.
192243
193244
'rice'
194-
Estimator does not take variability into account, only data size.
195-
Commonly overestimates number of bins required.
245+
Estimator does not take variability into account, only data
246+
size. Commonly overestimates number of bins required.
196247
197248
'sturges'
198-
R's default method, only accounts for data size. Only optimal for
199-
gaussian data and underestimates number of bins for large non-gaussian datasets.
249+
R's default method, only accounts for data size. Only
250+
optimal for gaussian data and underestimates number of bins
251+
for large non-gaussian datasets.
252+
253+
'sqrt'
254+
Square root (of data size) estimator, used by Excel and
255+
other programs for its speed and simplicity.
200256
201257
range : (float, float), optional
202258
The lower and upper range of the bins. If not provided, range
203259
is simply ``(a.min(), a.max())``. Values outside the range are
204260
ignored.
205261
normed : bool, optional
206262
This keyword is deprecated in Numpy 1.6 due to confusing/buggy
207-
behavior. It will be removed in Numpy 2.0. Use the density keyword
208-
instead.
209-
If False, the result will contain the number of samples
210-
in each bin. If True, the result is the value of the
211-
probability *density* function at the bin, normalized such that
212-
the *integral* over the range is 1. Note that this latter behavior is
213-
known to be buggy with unequal bin widths; use `density` instead.
263+
behavior. It will be removed in Numpy 2.0. Use the ``density``
264+
keyword instead. If ``False``, the result will contain the
265+
number of samples in each bin. If ``True``, the result is the
266+
value of the probability *density* function at the bin,
267+
normalized such that the *integral* over the range is 1. Note
268+
that this latter behavior is known to be buggy with unequal bin
269+
widths; use ``density`` instead.
214270
weights : array_like, optional
215-
An array of weights, of the same shape as `a`. Each value in `a`
216-
only contributes its associated weight towards the bin count
217-
(instead of 1). If `normed` is True, the weights are normalized,
218-
so that the integral of the density over the range remains 1
271+
An array of weights, of the same shape as `a`. Each value in
272+
`a` only contributes its associated weight towards the bin count
273+
(instead of 1). If `density` is True, the weights are
274+
normalized, so that the integral of the density over the range
275+
remains 1.
219276
density : bool, optional
220-
If False, the result will contain the number of samples
221-
in each bin. If True, the result is the value of the
277+
If ``False``, the result will contain the number of samples in
278+
each bin. If ``True``, the result is the value of the
222279
probability *density* function at the bin, normalized such that
223280
the *integral* over the range is 1. Note that the sum of the
224281
histogram values will not be equal to 1 unless bins of unity
225282
width are chosen; it is not a probability *mass* function.
226-
Overrides the `normed` keyword if given.
283+
284+
Overrides the ``normed`` keyword if given.
227285
228286
Returns
229287
-------
230288
hist : array
231-
The values of the histogram. See `normed` and `weights` for a
289+
The values of the histogram. See `density` and `weights` for a
232290
description of the possible semantics.
233291
bin_edges : array of dtype float
234292
Return the bin edges ``(length(hist)+1)``.
@@ -240,56 +298,77 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
240298
241299
Notes
242300
-----
243-
All but the last (righthand-most) bin is half-open. In other words, if
244-
`bins` is::
301+
All but the last (righthand-most) bin is half-open. In other words,
302+
if `bins` is::
245303
246304
[1, 2, 3, 4]
247305
248-
then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the
249-
second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes*
250-
4.
306+
then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
307+
the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which
308+
*includes* 4.
251309
252310
.. versionadded:: 1.11.0
253311
254-
The methods to estimate the optimal number of bins are well found in literature,
255-
and are inspired by the choices R provides for histogram visualisation.
256-
Note that having the number of bins proportional to :math:`n^{1/3}` is asymptotically optimal,
257-
which is why it appears in most estimators.
258-
These are simply plug-in methods that give good starting points for number of bins.
259-
In the equations below, :math:`h` is the binwidth and :math:`n_h` is the number of bins
312+
The methods to estimate the optimal number of bins are well found in
313+
literature, and are inspired by the choices R provides for histogram
314+
visualisation. Note that having the number of bins proportional to
315+
:math:`n^{1/3}` is asymptotically optimal, which is why it appears
316+
in most estimators. These are simply plug-in methods that give good
317+
starting points for number of bins. In the equations below,
318+
:math:`h` is the binwidth and :math:`n_h` is the number of bins.
260319
261320
'Auto' (maximum of the 'Sturges' and 'FD' estimators)
262-
A compromise to get a good value. For small datasets the sturges
263-
value will usually be chosen, while larger datasets will usually default to FD.
264-
Avoids the overly conservative behaviour of FD and Sturges for small and
265-
large datasets respectively. Switchover point is usually x.size~1000.
321+
A compromise to get a good value. For small datasets the Sturges
322+
value will usually be chosen, while larger datasets will usually
323+
default to FD. Avoids the overly conservative behaviour of FD
324+
and Sturges for small and large datasets respectively.
325+
Switchover point usually happens when ``x.size`` is around 1000.
266326
267327
'FD' (Freedman Diaconis Estimator)
268328
.. math:: h = 2 \\frac{IQR}{n^{1/3}}
269329
The binwidth is proportional to the interquartile range (IQR)
270330
and inversely proportional to cube root of a.size. Can be too
271-
conservative for small datasets, but is quite good
272-
for large datasets. The IQR is very robust to outliers.
331+
conservative for small datasets, but is quite good for large
332+
datasets. The IQR is very robust to outliers.
273333
274334
'Scott'
275-
.. math:: h = \\frac{3.5\\sigma}{n^{1/3}}
276-
The binwidth is proportional to the standard deviation (sd) of the data
277-
and inversely proportional to cube root of a.size. Can be too
278-
conservative for small datasets, but is quite good
279-
for large datasets. The sd is not very robust to outliers. Values
280-
are very similar to the Freedman Diaconis Estimator in the absence of outliers.
335+
.. math:: h = \\sigma \\sqrt[3]{\\frac{24\\sqrt{\\pi}}{n}}
336+
The binwidth is proportional to the standard deviation of the
337+
data and inversely proportional to cube root of ``x.size``. Can
338+
be too conservative for small datasets, but is quite good for
339+
large datasets. The standard deviation is not very robust to
340+
outliers. Values are very similar to the Freedman-Diaconis
341+
estimator in the absence of outliers.
281342
282343
'Rice'
283344
.. math:: n_h = \\left\\lceil 2n^{1/3} \\right\\rceil
284-
The number of bins is only proportional to cube root of a.size.
285-
It tends to overestimate the number of bins
286-
and it does not take into account data variability.
345+
The number of bins is only proportional to cube root of
346+
``a.size``. It tends to overestimate the number of bins and it
347+
does not take into account data variability.
287348
288349
'Sturges'
289350
.. math:: n_h = \\left\\lceil \\log _{2}n+1 \\right\\rceil
290-
The number of bins is the base2 log of a.size.
291-
This estimator assumes normality of data and is too conservative for larger,
292-
non-normal datasets. This is the default method in R's `hist` method.
351+
The number of bins is the base 2 log of ``a.size``. This
352+
estimator assumes normality of data and is too conservative for
353+
larger, non-normal datasets. This is the default method in R's
354+
``hist`` method.
355+
356+
'Doane'
357+
.. math:: n_h = \\left\\lceil 1 + \\log_{2}(n) +
358+
\\log_{2}(1 + \\frac{|g_1|}{\\sigma_{g_1}})
359+
\\right\\rceil
360+
361+
g_1 = mean[(\\frac{x - \\mu}{\\sigma})^3]
362+
363+
\\sigma_{g_1} = \\sqrt{\\frac{6(n - 2)}{(n + 1)(n + 3)}}
364+
365+
An improved version of Sturges' formula that produces better
366+
estimates for non-normal datasets.
367+
368+
'Sqrt'
369+
.. math:: n_h = \\left\\lceil \\sqrt n \\right\\rceil
370+
The simplest and fastest estimator. Only takes into account the
371+
data size.
293372
294373
Examples
295374
--------
@@ -311,7 +390,8 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
311390
312391
.. versionadded:: 1.11.0
313392
314-
Automated Bin Selection Methods example, using 2 peak random data with 2000 points
393+
Automated Bin Selection Methods example, using 2 peak random data
394+
with 2000 points:
315395
316396
>>> import matplotlib.pyplot as plt
317397
>>> rng = np.random.RandomState(10) # deterministic random data

numpy/lib/nanfunctions.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -919,21 +919,15 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
919919
* nearest: ``i`` or ``j``, whichever is nearest.
920920
* midpoint: ``(i + j) / 2``.
921921
keepdims : bool, optional
922-
<<<<<<< 35b5f5be1ffffada84c8be207e7b8b196a58f786
923922
If this is set to True, the axes which are reduced are left in
924923
the result as dimensions with size one. With this option, the
925924
result will broadcast correctly against the original array `a`.
926-
=======
927-
If this is set to True, the axes which are reduced are left
928-
in the result as dimensions with size one. With this option,
929-
the result will broadcast correctly against the original `a`.
930925
931926
If this is anything but the default value it will be passed
932927
through (in the special case of an empty array) to the
933928
`mean` function of the underlying array. If the array is
934929
a sub-class and `mean` does not have the kwarg `keepdims` this
935930
will raise a RuntimeError.
936-
>>>>>>> BUG: many functions silently drop `keepdims` kwarg
937931
938932
Returns
939933
-------

0 commit comments

Comments
 (0)
0