ENH: Add 'ise' estimator to np.histogram · numpy/numpy@219a52c · GitHub

Commit 219a52c

ENH: Add 'ise' estimator to np.histogram

1 parent 7027014 · commit 219a52c

3 files changed, +101 -23 lines changed

doc/release/1.16.0-notes.rst
Lines changed: 7 additions & 0 deletions

@@ -123,6 +123,13 @@ of:
 New Features
 ============

+Integrated squared error (ISE) estimator added to ``histogram``
+---------------------------------------------------------------
+This method (``bins='stone'``) for optimizing the bin number is a generalization
+of Scott's rule. Scott's rule assumes the distribution is approximately Normal,
+while the ISE is a nonparametric method based on cross-validation.
+https://en.wikipedia.org/wiki/Histogram#Minimizing_cross-validation_estimated_squared_error
+
 ``max_rows`` keyword added for ``np.loadtxt``
 ---------------------------------------------
 New keyword ``max_rows`` in `numpy.loadtxt` sets the maximum rows of the
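
As a quick orientation for readers, here is a minimal usage sketch of the new estimator exposed by this commit; the random sample below is illustrative only and not part of the change:

    import numpy as np

    # Illustrative data only: any 1-D sample works.
    rng = np.random.RandomState(42)
    x = rng.normal(loc=0.0, scale=2.0, size=1000)

    # Let the ISE-based ('stone') rule choose the number of bins.
    counts, edges = np.histogram(x, bins='stone')

    # The same selector is available through histogram_bin_edges.
    edges_only = np.histogram_bin_edges(x, bins='stone')
    print(len(counts), len(edges_only) - 1)  # equal bin counts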

numpy/lib/histograms.py
Lines changed: 62 additions & 11 deletions

@@ -21,7 +21,7 @@
 _range = range


-def _hist_bin_sqrt(x):
+def _hist_bin_sqrt(x, range):
     """
     Square root histogram bin estimator.
@@ -38,10 +38,11 @@ def _hist_bin_sqrt(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / np.sqrt(x.size)


-def _hist_bin_sturges(x):
+def _hist_bin_sturges(x, range):
     """
     Sturges histogram bin estimator.
@@ -60,10 +61,11 @@ def _hist_bin_sturges(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (np.log2(x.size) + 1.0)


-def _hist_bin_rice(x):
+def _hist_bin_rice(x, range):
     """
     Rice histogram bin estimator.
@@ -83,10 +85,11 @@ def _hist_bin_rice(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (2.0 * x.size ** (1.0 / 3))


-def _hist_bin_scott(x):
+def _hist_bin_scott(x, range):
     """
     Scott histogram bin estimator.
@@ -104,10 +107,49 @@ def _hist_bin_scott(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)


-def _hist_bin_doane(x):
+def _hist_bin_ise(x, range):
+    """
+    Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
+
+    The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
+    The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
+    https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
+
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
+    range : (float, float)
+        The lower and upper range of the bins.
+
+    Returns
+    -------
+    h : An estimate of the optimal bin width for the given data.
+    """
+
+    n = x.size
+    ptp_x = np.ptp(x)
+    if n <= 1 or ptp_x == 0:
+        return 0
+
+    def jhat(nbins):
+        hh = ptp_x / nbins
+        p_k = np.histogram(x, bins=nbins, range=range)[0] / n
+        return (2 - (n + 1) * p_k.dot(p_k)) / hh
+
+    nbins_upper_bound = max(100, int(np.sqrt(n)))
+    nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
+    if nbins == nbins_upper_bound:
+        warnings.warn("The number of bins estimated may be suboptimal.", RuntimeWarning, stacklevel=2)
+    return ptp_x / nbins
+
+
+def _hist_bin_doane(x, range):
     """
     Doane's histogram bin estimator.
@@ -125,6 +167,7 @@ def _hist_bin_doane(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     if x.size > 2:
         sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
         sigma = np.std(x)
@@ -141,7 +184,7 @@ def _hist_bin_doane(x):
     return 0.0


-def _hist_bin_fd(x):
+def _hist_bin_fd(x, range):
     """
     The Freedman-Diaconis histogram bin estimator.
@@ -166,11 +209,12 @@ def _hist_bin_fd(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     iqr = np.subtract(*np.percentile(x, [75, 25]))
     return 2.0 * iqr * x.size ** (-1.0 / 3.0)


-def _hist_bin_auto(x):
+def _hist_bin_auto(x, range):
     """
     Histogram bin estimator that uses the minimum width of the
     Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
@@ -204,16 +248,18 @@ def _hist_bin_auto(x):
     --------
     _hist_bin_fd, _hist_bin_sturges
     """
-    fd_bw = _hist_bin_fd(x)
-    sturges_bw = _hist_bin_sturges(x)
+    fd_bw = _hist_bin_fd(x, range)
+    sturges_bw = _hist_bin_sturges(x, range)
+    del range  # unused
     if fd_bw:
         return min(fd_bw, sturges_bw)
     else:
         # limited variance, so we return a len dependent bw estimator
         return sturges_bw

 # Private dict initialized at module load time
-_hist_bin_selectors = {'auto': _hist_bin_auto,
+_hist_bin_selectors = {'stone': _hist_bin_ise,
+                       'auto': _hist_bin_auto,
                        'doane': _hist_bin_doane,
                        'fd': _hist_bin_fd,
                        'rice': _hist_bin_rice,
@@ -348,7 +394,7 @@ def _get_bin_edges(a, bins, range, weights):
             n_equal_bins = 1
         else:
             # Do not call selectors on empty arrays
-            width = _hist_bin_selectors[bin_name](a)
+            width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
             if width:
                 n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
             else:
@@ -450,6 +496,11 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
             Less robust estimator that that takes into account data
             variability and data size.

+        'stone'
+            Estimator based on minimizing the estimated integrated
+            squared error (ISE) and can be regarded as a generalization
+            of Scott's rule.
+
         'rice'
             Estimator does not take variability into account, only data
             size. Commonly overestimates number of bins required.
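
For readers puzzling over the ``jhat`` objective above: it appears to match the cross-validation estimate of the integrated squared error given on the Wikipedia page linked in the release note, up to a positive constant. With n data points, bin width h and empirical bin probabilities \hat{p}_k = N_k / n, that estimate is

    \hat{J}(h) = \frac{2}{(n - 1)h} - \frac{n + 1}{(n - 1)h} \sum_k \hat{p}_k^2
               = \frac{2 - (n + 1) \sum_k \hat{p}_k^2}{(n - 1)h}

so ``jhat(nbins)`` equals (n - 1) * \hat{J}(h) with h = ptp_x / nbins. Since n is fixed while the search runs over ``nbins``, the constant factor does not change the minimizer.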

numpy/lib/tests/test_histograms.py
Lines changed: 32 additions & 12 deletions

@@ -431,7 +431,7 @@ class TestHistogramOptimBinNums(object):

     def test_empty(self):
         estimator_list = ['fd', 'scott', 'rice', 'sturges',
-                          'doane', 'sqrt', 'auto']
+                          'doane', 'sqrt', 'auto', 'stone']
         # check it can deal with empty data
         for estimator in estimator_list:
             a, b = histogram([], bins=estimator)
@@ -447,11 +447,11 @@ def test_simple(self):
         # Some basic sanity checking, with some fixed data.
         # Checking for the correct number of bins
         basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7,
-                           'doane': 8, 'sqrt': 8, 'auto': 7},
+                           'doane': 8, 'sqrt': 8, 'auto': 7, 'stone': 2},
                      500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
-                           'doane': 12, 'sqrt': 23, 'auto': 10},
+                           'doane': 12, 'sqrt': 23, 'auto': 10, 'stone': 9},
                     5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
-                           'doane': 17, 'sqrt': 71, 'auto': 17}}
+                           'doane': 17, 'sqrt': 71, 'auto': 17, 'stone': 20}}

         for testlen, expectedResults in basic_test.items():
             # Create some sort of non uniform data to test with
@@ -471,11 +471,11 @@ def test_small(self):
         precalculated.
         """
         small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
-                         'doane': 1, 'sqrt': 1},
+                         'doane': 1, 'sqrt': 1, 'stone': 1},
                      2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
-                         'doane': 1, 'sqrt': 2},
+                         'doane': 1, 'sqrt': 2, 'stone': 1},
                      3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3,
-                         'doane': 3, 'sqrt': 2}}
+                         'doane': 3, 'sqrt': 2, 'stone': 1}}

         for testlen, expectedResults in small_dat.items():
             testdat = np.arange(testlen)
@@ -499,7 +499,7 @@ def test_novariance(self):
         """
         novar_dataset = np.ones(100)
         novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
-                            'doane': 1, 'sqrt': 1, 'auto': 1}
+                            'doane': 1, 'sqrt': 1, 'auto': 1, 'stone': 1}

         for estimator, numbins in novar_resultdict.items():
             a, b = np.histogram(novar_dataset, estimator)
@@ -538,12 +538,32 @@ def test_outlier(self):
         xcenter = np.linspace(-10, 10, 50)
         outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))

-        outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11}
+        outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11, 'stone': 6}

         for estimator, numbins in outlier_resultdict.items():
             a, b = np.histogram(outlier_dataset, estimator)
             assert_equal(len(a), numbins)

+    def test_scott_vs_ise(self):
+        """Verify that Scott's rule and the ISE-based method converge for normally distributed data."""
+
+        def nbins_ratio(seed, size):
+            rng = np.random.RandomState(seed)
+            x = rng.normal(loc=0, scale=2, size=size)
+            a, b = len(np.histogram(x, 'stone')[0]), len(np.histogram(x, 'scott')[0])
+            return a / (a + b)
+
+        ll = [[nbins_ratio(seed, size) for size in np.geomspace(start=10, stop=100, num=4).round().astype(int)]
+              for seed in range(256)]
+
+        # the average difference between the two methods decreases as the dataset size increases.
+        assert_almost_equal(abs(np.mean(ll, axis=0) - 0.5),
+                            [0.1065248,
+                             0.0968844,
+                             0.0331818,
+                             0.0178057],
+                            decimal=3)
+
     def test_simple_range(self):
         """
         Straightforward testing with a mixture of linspace data (for
@@ -555,11 +575,11 @@ def test_simple_range(self):
         # Checking for the correct number of bins
         basic_test = {
                 50: {'fd': 8, 'scott': 8, 'rice': 15,
-                     'sturges': 14, 'auto': 14},
+                     'sturges': 14, 'auto': 14, 'stone': 8},
                500: {'fd': 15, 'scott': 16, 'rice': 32,
-                     'sturges': 20, 'auto': 20},
+                     'sturges': 20, 'auto': 20, 'stone': 80},
              5000: {'fd': 33, 'scott': 33, 'rice': 69,
-                     'sturges': 27, 'auto': 33}
+                     'sturges': 27, 'auto': 33, 'stone': 80}
         }

         for testlen, expectedResults in basic_test.items():
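
As a standalone sanity check mirroring the new ``test_scott_vs_ise`` test, the comparison can be run by hand for a single seed and sample size (the values below are arbitrary, not the ones asserted in the test):

    import numpy as np

    rng = np.random.RandomState(0)               # arbitrary seed
    x = rng.normal(loc=0, scale=2, size=1000)    # arbitrary size

    n_stone = len(np.histogram(x, bins='stone')[0])
    n_scott = len(np.histogram(x, bins='scott')[0])

    # A ratio near 0.5 means the two rules pick a similar number of bins,
    # which is what the test expects as the sample size grows.
    print(n_stone, n_scott, n_stone / (n_stone + n_scott))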
