ENH: Add 'ise' estimator to np.histogram · numpy/numpy@219a52c · GitHub

Commit 219a52c

ENH: Add 'ise' estimator to np.histogram

1 parent 7027014 · commit 219a52c

3 files changed, +101 -23 lines changed

doc/release/1.16.0-notes.rst
Lines changed: 7 additions & 0 deletions

@@ -123,6 +123,13 @@ of:
 New Features
 ============

+Integrated squared error (ISE) estimator added to ``histogram``
+---------------------------------------------------------------
+This method (``bins='stone'``) for optimizing the bin number is a generalization
+of Scott's rule. Scott's rule assumes the distribution is approximately Normal,
+while the ISE is a nonparametric method based on cross-validation.
+https://en.wikipedia.org/wiki/Histogram#Minimizing_cross-validation_estimated_squared_error
+
 ``max_rows`` keyword added for ``np.loadtxt``
 ---------------------------------------------
 New keyword ``max_rows`` in `numpy.loadtxt` sets the maximum rows of the
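
As a quick orientation for readers, here is a minimal usage sketch of the new estimator exposed by this commit; the random sample below is illustrative only and not part of the change:

    import numpy as np

    # Illustrative data only: any 1-D sample works.
    rng = np.random.RandomState(42)
    x = rng.normal(loc=0.0, scale=2.0, size=1000)

    # Let the ISE-based ('stone') rule choose the number of bins.
    counts, edges = np.histogram(x, bins='stone')

    # The same selector is available through histogram_bin_edges.
    edges_only = np.histogram_bin_edges(x, bins='stone')
    print(len(counts), len(edges_only) - 1)  # equal bin counts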

numpy/lib/histograms.py
Lines changed: 62 additions & 11 deletions

@@ -21,7 +21,7 @@
 _range = range


-def _hist_bin_sqrt(x):
+def _hist_bin_sqrt(x, range):
     """
     Square root histogram bin estimator.
@@ -38,10 +38,11 @@ def _hist_bin_sqrt(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / np.sqrt(x.size)


-def _hist_bin_sturges(x):
+def _hist_bin_sturges(x, range):
     """
     Sturges histogram bin estimator.
@@ -60,10 +61,11 @@ def _hist_bin_sturges(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (np.log2(x.size) + 1.0)


-def _hist_bin_rice(x):
+def _hist_bin_rice(x, range):
     """
     Rice histogram bin estimator.
@@ -83,10 +85,11 @@ def _hist_bin_rice(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (2.0 * x.size ** (1.0 / 3))


-def _hist_bin_scott(x):
+def _hist_bin_scott(x, range):
     """
     Scott histogram bin estimator.
@@ -104,10 +107,49 @@ def _hist_bin_scott(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)


-def _hist_bin_doane(x):
+def _hist_bin_ise(x, range):
+    """
+    Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
+
+    The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
+    The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
+    https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
+
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
+    range : (float, float)
+        The lower and upper range of the bins.
+
+    Returns
+    -------
+    h : An estimate of the optimal bin width for the given data.
+    """
+
+    n = x.size
+    ptp_x = np.ptp(x)
+    if n <= 1 or ptp_x == 0:
+        return 0
+
+    def jhat(nbins):
+        hh = ptp_x / nbins
+        p_k = np.histogram(x, bins=nbins, range=range)[0] / n
+        return (2 - (n + 1) * p_k.dot(p_k)) / hh
+
+    nbins_upper_bound = max(100, int(np.sqrt(n)))
+    nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
+    if nbins == nbins_upper_bound:
+        warnings.warn("The number of bins estimated may be suboptimal.", RuntimeWarning, stacklevel=2)
+    return ptp_x / nbins
+
+
+def _hist_bin_doane(x, range):
     """
     Doane's histogram bin estimator.
@@ -125,6 +167,7 @@ def _hist_bin_doane(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     if x.size > 2:
         sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
         sigma = np.std(x)
@@ -141,7 +184,7 @@ def _hist_bin_doane(x):
     return 0.0


-def _hist_bin_fd(x):
+def _hist_bin_fd(x, range):
     """
     The Freedman-Diaconis histogram bin estimator.
@@ -166,11 +209,12 @@ def _hist_bin_fd(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     iqr = np.subtract(*np.percentile(x, [75, 25]))
     return 2.0 * iqr * x.size ** (-1.0 / 3.0)


-def _hist_bin_auto(x):
+def _hist_bin_auto(x, range):
     """
     Histogram bin estimator that uses the minimum width of the
     Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
@@ -204,16 +248,18 @@ def _hist_bin_auto(x):
     --------
     _hist_bin_fd, _hist_bin_sturges
     """
-    fd_bw = _hist_bin_fd(x)
-    sturges_bw = _hist_bin_sturges(x)
+    fd_bw = _hist_bin_fd(x, range)
+    sturges_bw = _hist_bin_sturges(x, range)
+    del range  # unused
     if fd_bw:
         return min(fd_bw, sturges_bw)
     else:
         # limited variance, so we return a len dependent bw estimator
         return sturges_bw

 # Private dict initialized at module load time
-_hist_bin_selectors = {'auto': _hist_bin_auto,
+_hist_bin_selectors = {'stone': _hist_bin_ise,
+                       'auto': _hist_bin_auto,
                        'doane': _hist_bin_doane,
                        'fd': _hist_bin_fd,
                        'rice': _hist_bin_rice,
@@ -348,7 +394,7 @@ def _get_bin_edges(a, bins, range, weights):
             n_equal_bins = 1
         else:
             # Do not call selectors on empty arrays
-            width = _hist_bin_selectors[bin_name](a)
+            width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
             if width:
                 n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
             else:
@@ -450,6 +496,11 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
             Less robust estimator that that takes into account data
             variability and data size.

+        'stone'
+            Estimator based on minimizing the estimated integrated
+            squared error (ISE) and can be regarded as a generalization
+            of Scott's rule.
+
         'rice'
             Estimator does not take variability into account, only data
             size. Commonly overestimates number of bins required.
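
For readers puzzling over the ``jhat`` objective above: it appears to match the cross-validation estimate of the integrated squared error given on the Wikipedia page linked in the release note, up to a positive constant. With n data points, bin width h and empirical bin probabilities \hat{p}_k = N_k / n, that estimate is

    \hat{J}(h) = \frac{2}{(n - 1)h} - \frac{n + 1}{(n - 1)h} \sum_k \hat{p}_k^2
               = \frac{2 - (n + 1) \sum_k \hat{p}_k^2}{(n - 1)h}

so ``jhat(nbins)`` equals (n - 1) * \hat{J}(h) with h = ptp_x / nbins. Since n is fixed while the search runs over ``nbins``, the constant factor does not change the minimizer.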

numpy/lib/tests/test_histograms.py
Lines changed: 32 additions & 12 deletions

@@ -431,7 +431,7 @@ class TestHistogramOptimBinNums(object):

     def test_empty(self):
         estimator_list = ['fd', 'scott', 'rice', 'sturges',
-                          'doane', 'sqrt', 'auto']
+                          'doane', 'sqrt', 'auto', 'stone']
         # check it can deal with empty data
         for estimator in estimator_list:
             a, b = histogram([], bins=estimator)
@@ -447,11 +447,11 @@ def test_simple(self):
         # Some basic sanity checking, with some fixed data.
         # Checking for the correct number of bins
         basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7,
-                           'doane': 8, 'sqrt': 8, 'auto': 7},
+                           'doane': 8, 'sqrt': 8, 'auto': 7, 'stone': 2},
                      500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
-                           'doane': 12, 'sqrt': 23, 'auto': 10},
+                           'doane': 12, 'sqrt': 23, 'auto': 10, 'stone': 9},
                     5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
-                           'doane': 17, 'sqrt': 71, 'auto': 17}}
+                           'doane': 17, 'sqrt': 71, 'auto': 17, 'stone': 20}}

         for testlen, expectedResults in basic_test.items():
             # Create some sort of non uniform data to test with
@@ -471,11 +471,11 @@ def test_small(self):
         precalculated.
         """
         small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
-                         'doane': 1, 'sqrt': 1},
+                         'doane': 1, 'sqrt': 1, 'stone': 1},
                      2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
-                         'doane': 1, 'sqrt': 2},
+                         'doane': 1, 'sqrt': 2, 'stone': 1},
                      3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3,
-                         'doane': 3, 'sqrt': 2}}
+                         'doane': 3, 'sqrt': 2, 'stone': 1}}

         for testlen, expectedResults in small_dat.items():
             testdat = np.arange(testlen)
@@ -499,7 +499,7 @@ def test_novariance(self):
         """
         novar_dataset = np.ones(100)
         novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
-                            'doane': 1, 'sqrt': 1, 'auto': 1}
+                            'doane': 1, 'sqrt': 1, 'auto': 1, 'stone': 1}

         for estimator, numbins in novar_resultdict.items():
             a, b = np.histogram(novar_dataset, estimator)
@@ -538,12 +538,32 @@ def test_outlier(self):
         xcenter = np.linspace(-10, 10, 50)
         outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))

-        outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11}
+        outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11, 'stone': 6}

         for estimator, numbins in outlier_resultdict.items():
             a, b = np.histogram(outlier_dataset, estimator)
             assert_equal(len(a), numbins)

+    def test_scott_vs_ise(self):
+        """Verify that Scott's rule and the ISE-based method converge for normally distributed data."""
+
+        def nbins_ratio(seed, size):
+            rng = np.random.RandomState(seed)
+            x = rng.normal(loc=0, scale=2, size=size)
+            a, b = len(np.histogram(x, 'stone')[0]), len(np.histogram(x, 'scott')[0])
+            return a / (a + b)
+
+        ll = [[nbins_ratio(seed, size) for size in np.geomspace(start=10, stop=100, num=4).round().astype(int)]
+              for seed in range(256)]
+
+        # the average difference between the two methods decreases as the dataset size increases.
+        assert_almost_equal(abs(np.mean(ll, axis=0) - 0.5),
+                            [0.1065248,
+                             0.0968844,
+                             0.0331818,
+                             0.0178057],
+                            decimal=3)
+
     def test_simple_range(self):
         """
         Straightforward testing with a mixture of linspace data (for
@@ -555,11 +575,11 @@ def test_simple_range(self):
         # Checking for the correct number of bins
         basic_test = {
                 50: {'fd': 8, 'scott': 8, 'rice': 15,
-                     'sturges': 14, 'auto': 14},
+                     'sturges': 14, 'auto': 14, 'stone': 8},
                500: {'fd': 15, 'scott': 16, 'rice': 32,
-                     'sturges': 20, 'auto': 20},
+                     'sturges': 20, 'auto': 20, 'stone': 80},
              5000: {'fd': 33, 'scott': 33, 'rice': 69,
-                     'sturges': 27, 'auto': 33}
+                     'sturges': 27, 'auto': 33, 'stone': 80}
         }

         for testlen, expectedResults in basic_test.items():
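
As a standalone sanity check mirroring the new ``test_scott_vs_ise`` test, the comparison can be run by hand for a single seed and sample size (the values below are arbitrary, not the ones asserted in the test):

    import numpy as np

    rng = np.random.RandomState(0)               # arbitrary seed
    x = rng.normal(loc=0, scale=2, size=1000)    # arbitrary size

    n_stone = len(np.histogram(x, bins='stone')[0])
    n_scott = len(np.histogram(x, bins='scott')[0])

    # A ratio near 0.5 means the two rules pick a similar number of bins,
    # which is what the test expects as the sample size grows.
    print(n_stone, n_scott, n_stone / (n_stone + n_scott))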
