scikit-learn
diff --git a/‎sklearn/model_selection/__init__.py
Lines changed: 2 additions & 0 deletions b/‎sklearn/model_selection/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎sklearn/model_selection/_split.py
Lines changed: 162 additions & 2 deletions b/‎sklearn/model_selection/_split.py
Lines changed: 162 additions & 2 deletions
diff --git a/‎sklearn/model_selection/tests/test_split.py
Lines changed: 121 additions & 23 deletions b/‎sklearn/model_selection/tests/test_split.py
Lines changed: 121 additions & 23 deletions
@@ -2,6 +2,7 @@
 from ._split import KFold
 from ._split import LabelKFold
 from ._split import StratifiedKFold
+from ._split import BinnedStratifiedKFold
 from ._split import LeaveOneLabelOut
 from ._split import LeaveOneOut
 from ._split import LeavePLabelOut
@@ -40,6 +41,7 @@
            'RandomizedSearchCV',
            'ShuffleSplit',
            'StratifiedKFold',
+           'BinnedStratifiedKFold',
            'StratifiedShuffleSplit',
            'check_cv',
            'cross_val_predict',
 
@@ -44,6 +44,7 @@
            'ShuffleSplit',
            'LabelShuffleSplit',
            'StratifiedKFold',
+           'BinnedStratifiedKFold',
            'StratifiedShuffleSplit',
            'PredefinedSplit',
            'train_test_split',
@@ -635,6 +636,165 @@ def split(self, X, y, labels=None):
         """
         return super(StratifiedKFold, self).split(X, y, labels)
 
+
+class BinnedStratifiedKFold(_BaseKFold):
+    """Binned Stratified K-Folds cross-validator for continuous targets
+
+    Provides train/test indices to split data in train/test sets.
+
+    This cross-validation object is a variation of KFold for a continuous
+    target ``y``. The samples are ranked by ``y`` and cut into consecutive
+    bins of ``n_folds`` samples; each fold receives at most one sample from
+    every bin, so that all test folds span the range of ``y``.
+
+    Read more in the :ref:`User Guide <cross_validation>`.
+
+    Parameters
+    ----------
+    n_folds : int, default=3
+        Number of folds. Must be at least 2.
+
+    shuffle : boolean, optional
+        Whether to shuffle the order of the folds.
+
+    random_state : None, int or RandomState
+        Pseudo-random number generator state used to assign the samples of
+        the central bins to the folds and, when shuffle=True, to shuffle the
+        folds. If None, use default numpy RNG.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.model_selection import BinnedStratifiedKFold
+    >>> y = np.arange(11.0)
+    >>> bskf = BinnedStratifiedKFold(n_folds=3, random_state=0)
+    >>> sorted(len(test) for train, test in bskf.split(y))
+    [3, 4, 4]
+
+    Notes
+    -----
+    All the folds have size floor(n_samples / n_folds) or
+    floor(n_samples / n_folds) + 1. When n_samples >= 2 * n_folds, the folds
+    that get the extra sample are drawn randomly (even if no shuffling is
+    requested) to balance the variance between folds.
+
+    See also
+    --------
+    StratifiedKFold -- stratified k-fold generator for classification data
+    """
+
+    def __init__(self, n_folds=3, shuffle=False, random_state=None):
+        super(BinnedStratifiedKFold, self).__init__(n_folds, shuffle,
+                                                    random_state)
+
+    def _make_test_folds(self, X, y=None, labels=None):
+        """Return, for every sample, the index of its test fold.
+
+        Raises ValueError if no usable 1d target is available or if there
+        are fewer samples than folds.
+        """
+        if y is None:
+            # Support ``split(y)``: without an explicit y, an array passed
+            # as X is used as the target.
+            invalid_msg = ("no y has been supplied; "
+                           "first argument is not a valid y")
+            if not hasattr(X, "shape"):
+                raise ValueError(invalid_msg)
+            y = X
+        else:
+            invalid_msg = "y should be a 1d array or a column vector"
+        y = np.asarray(y)
+        # Only 1d arrays and arrays whose trailing dimensions all equal 1
+        # (e.g. a column vector) define one target value per sample.
+        if y.ndim == 0 or any(dim != 1 for dim in y.shape[1:]):
+            raise ValueError(invalid_msg)
+        # Flatten so that argsort ranks the samples rather than sorting
+        # within each row.
+        y = y.reshape(-1)
+
+        n_samples = len(y)
+        n_folds = self.n_folds
+        bin_count = n_samples // n_folds
+        # Kept as attributes for backward compatibility.
+        self.n_samples = n_samples
+        self.n_classes = bin_count + int(n_samples % n_folds != 0)
+
+        # Created up front: both the middle-bin assignment and the optional
+        # fold shuffling need it, whichever branch is taken.
+        rng = check_random_state(self.random_state)
+        # Sample indices ordered by increasing y.
+        sorted_inds = np.argsort(y)
+
+        if bin_count > 1:
+            n_boundary = n_folds * (bin_count // 2)
+            # Rows of `n_folds` consecutive ranks; column j goes to fold j.
+            lower_bins = sorted_inds[:n_boundary].reshape(-1, n_folds)
+            upper_bins = sorted_inds[-n_boundary:].reshape(-1, n_folds)
+            # The remaining central samples do not fill whole rows: pad
+            # with -1 (dropped again below, so a padding-only row is
+            # harmless).
+            middle = sorted_inds[n_boundary:-n_boundary]
+            padding = -np.ones(n_folds - len(middle) % n_folds, dtype=int)
+            middle_bins = np.hstack([middle, padding]).reshape(-1, n_folds)
+            # Permute the columns in place so that the folds receiving the
+            # extra samples are picked at random.
+            rng.shuffle(middle_bins.T)
+            test_masks = np.vstack([lower_bins, middle_bins, upper_bins]).T
+            # TODO: middle class rebalancing
+        elif bin_count == 1:
+            # Fewer than 2 * n_folds samples: the n_folds lowest values give
+            # one sample to each fold, the rest go to the first folds.
+            lower_bins = sorted_inds[:n_folds].reshape(-1, n_folds)
+            upper = sorted_inds[n_folds:]
+            padding = -np.ones(n_folds - len(upper) % n_folds, dtype=int)
+            test_masks = np.vstack([lower_bins,
+                                    np.hstack([upper, padding])]).T
+        else:
+            raise ValueError(
+                "Cannot have number of folds n_folds=%d greater than the "
+                "number of samples: %d." % (n_folds, n_samples))
+
+        if self.shuffle:
+            # Shuffle the order of the folds (rows of `test_masks`).
+            rng.shuffle(test_masks)
+        # Drop the -1 padding; stored on the instance for backward
+        # compatibility.
+        self._test_masks = [fold[fold != -1] for fold in test_masks]
+
+        test_folds = np.empty(n_samples, dtype=int)
+        for fold_idx, fold_inds in enumerate(self._test_masks):
+            test_folds[fold_inds] = fold_idx
+        return test_folds
+
+    def _iter_test_masks(self, X, y=None, labels=None):
+        # One boolean mask per fold, selecting the samples of that fold.
+        test_folds = self._make_test_folds(X, y, labels)
+        for i in range(self.n_folds):
+            yield test_folds == i
+
+    def split(self, X, y=None, labels=None):
+        """Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training data, where n_samples is the number of samples
+            and n_features is the number of features. If y is None, X
+            itself is used as the target and must then be an array that
+            is 1d or has a single column.
+
+        y : array-like, shape (n_samples,)
+            The continuous target variable used to build the bins.
+
+        labels : array-like, with shape (n_samples,), optional
+            Group labels for the samples; not used to build the folds.
+
+        Returns
+        -------
+        train : ndarray
+            The training set indices for that split.
+
+        test : ndarray
+            The testing set indices for that split.
+        """
+        return super(BinnedStratifiedKFold, self).split(X, y, labels)
+
+
 class LeaveOneLabelOut(BaseCrossValidator):
     """Leave One Label Out cross-validator
 
@@ -1193,8 +1353,8 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
     Validation helper to check if the test/test sizes are meaningful wrt to the
     size of the data (n_samples)
     """
-    if (test_size is not None and np.asarray(test_size).dtype.kind == 'i'
-            and test_size >= n_samples):
+    if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and
+            test_size >= n_samples):
         raise ValueError('test_size=%d should be smaller than the number of '
                          'samples %d' % (test_size, n_samples))
 
 
@@ -29,6 +29,7 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.model_selection import KFold
 from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import BinnedStratifiedKFold
 from sklearn.model_selection import LabelKFold
 from sklearn.model_selection import LeaveOneOut
 from sklearn.model_selection import LeaveOneLabelOut
@@ -140,34 +141,27 @@ def test_cross_validator_with_default_params():
     X_1d = np.array([1, 2, 3, 4])
     y = np.array([1, 1, 2, 2])
     labels = np.array([1, 2, 3, 4])
-    loo = LeaveOneOut()
-    lpo = LeavePOut(p)
-    kf = KFold(n_folds)
-    skf = StratifiedKFold(n_folds)
-    lolo = LeaveOneLabelOut()
-    lopo = LeavePLabelOut(p)
-    ss = ShuffleSplit(random_state=0)
-    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = np of unique folds = 2
-
-    loo_repr = "LeaveOneOut()"
-    lpo_repr = "LeavePOut(p=2)"
-    kf_repr = "KFold(n_folds=2, random_state=None, shuffle=False)"
-    skf_repr = "StratifiedKFold(n_folds=2, random_state=None, shuffle=False)"
-    lolo_repr = "LeaveOneLabelOut()"
-    lopo_repr = "LeavePLabelOut(n_labels=2)"
-    ss_repr = ("ShuffleSplit(n_iter=10, random_state=0, test_size=0.1, "
-               "train_size=None)")
-    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"
+    cvs = [
+        (LeaveOneOut(), "LeaveOneOut()", n_samples),
+        (LeavePOut(p), "LeavePOut(p=%u)" % p, comb(n_samples, p) ),
+        (KFold(n_folds), "KFold(n_folds=2, random_state=None, shuffle=False)", n_folds),
+        (StratifiedKFold(n_folds), ("StratifiedKFold(n_folds=2, "
+                    "random_state=None, shuffle=False)"), n_folds),
+        (LeaveOneLabelOut(), "LeaveOneLabelOut()", n_unique_labels),
+        (LeavePLabelOut(p), "LeavePLabelOut(n_labels=%u)" % p, comb(n_unique_labels, p) ),
+        (ShuffleSplit(random_state=0), ("ShuffleSplit(n_iter=10, random_state=0, test_size=0.1, "
+                       "train_size=None)"), n_iter),
+        (PredefinedSplit([1, 1, 2, 2]), "PredefinedSplit(test_fold=array([1, 1, 2, 2]))", 2),
+          ]
+    # n_splits = np of unique folds = 2
 
     n_splits = [n_samples, comb(n_samples, p), n_folds, n_folds,
                 n_unique_labels, comb(n_unique_labels, p), n_iter, 2]
 
-    for i, (cv, cv_repr) in enumerate(zip(
-            [loo, lpo, kf, skf, lolo, lopo, ss, ps],
-            [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
-             ss_repr, ps_repr])):
+    for i, (cv, cv_repr, n_splits_ ) in enumerate(cvs):
+        print( cv, cv_repr, n_splits_ )
         # Test if get_n_splits works correctly
-        assert_equal(n_splits[i], cv.get_n_splits(X, y, labels))
+        assert_equal(n_splits_, cv.get_n_splits(X, y, labels))
 
         # Test if the cross-validator works as expected even if
         # the data is 1d
@@ -379,6 +373,110 @@ def test_stratifiedkfold_balance():
             assert_equal(np.sum(sizes), i)
 
 
+def test_binnedstratifiedkfold_balance():
+    # Check that the test fold sizes differ by at most one sample and add
+    # up to the number of samples, for several sample and fold counts.
+    rng = np.random.RandomState(0)  # seeded so failures are reproducible
+    for n_samples in range(11, 17):
+        n_folds = 2 + int(10 * rng.rand())  # between 2 and 11 folds
+        y = rng.randn(n_samples)
+
+        cv = BinnedStratifiedKFold(n_folds, shuffle=False,
+                                   random_state=None)
+        # The 1d target is passed alone: it is then used as `y`.
+        sizes = [len(test_index) for _, test_index in cv.split(y)]
+
+        assert_true((np.max(sizes) - np.min(sizes)) <= 1)
+        assert_equal(np.sum(sizes), n_samples)
+
+
+def test_binnedstratifiedkfold_bin_spacing():
+    """check if the binned `y` falls into bins of equal size (+/- 1)"""
+    rng = np.random.RandomState(0)  # seeded so failures are reproducible
+    for _ in range(10):
+        n_folds = 2 + int(10 * rng.rand())
+        y = rng.randn(30)
+
+        cv = BinnedStratifiedKFold(n_folds=n_folds, shuffle=False,
+                                   random_state=None)
+        # NOTE(review): these edges are the 0th..(n_folds - 1)th
+        # percentiles, i.e. they only span the lowest few percent of `y`;
+        # quantile edges over the whole range were probably intended --
+        # confirm before tightening this test.
+        bins = np.array([np.percentile(y, q) for q in range(n_folds)])
+
+        for train_index, test_index in cv.split(y):
+            hist_test, _ = np.histogram(y[test_index], bins=bins)
+            assert_true(all(abs(hist_test - np.mean(hist_test)) <= 1),
+                        msg="y_test falls into bins of too ragged sizes")
+
+            # The train histogram must be built from the train samples
+            # (it was previously computed from the test samples again).
+            hist_train, _ = np.histogram(y[train_index], bins=bins)
+            assert_true(all(abs(hist_train - np.mean(hist_train)) <= 1),
+                        msg="y_train falls into bins of too ragged sizes")
+
+
+def test_binnedstratifiedkfold_has_more_stable_distribution_moments_between_folds():
+    """Compare BinnedStratifiedKFold with a shuffled KFold over 100 trials.
+
+    In more than half of the trials the binned splitter is expected to give
+    a lower between-fold spread of both mean(y_test) and std(y_test).
+    """
+    std_wins = []
+    mean_wins = []
+
+    for trial in range(100):
+        n_folds = 2 + int(10 * np.random.rand())
+        y = np.random.randn(30)
+        np.random.shuffle(y)
+
+        # Both split generators are created before either is consumed.
+        binned_cv = BinnedStratifiedKFold(n_folds=n_folds, shuffle=False,
+                                          random_state=None)
+        binned_splits = binned_cv.split(y)
+        plain_cv = KFold(n_folds=n_folds, shuffle=True, random_state=None)
+        plain_splits = plain_cv.split(y)
+
+        bins = np.array([np.percentile(y, q) for q in range(n_folds)])
+
+        # Per-fold moments for the binned splitter, checking on the way
+        # that every test fold is spread evenly over the bins.
+        binned_means = []
+        binned_stds = []
+        for train_index, test_index in binned_splits:
+            y_test = y[test_index]
+            binned_means.append(y_test.mean())
+            binned_stds.append(y_test.std())
+            counts, edges = np.histogram(y_test, bins=bins)
+            assert_true(all(abs(counts - np.mean(counts)) <= 1),
+                        msg="too ragged bins")
+
+        # Per-fold moments for the plain shuffled KFold.
+        plain_tests = [y[test_index] for train_index, test_index
+                       in plain_splits]
+        plain_means = [fold.mean() for fold in plain_tests]
+        plain_stds = [fold.std() for fold in plain_tests]
+
+        std_wins.append(np.std(binned_stds) < np.std(plain_stds))
+        mean_wins.append(np.std(binned_means) < np.std(plain_means))
+
+    std_fraction = np.mean(std_wins)
+    mean_fraction = np.mean(mean_wins)
+
+    assert_greater(std_fraction, 0.5)
+    assert_greater(mean_fraction, 0.5)
+    print(" std(y_test) of BinnedStratifiedKFold was more stable than "
+          "one of KFold in\t%.2f%% cases" % (100.0 * std_fraction))
+    print("mean(y_test) of BinnedStratifiedKFold was more stable than "
+          "one of KFold in\t%.2f%% cases" % (100.0 * mean_fraction))
+
+
 def test_shuffle_kfold():
     # Check the indices are shuffled properly
     kf = KFold(3)