raghavrv
diff --git a/‎sklearn/model_selection/__init__.py
Lines changed: 6 additions & 4 deletions b/‎sklearn/model_selection/__init__.py
Lines changed: 6 additions & 4 deletions
diff --git a/‎sklearn/model_selection/_split.py
Lines changed: 121 additions & 54 deletions b/‎sklearn/model_selection/_split.py
Lines changed: 121 additions & 54 deletions
diff --git a/‎sklearn/model_selection/tests/test_split.py
Lines changed: 46 additions & 2 deletions b/‎sklearn/model_selection/tests/test_split.py
Lines changed: 46 additions & 2 deletions
@@ -7,6 +7,7 @@
 from ._split import LeavePLabelOut
 from ._split import LeavePOut
 from ._split import ShuffleSplit
+from ._split import LabelShuffleSplit
 from ._split import StratifiedShuffleSplit
 from ._split import PredefinedSplit
 from ._split import train_test_split
@@ -27,7 +28,8 @@
 __all__ = ('BaseCrossValidator', 'GridSearchCV', 'KFold', 'LabelKFold',
            'LeaveOneLabelOut', 'LeaveOneOut', 'LeavePLabelOut', 'LeavePOut',
            'ParameterGrid', 'ParameterSampler', 'PredefinedSplit',
-           'RandomizedSearchCV', 'ShuffleSplit', 'StratifiedKFold',
-           'StratifiedShuffleSplit', 'check_cv', 'cross_val_predict',
-           'cross_val_score', 'fit_grid_point', 'learning_curve',
-           'permutation_test_score', 'train_test_split', 'validation_curve')
+           'RandomizedSearchCV', 'ShuffleSplit', 'LabelShuffleSplit',
+           'StratifiedKFold', 'StratifiedShuffleSplit', 'check_cv',
+           'cross_val_predict', 'cross_val_score', 'fit_grid_point',
+           'learning_curve', 'permutation_test_score', 'train_test_split',
+           'validation_curve')
@@ -39,6 +39,7 @@
            'LeavePLabelOut',
            'LeavePOut',
            'ShuffleSplit',
+           'LabelShuffleSplit',
            'StratifiedKFold',
            'StratifiedShuffleSplit',
            'PredefinedSplit',
@@ -825,70 +826,70 @@ def get_n_splits(self, X=None, y=None, labels=None):
         return self.n_iter
 
 
-def _validate_shuffle_split_init(test_size, train_size):
-    if test_size is None and train_size is None:
-        raise ValueError('test_size and train_size can not both be None')
+class LabelShuffleSplit(ShuffleSplit):
+    '''Shuffle-Labels-Out cross-validation iterator
 
-    if test_size is not None:
-        if np.asarray(test_size).dtype.kind == 'f':
-            if test_size >= 1.:
-                raise ValueError(
-                    'test_size=%f should be smaller '
-                    'than 1.0 or be an integer' % test_size)
-        elif np.asarray(test_size).dtype.kind != 'i':
-            # int values are checked during split based on the input
-            raise ValueError("Invalid value for test_size: %r" % test_size)
+    Provides randomized train/test indices to split data according to a
+    third-party provided label. This label information can be used to encode
+    arbitrary domain specific stratifications of the samples as integers.
 
-    if train_size is not None:
-        if np.asarray(train_size).dtype.kind == 'f':
-            if train_size >= 1.:
-                raise ValueError("train_size=%f should be smaller "
-                                 "than 1.0 or be an integer" % train_size)
-            elif ((np.asarray(test_size).dtype.kind == 'f') and
-                    ((train_size + test_size) > 1.)):
-                raise ValueError('The sum of test_size and train_size = %f, '
-                                 'should be smaller than 1.0. Reduce '
-                                 'test_size and/or train_size.' %
-                                 (train_size + test_size))
-        elif np.asarray(train_size).dtype.kind != 'i':
-            # int values are checked during split based on the input
-            raise ValueError("Invalid value for train_size: %r" % train_size)
+    For instance the labels could be the year of collection of the samples
+    and thus allow for cross-validation against time-based splits.
 
+    The difference between LeavePLabelOut and LabelShuffleSplit is that
+    the former generates splits using all subsets of size ``p`` unique labels,
+    whereas LabelShuffleSplit generates a user-determined number of random
+    test splits, each with a user-determined fraction of unique labels.
 
-def _validate_shuffle_split(n, test_size, train_size):
-    if ((test_size is not None) and (np.asarray(test_size).dtype.kind == 'i')
-            and (test_size >= n)):
-        raise ValueError('test_size=%d should be smaller '
-                         'than the number of samples %d' % (test_size, n))
+    For example, a less computationally intensive alternative to
+    ``LeavePLabelOut(p=10)`` would be
+    ``LabelShuffleSplit(test_size=10, n_iter=100)``.
 
-    if ((train_size is not None) and (np.asarray(train_size).dtype.kind == 'i')
-            and (train_size >= n)):
-        raise ValueError("train_size=%d should be smaller "
-                         "than the number of samples %d" % (train_size, n))
+    Note: The parameters ``test_size`` and ``train_size`` refer to labels, and
+    not to samples, as in ShuffleSplit.
 
-    if np.asarray(test_size).dtype.kind == 'f':
-        n_test = ceil(test_size * n)
-    elif np.asarray(test_size).dtype.kind == 'i':
-        n_test = float(test_size)
 
-    if train_size is None:
-        n_train = n - n_test
-    else:
-        if np.asarray(train_size).dtype.kind == 'f':
-            n_train = floor(train_size * n)
-        else:
-            n_train = float(train_size)
+    Parameters
+    ----------
+    n_iter : int (default 5)
+        Number of re-shuffling & splitting iterations.
 
-    if test_size is None:
-        n_test = n - n_train
+    test_size : float (default 0.2), int, or None
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the test split. If
+        int, represents the absolute number of test labels. If None,
+        the value is automatically set to the complement of the train size.
 
-    if n_train + n_test > n:
-        raise ValueError('The sum of train_size and test_size = %d, '
-                         'should be smaller than the number of '
-                         'samples %d. Reduce test_size and/or '
-                         'train_size.' % (n_train + n_test, n))
+    train_size : float, int, or None (default is None)
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the train split. If
+        int, represents the absolute number of train labels. If None,
+        the value is automatically set to the complement of the test size.
 
-    return int(n_train), int(n_test)
+    random_state : int or RandomState
+        Pseudo-random number generator state used for random sampling.
+    '''
+
+    def __init__(self, n_iter=5, test_size=0.2, train_size=None,
+                 random_state=None):
+        super(LabelShuffleSplit, self).__init__(
+            n_iter=n_iter,
+            test_size=test_size,
+            train_size=train_size,
+            random_state=random_state)
+
+
+    def _iter_indices(self, X, y, labels):
+        classes, label_indices = np.unique(labels, return_inverse=True)
+        for label_train, label_test in super(
+                LabelShuffleSplit, self)._iter_indices(X=classes):
+            # these are the indices of classes in the partition
+            # invert them into data indices
+
+            train = np.flatnonzero(np.in1d(label_indices, label_train))
+            test = np.flatnonzero(np.in1d(label_indices, label_test))
+
+            yield train, test
 
 
 class StratifiedShuffleSplit(BaseShuffleSplit):
@@ -1018,6 +1019,72 @@ def get_n_splits(self, X=None, y=None, labels=None):
         return self.n_iter
 
 
+def _validate_shuffle_split_init(test_size, train_size):
+    if test_size is None and train_size is None:
+        raise ValueError('test_size and train_size can not both be None')
+
+    if test_size is not None:
+        if np.asarray(test_size).dtype.kind == 'f':
+            if test_size >= 1.:
+                raise ValueError(
+                    'test_size=%f should be smaller '
+                    'than 1.0 or be an integer' % test_size)
+        elif np.asarray(test_size).dtype.kind != 'i':
+            # int values are checked during split based on the input
+            raise ValueError("Invalid value for test_size: %r" % test_size)
+
+    if train_size is not None:
+        if np.asarray(train_size).dtype.kind == 'f':
+            if train_size >= 1.:
+                raise ValueError("train_size=%f should be smaller "
+                                 "than 1.0 or be an integer" % train_size)
+            elif ((np.asarray(test_size).dtype.kind == 'f') and
+                    ((train_size + test_size) > 1.)):
+                raise ValueError('The sum of test_size and train_size = %f, '
+                                 'should be smaller than 1.0. Reduce '
+                                 'test_size and/or train_size.' %
+                                 (train_size + test_size))
+        elif np.asarray(train_size).dtype.kind != 'i':
+            # int values are checked during split based on the input
+            raise ValueError("Invalid value for train_size: %r" % train_size)
+
+
+def _validate_shuffle_split(n, test_size, train_size):
+    if ((test_size is not None) and (np.asarray(test_size).dtype.kind == 'i')
+            and (test_size >= n)):
+        raise ValueError('test_size=%d should be smaller '
+                         'than the number of samples %d' % (test_size, n))
+
+    if ((train_size is not None) and (np.asarray(train_size).dtype.kind == 'i')
+            and (train_size >= n)):
+        raise ValueError("train_size=%d should be smaller "
+                         "than the number of samples %d" % (train_size, n))
+
+    if np.asarray(test_size).dtype.kind == 'f':
+        n_test = ceil(test_size * n)
+    elif np.asarray(test_size).dtype.kind == 'i':
+        n_test = float(test_size)
+
+    if train_size is None:
+        n_train = n - n_test
+    else:
+        if np.asarray(train_size).dtype.kind == 'f':
+            n_train = floor(train_size * n)
+        else:
+            n_train = float(train_size)
+
+    if test_size is None:
+        n_test = n - n_train
+
+    if n_train + n_test > n:
+        raise ValueError('The sum of train_size and test_size = %d, '
+                         'should be smaller than the number of '
+                         'samples %d. Reduce test_size and/or '
+                         'train_size.' % (n_train + n_test, n))
+
+    return int(n_train), int(n_test)
+
+
 class PredefinedSplit(BaseCrossValidator):
     """Predefined split cross-validator
 
 
@@ -8,6 +8,7 @@
 from itertools import combinations
 
 from sklearn.utils.testing import assert_true
+from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_raises
@@ -30,6 +31,7 @@
 from sklearn.model_selection import LeavePOut
 from sklearn.model_selection import LeavePLabelOut
 from sklearn.model_selection import ShuffleSplit
+from sklearn.model_selection import LabelShuffleSplit
 from sklearn.model_selection import StratifiedShuffleSplit
 from sklearn.model_selection import PredefinedSplit
 from sklearn.model_selection import check_cv
@@ -566,6 +568,48 @@ def test_predefinedsplit_with_kfold_split():
     assert_array_equal(ps_test, kf_test)
 
 
+def test_label_shuffle_split():
+    labels = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
+          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
+          ]
+
+    for l in labels:
+        X = y = np.ones(len(l))
+        n_iter = 6
+        test_size = 1./3
+        slo = LabelShuffleSplit(n_iter, test_size=test_size, random_state=0)
+
+        # Make sure the repr works
+        repr(slo)
+
+        # Test that the length is correct
+        assert_equal(slo.get_n_splits(X, y, labels=l), n_iter)
+
+        l_unique = np.unique(l)
+
+        for train, test in slo.split(X, y, labels=l):
+            # First test: no train label is in the test set and vice versa
+            l_train_unique = np.unique(l[train])
+            l_test_unique = np.unique(l[test])
+            assert_false(np.any(np.in1d(l[train], l_test_unique)))
+            assert_false(np.any(np.in1d(l[test], l_train_unique)))
+
+            # Second test: train and test add up to all the data
+            assert_equal(l[train].size + l[test].size, l.size)
+
+            # Third test: train and test are disjoint
+            assert_array_equal(np.intersect1d(train, test), [])
+
+            # Fourth test:
+            # unique train and test labels are correct, +- 1 for rounding error
+            assert_true(abs(len(l_test_unique) -
+                            round(test_size * len(l_unique))) <= 1)
+            assert_true(abs(len(l_train_unique) -
+                            round((1.0 - test_size) * len(l_unique))) <= 1)
+
+
 def test_leave_label_out_changing_labels():
     # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
     # the labels variable is changed before calling split
@@ -790,7 +834,7 @@ def test_label_kfold():
     ideal_n_labels_per_fold = n_samples // n_folds
 
     len(np.unique(labels))
-    # Get the test fold indices from the test set indices of each fold 
+    # Get the test fold indices from the test set indices of each fold
     folds = np.zeros(n_samples)
     for i, (_, test) in enumerate(LabelKFold(n_folds).split(X, y, labels)):
         folds[test] = i
@@ -827,7 +871,7 @@ def test_label_kfold():
 
     X = y = np.ones(n_samples)
 
-    # Get the test fold indices from the test set indices of each fold 
+    # Get the test fold indices from the test set indices of each fold
     folds = np.zeros(n_samples)
     for i, (_, test) in enumerate(LabelKFold(n_folds).split(X, y, labels)):
         folds[test] = i