[MRG + 2] ShuffleLabelsOut cross-validation iterator by bmcfee · Pull Request #4583 · scikit-learn/scikit-learn · GitHub
[MRG + 2] ShuffleLabelsOut cross-validation iterator #4583


Closed · wants to merge 26 commits
Commits (26)
693f425
Merge pull request #1 from scikit-learn/master
bmcfee Apr 13, 2015
ebfea02
added ShuffleLabelsOut cv iterator
bmcfee Apr 13, 2015
36c9c3d
fixed tests for shufflelabelsout
bmcfee Apr 13, 2015
3493ec7
updated docstring
bmcfee Apr 13, 2015
62e1996
Fixed an error in call to the super constructor
bmcfee Apr 18, 2015
ba7f81e
fixed repr, variable names in ShuffleLabelsOut
bmcfee May 12, 2015
a0a2764
added length and repr tests to ShuffleLabelsOut
bmcfee May 12, 2015
0030d32
added documentation for ShuffleLabelsOut
bmcfee Jun 23, 2015
0845929
Merge branch 'master' of github.com:bmcfee/scikit-learn
bmcfee Aug 13, 2015
5c4bbcd
Merge branch 'master' of github.com:scikit-learn/scikit-learn
bmcfee Aug 13, 2015
13c15fc
updated ShuffleLabelsOut in whats_new
bmcfee Aug 13, 2015
5669892
ShuffleLabelsOut updated docstring for double-backtick
bmcfee Aug 13, 2015
2db0196
Merge branch 'master' of github.com:scikit-learn/scikit-learn
bmcfee Aug 13, 2015
f0c4a75
added ShuffleLabelsOut cv iterator
bmcfee Apr 13, 2015
d668f3b
fixed tests for shufflelabelsout
bmcfee Apr 13, 2015
068cb38
updated docstring
bmcfee Apr 13, 2015
74c2657
Fixed an error in call to the super constructor
bmcfee Apr 18, 2015
41d3704
fixed repr, variable names in ShuffleLabelsOut
bmcfee May 12, 2015
085f381
added length and repr tests to ShuffleLabelsOut
bmcfee May 12, 2015
915144a
added documentation for ShuffleLabelsOut
bmcfee Jun 23, 2015
e2a5ad7
updated ShuffleLabelsOut in whats_new
bmcfee Aug 13, 2015
e2b2a6e
ShuffleLabelsOut updated docstring for double-backtick
bmcfee Aug 13, 2015
0c6f595
rebasing to master
bmcfee Aug 13, 2015
ab74b1d
renamed ShuffleLabelsOut to LabelShuffleSplit
bmcfee Aug 13, 2015
99c66f0
LabelShuffleSplit test checks for proper ratios
bmcfee Aug 13, 2015
98906df
LabelShuffleSplit documentation header fix
bmcfee Aug 13, 2015
34 changes: 34 additions & 0 deletions doc/modules/cross_validation.rst
@@ -430,6 +430,40 @@ validation that allows finer control over the number of iterations and
the proportion of samples on each side of the train / test split.


Label-Shuffle-Split
-------------------

:class:`LabelShuffleSplit`

The :class:`LabelShuffleSplit` iterator behaves as a combination of
:class:`ShuffleSplit` and :class:`LeavePLabelOut`, and generates a
sequence of randomized partitions in which a subset of labels is held
out for each split.

Here is a usage example::

>>> from sklearn.cross_validation import LabelShuffleSplit

>>> labels = [1, 1, 2, 2, 3, 3, 4, 4]
>>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5,
... random_state=0)
>>> for train, test in slo:
... print("%s %s" % (train, test))
...
[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]

This class is useful when the behavior of :class:`LeavePLabelOut` is
desired, but the number of labels is large enough that generating all
possible partitions with :math:`P` labels withheld would be prohibitively
expensive. In such a scenario, :class:`LabelShuffleSplit` provides
a random sample (with replacement) of the train / test splits
generated by :class:`LeavePLabelOut`.
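
The combinatorial blow-up mentioned above can be made concrete with a short sketch (illustrative only, not part of the pull request); the label counts below are arbitrary example values:

```python
# Why sampling beats enumeration: the number of partitions that
# LeavePLabelOut must enumerate grows combinatorially in the number
# of unique labels, while LabelShuffleSplit draws only n_iter splits.
from math import factorial

def n_choose_k(n, k):
    # number of ways to choose k held-out labels from n unique labels
    return factorial(n) // (factorial(k) * factorial(n - k))

# With 50 unique labels and P=10 held out, exhaustive enumeration
# yields over ten billion partitions:
print(n_choose_k(50, 10))  # 10272278170
```

By contrast, ``LabelShuffleSplit(labels, test_size=10, n_iter=100)`` would visit only 100 of these partitions, sampled with replacement.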



Predefined Fold-Splits / Validation-Sets
----------------------------------------

6 changes: 6 additions & 0 deletions doc/whats_new.rst
@@ -33,6 +33,10 @@ New features
function into a ``Pipeline``-compatible transformer object.
By Joe Jevnik.

- :class:`cross_validation.LabelShuffleSplit` generates random train-test splits,
similar to :class:`cross_validation.ShuffleSplit`, except that the splits are
conditioned on a label array. By `Brian McFee`_.

Enhancements
............

@@ -3574,3 +3578,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
.. _Thomas Unterthiner: https://github.com/untom

.. _Loic Esteve: https://github.com/lesteve

.. _Brian McFee: https://bmcfee.github.io
89 changes: 89 additions & 0 deletions sklearn/cross_validation.py
@@ -41,6 +41,7 @@
'StratifiedKFold',
'StratifiedShuffleSplit',
'PredefinedSplit',
'LabelShuffleSplit',
'check_cv',
'cross_val_score',
'cross_val_predict',
@@ -962,6 +963,94 @@ def __len__(self):
return len(self.unique_folds)


class LabelShuffleSplit(ShuffleSplit):
'''Shuffle-Labels-Out cross-validation iterator

Provides randomized train/test indices to split data according to a
third-party provided label. This label information can be used to encode
arbitrary domain specific stratifications of the samples as integers.

For instance, the labels could be the year of collection of the samples
and thus allow for cross-validation against time-based splits.

The difference between LeavePLabelOut and LabelShuffleSplit is that
the former generates splits using all subsets of size ``p`` unique labels,
whereas LabelShuffleSplit generates a user-determined number of random
test splits, each with a user-determined fraction of unique labels.

For example, a less computationally intensive alternative to
``LeavePLabelOut(labels, p=10)`` would be
``LabelShuffleSplit(labels, test_size=10, n_iter=100)``.

Note: The parameters ``test_size`` and ``train_size`` refer to labels,
not to samples as in ShuffleSplit.


Parameters
----------
labels : array, [n_samples]
Labels of samples

n_iter : int (default 5)
Number of re-shuffling & splitting iterations.

test_size : float (default 0.2), int, or None
If float, should be between 0.0 and 1.0 and represent the
proportion of the labels to include in the test split. If
int, represents the absolute number of test labels. If None,
the value is automatically set to the complement of the train size.

train_size : float, int, or None (default is None)
If float, should be between 0.0 and 1.0 and represent the
proportion of the labels to include in the train split. If
int, represents the absolute number of train labels. If None,
the value is automatically set to the complement of the test size.

random_state : int or RandomState
Pseudo-random number generator state used for random sampling.
'''

def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
random_state=None):

classes, label_indices = np.unique(labels, return_inverse=True)

super(LabelShuffleSplit, self).__init__(
len(classes),
n_iter=n_iter,
test_size=test_size,
train_size=train_size,
random_state=random_state)

self.labels = labels
self.classes = classes
self.label_indices = label_indices

def __repr__(self):
return ('%s(labels=%s, n_iter=%d, test_size=%s, '
'random_state=%s)' % (
self.__class__.__name__,
self.labels,
self.n_iter,
str(self.test_size),
self.random_state,
))

def __len__(self):
return self.n_iter

def _iter_indices(self):

for label_train, label_test in super(LabelShuffleSplit, self)._iter_indices():
# these are the indices of classes in the partition
# invert them into data indices

train = np.flatnonzero(np.in1d(self.label_indices, label_train))
test = np.flatnonzero(np.in1d(self.label_indices, label_test))

yield train, test

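
The inversion step in ``_iter_indices`` above can be traced in isolation. The following standalone sketch (not part of the diff) reproduces it with a hand-picked label array and a fixed class partition:

```python
# The parent ShuffleSplit partitions *class* indices (one per unique
# label); np.unique's return_inverse array maps each sample to its
# class slot, so np.in1d inverts the class partition into sample indices.
import numpy as np

labels = np.array([1, 1, 2, 2, 3, 3, 4, 4])
classes, label_indices = np.unique(labels, return_inverse=True)
# classes == [1 2 3 4]; label_indices == [0 0 1 1 2 2 3 3]

label_train = np.array([0, 1])  # classes 1 and 2 assigned to train
label_test = np.array([2, 3])   # classes 3 and 4 assigned to test

train = np.flatnonzero(np.in1d(label_indices, label_train))
test = np.flatnonzero(np.in1d(label_indices, label_test))
print(train, test)  # [0 1 2 3] [4 5 6 7]
```

Because whole classes move between train and test, no label value ever appears on both sides of a split, which is exactly what the tests below verify.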

##############################################################################
def _index_param_value(X, v, indices):
"""Private helper function for parameter value indexing."""
39 changes: 39 additions & 0 deletions sklearn/tests/test_cross_validation.py
@@ -483,6 +483,45 @@ def test_predefinedsplit_with_kfold_split():
assert_array_equal(ps_test, kf_test)


def test_label_shuffle_split():
ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
]

for y in ys:
n_iter = 6
test_size = 1./3
slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size,
random_state=0)

# Make sure the repr works
repr(slo)
Reviewer comment (Member):

At some point this should be in a common test for all cross-validation generators, but this is not the place.


# Test that the length is correct
assert_equal(len(slo), n_iter)

y_unique = np.unique(y)

for train, test in slo:
# First test: no train label is in the test set and vice versa
y_train_unique = np.unique(y[train])
y_test_unique = np.unique(y[test])
assert_false(np.any(np.in1d(y[train], y_test_unique)))
assert_false(np.any(np.in1d(y[test], y_train_unique)))

# Second test: train and test add up to all the data
assert_equal(y[train].size + y[test].size, y.size)
Reviewer comment (Member):

I think you could reuse _check_is_partition and the check would be stronger.
Sorry, I was wrong.


# Third test: train and test are disjoint
assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
Reviewer comment (Member):

Just np.intersect1d


# Fourth test: number of unique train and test labels is correct, +- 1 for rounding error
assert_true(abs(len(y_test_unique) - round(test_size * len(y_unique))) <= 1)
assert_true(abs(len(y_train_unique) - round((1.0 - test_size) * len(y_unique))) <= 1)


def test_leave_label_out_changing_labels():
# Check that LeaveOneLabelOut and LeavePLabelOut work normally if
# the labels variable is changed before calling __iter__