diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 9c25cfb417fd4..53afdf53550b1 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -430,6 +430,40 @@
 validation that allows a finer control on the number of iterations and the
 proportion of samples on each side of the train / test split.
 
+Label-Shuffle-Split
+-------------------
+
+:class:`LabelShuffleSplit`
+
+The :class:`LabelShuffleSplit` iterator behaves as a combination of
+:class:`ShuffleSplit` and :class:`LeavePLabelOut`, and generates a
+sequence of randomized partitions in which a subset of labels is held
+out for each split.
+
+Here is a usage example::
+
+    >>> from sklearn.cross_validation import LabelShuffleSplit
+
+    >>> labels = [1, 1, 2, 2, 3, 3, 4, 4]
+    >>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5,
+    ...                         random_state=0)
+    >>> for train, test in slo:
+    ...     print("%s %s" % (train, test))
+    ...
+    [0 1 2 3] [4 5 6 7]
+    [2 3 6 7] [0 1 4 5]
+    [2 3 4 5] [0 1 6 7]
+    [4 5 6 7] [0 1 2 3]
+
+This class is useful when the behavior of :class:`LeavePLabelOut` is
+desired, but the number of labels is large enough that generating all
+possible partitions with :math:`P` labels withheld would be prohibitively
+expensive. In such a scenario, :class:`LabelShuffleSplit` provides
+a random sample (with replacement) of the train / test splits
+generated by :class:`LeavePLabelOut`.
+
+
 Predefined Fold-Splits / Validation-Sets
 ----------------------------------------
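
(Reviewer's aside, not part of the patch.) A quick back-of-the-envelope check of the "prohibitively expensive" claim in the new doc section; plain Python, and ``n_choose_k`` is a hypothetical helper defined here purely for illustration::

    # Count the splits LeavePLabelOut(labels, p=10) would have to
    # enumerate for 50 unique labels, versus the fixed budget that
    # LabelShuffleSplit draws at random.
    from math import factorial

    def n_choose_k(n, k):
        # number of distinct k-label test sets among n unique labels
        return factorial(n) // (factorial(k) * factorial(n - k))

    print(n_choose_k(50, 10))  # 10272278170 exhaustive partitions
    # LabelShuffleSplit(labels, test_size=10, n_iter=100): just 100 splits
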
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index e45e208890ee5..bb719018c8e82 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -33,6 +33,10 @@ New features
      function into a ``Pipeline``-compatible transformer object.
      By Joe Jevnik.
 
+   - :class:`cross_validation.LabelShuffleSplit` generates random train-test splits,
+     similar to :class:`cross_validation.ShuffleSplit`, except that the splits are
+     conditioned on a label array. By `Brian McFee`_.
+
 Enhancements
 ............
@@ -3574,3 +3578,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 
 .. _Thomas Unterthiner: https://github.com/untom
 .. _Loic Esteve: https://github.com/lesteve
+
+.. _Brian McFee: https://bmcfee.github.io
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index fa7c7f210bc05..011bf4d559cff 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -41,6 +41,7 @@
     'StratifiedKFold',
     'StratifiedShuffleSplit',
     'PredefinedSplit',
+    'LabelShuffleSplit',
     'check_cv',
     'cross_val_score',
     'cross_val_predict',
@@ -962,6 +963,94 @@ def __len__(self):
         return len(self.unique_folds)
 
 
+class LabelShuffleSplit(ShuffleSplit):
+    """Shuffle-Labels-Out cross-validation iterator
+
+    Provides randomized train/test indices to split data according to a
+    third-party provided label. This label information can be used to encode
+    arbitrary domain specific stratifications of the samples as integers.
+
+    For instance the labels could be the year of collection of the samples
+    and thus allow for cross-validation against time-based splits.
+
+    The difference between LeavePLabelOut and LabelShuffleSplit is that
+    the former generates splits using all subsets of size ``p`` unique labels,
+    whereas LabelShuffleSplit generates a user-determined number of random
+    test splits, each with a user-determined fraction of unique labels.
+
+    For example, a less computationally intensive alternative to
+    ``LeavePLabelOut(labels, p=10)`` would be
+    ``LabelShuffleSplit(labels, test_size=10, n_iter=100)``.
+
+    Note: The parameters ``test_size`` and ``train_size`` refer to labels,
+    not to samples as they do in ShuffleSplit.
+
+    Parameters
+    ----------
+    labels : array, [n_samples]
+        Labels of samples.
+
+    n_iter : int (default 5)
+        Number of re-shuffling and splitting iterations.
+
+    test_size : float (default 0.2), int, or None
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the test split. If
+        int, represents the absolute number of test labels. If None,
+        the value is automatically set to the complement of the train size.
+
+    train_size : float, int, or None (default is None)
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the train split. If
+        int, represents the absolute number of train labels. If None,
+        the value is automatically set to the complement of the test size.
+
+    random_state : int or RandomState
+        Pseudo-random number generator state used for random sampling.
+    """
+
+    def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
+                 random_state=None):
+        classes, label_indices = np.unique(labels, return_inverse=True)
+        super(LabelShuffleSplit, self).__init__(
+            len(classes),
+            n_iter=n_iter,
+            test_size=test_size,
+            train_size=train_size,
+            random_state=random_state)
+
+        self.labels = labels
+        self.classes = classes
+        self.label_indices = label_indices
+
+    def __repr__(self):
+        return ('%s(labels=%s, n_iter=%d, test_size=%s, '
+                'random_state=%s)' % (
+                    self.__class__.__name__,
+                    self.labels,
+                    self.n_iter,
+                    str(self.test_size),
+                    self.random_state,
+                ))
+
+    def __len__(self):
+        return self.n_iter
+
+    def _iter_indices(self):
+        for label_train, label_test in super(LabelShuffleSplit,
+                                             self)._iter_indices():
+            # the parent ShuffleSplit partitions *class* indices;
+            # invert them back into sample indices
+            train = np.flatnonzero(np.in1d(self.label_indices, label_train))
+            test = np.flatnonzero(np.in1d(self.label_indices, label_test))
+
+            yield train, test
+
+
 ##############################################################################
 def _index_param_value(X, v, indices):
     """Private helper function for parameter value indexing."""
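
(Reviewer's aside, not part of the patch.) The ``_iter_indices`` override above hinges on one trick: ``np.unique(..., return_inverse=True)`` maps every sample to the index of its label, so that ``np.in1d`` can invert a split of *class* indices back into *sample* indices. A minimal standalone sketch with a toy ``labels`` array::

    import numpy as np

    labels = np.array([1, 1, 2, 2, 3, 3, 4, 4])

    # classes holds the unique labels; label_indices maps each sample
    # to the position of its label within classes
    classes, label_indices = np.unique(labels, return_inverse=True)
    # classes       -> [1 2 3 4]
    # label_indices -> [0 0 1 1 2 2 3 3]

    # suppose the parent ShuffleSplit put class indices 0 and 1 in train
    label_train = np.array([0, 1])

    # invert class indices into sample indices
    train = np.flatnonzero(np.in1d(label_indices, label_train))
    print(train)  # [0 1 2 3], i.e. every sample whose label is 1 or 2
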
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index b33e2b4c279d5..5d068b0782837 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -483,6 +483,45 @@ def test_predefinedsplit_with_kfold_split():
     assert_array_equal(ps_test, kf_test)
 
 
+def test_label_shuffle_split():
+    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
+          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
+          ]
+
+    for y in ys:
+        n_iter = 6
+        test_size = 1. / 3
+        slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size,
+                                     random_state=0)
+
+        # Make sure the repr works
+        repr(slo)
+
+        # Test that the length is correct
+        assert_equal(len(slo), n_iter)
+
+        y_unique = np.unique(y)
+
+        for train, test in slo:
+            # First test: no train label is in the test set and vice versa
+            y_train_unique = np.unique(y[train])
+            y_test_unique = np.unique(y[test])
+            assert_false(np.any(np.in1d(y[train], y_test_unique)))
+            assert_false(np.any(np.in1d(y[test], y_train_unique)))
+
+            # Second test: train and test add up to all the data
+            assert_equal(y[train].size + y[test].size, y.size)
+
+            # Third test: train and test are disjoint
+            assert_array_equal(np.intersect1d(train, test), [])
+
+            # Fourth test: the number of unique train and test labels is
+            # correct, +/- 1 for rounding error
+            assert_true(abs(len(y_test_unique) -
+                            round(test_size * len(y_unique))) <= 1)
+            assert_true(abs(len(y_train_unique) -
+                            round((1.0 - test_size) * len(y_unique))) <= 1)
+
+
 def test_leave_label_out_changing_labels():
     # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
     # the labels variable is changed before calling __iter__
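
(Reviewer's aside, not part of the patch.) An end-to-end sketch of the new iterator driving ``cross_val_score``; the toy ``X``/``y``/``labels`` data and the choice of ``SVC`` are assumptions for illustration only::

    import numpy as np
    from sklearn.cross_validation import LabelShuffleSplit, cross_val_score
    from sklearn.svm import SVC

    # Two samples per label; y alternates so that every label contains
    # both classes and any random label split remains fittable.
    X = np.random.RandomState(0).randn(8, 2)
    y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    labels = [1, 1, 2, 2, 3, 3, 4, 4]

    cv = LabelShuffleSplit(labels, n_iter=4, test_size=0.5, random_state=0)
    scores = cross_val_score(SVC(), X, y, cv=cv)
    print(scores)  # one accuracy score per random label partition
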