From ebfea02267c763b4ecda92af5f65e56bb2db8157 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Mon, 13 Apr 2015 11:12:53 -0400 Subject: [PATCH 01/21] added ShuffleLabelsOut cv iterator --- sklearn/cross_validation.py | 77 +++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 5e51f68cf117d..4a369c4d6fc9d 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -41,6 +41,7 @@ 'StratifiedKFold', 'StratifiedShuffleSplit', 'PredefinedSplit', + 'ShuffleLabelsOut', 'check_cv', 'cross_val_score', 'cross_val_predict', @@ -944,6 +945,82 @@ def __len__(self): return len(self.unique_folds) +class ShuffleLabelsOut(ShuffleSplit): + '''Shuffle-Labels-Out cross-validation iterator + + Provides randomized train/test indices to split data according to a + third-party provided label. This label information can be used to encode + arbitrary domain specific stratifications of the samples as integers. + + For instance the labels could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between LeavePLabelOut and ShuffleLabelsOut is that + the former generates splits using all subsets of size `p` unique labels, + whereas ShuffleLabelsOut generates a user-determined number of random + test splits, each with `p` unique labels. + + + Parameters + ---------- + y : array, [n_samples] + Labels of samples + + n_iter : int (default 5) + Number of re-shuffling & splitting iterations. + + test_size : float (default 0.2), int, or None + If float, should be between 0.0 and 1.0 and represent the + proportion of the labels to include in the test split. If + int, represents the absolute number of test labels. If None, + the value is automatically set to the complement of the train size. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the labels to include in the train split. If + int, represents the absolute number of train labels. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. + ''' + + def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, + random_state=None): + + classes, y_indices = np.unique(y, return_inverse=True) + + super(ShuffleLabelsOut, self).__init__( + len(classes), n_iter, test_size, train_size, random_state) + + self.classes = classes + self.y_indices = y_indices + + def __repr__(self): + return ('%s(labels=%s, n_iter=%d, test_size=%s, ' + 'random_state=%s)' % ( + self.__class__.__name__, + self.y, + self.n_iter, + str(self.test_size), + self.random_state, + )) + + def __len__(self): + return self.n_iter + + def _iter_indices(self): + + for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices(): + # these are the indices of classes in the partition + # invert them into data indices + + train = np.flatnonzero(np.in1d(self.y_indices, y_train)) + test = np.flatnonzero(np.in1d(self.y_indices, y_test)) + + yield train, test + + ############################################################################## def _index_param_value(X, v, indices): """Private helper function for parameter value indexing.""" From 36c9c3def8d060ef1150bad16ec18ef250349db5 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Mon, 13 Apr 2015 11:37:54 -0400 Subject: [PATCH 02/21] fixed tests for shufflelabelsout --- sklearn/tests/test_cross_validation.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 786bf561ec5e2..1297bc3c3b2fc 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -483,6 +483,29 @@ def test_predefinedsplit_with_kfold_split(): assert_array_equal(ps_test, kf_test) +def test_shuffle_labels_out(): + ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + ] + + for y in ys: + slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33, + random_state=0) + + for train, test in slo: + # First test: no train label is in the test set and vice versa + assert_false(np.any(np.in1d(y[train], np.unique(y[test])))) + assert_false(np.any(np.in1d(y[test], np.unique(y[train])))) + + # Second test: train and test add up to all the data + assert_equal(y[train].size + y[test].size, y.size) + + # Third test: train and test are disjoint + assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) + + def test_leave_label_out_changing_labels(): # Check that LeaveOneLabelOut and LeavePLabelOut work normally if # the labels variable is changed before calling __iter__ From 3493ec7d77ee805d0f535914a16a7658d9dd31e2 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Mon, 13 Apr 2015 15:39:09 -0400 Subject: [PATCH 03/21] updated docstring --- sklearn/cross_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 4a369c4d6fc9d..45b37fb387c4a 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -958,7 +958,7 @@ class ShuffleLabelsOut(ShuffleSplit): The difference between LeavePLabelOut and ShuffleLabelsOut is that the former generates splits using all subsets of size `p` unique labels, whereas ShuffleLabelsOut generates a user-determined number of random - test splits, each with `p` unique labels. + test splits, each with a user-determined fraction of unique labels. Parameters From 62e1996b366af4b975d18a0d816f83e532268ce7 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Sat, 18 Apr 2015 11:48:44 -0400 Subject: [PATCH 04/21] Fixed an error in call to the super constructor --- sklearn/cross_validation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 45b37fb387c4a..3a247bdb14659 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -991,7 +991,11 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, classes, y_indices = np.unique(y, return_inverse=True) super(ShuffleLabelsOut, self).__init__( - len(classes), n_iter, test_size, train_size, random_state) + len(classes), + n_iter=n_iter, + test_size=test_size, + train_size=train_size, + random_state=random_state) self.classes = classes self.y_indices = y_indices From ba7f81e648a419427c7ed6cb0d70e94ff2310d12 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Tue, 12 May 2015 19:41:21 -0400 Subject: [PATCH 05/21] fixed repr, variable names in ShuffleLabelsOut --- sklearn/cross_validation.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 3a247bdb14659..85e0229a400bb 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -963,7 +963,7 @@ class ShuffleLabelsOut(ShuffleSplit): Parameters ---------- - y : array, [n_samples] + labels : array, [n_samples] Labels of samples n_iter : int (default 5) @@ -985,10 +985,10 @@ class ShuffleLabelsOut(ShuffleSplit): Pseudo-random number generator state used for random sampling. ''' - def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, + def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None, random_state=None): - classes, y_indices = np.unique(y, return_inverse=True) + classes, label_indices = np.unique(labels, return_inverse=True) super(ShuffleLabelsOut, self).__init__( len(classes), @@ -997,14 +997,15 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, train_size=train_size, random_state=random_state) + self.labels = labels self.classes = classes - self.y_indices = y_indices + self.label_indices = label_indices def __repr__(self): return ('%s(labels=%s, n_iter=%d, test_size=%s, ' 'random_state=%s)' % ( self.__class__.__name__, - self.y, + self.labels, self.n_iter, str(self.test_size), self.random_state, @@ -1015,12 +1016,12 @@ def __len__(self): def _iter_indices(self): - for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices(): + for label_train, label_test in super(ShuffleLabelsOut, self)._iter_indices(): # these are the indices of classes in the partition # invert them into data indices - train = np.flatnonzero(np.in1d(self.y_indices, y_train)) - test = np.flatnonzero(np.in1d(self.y_indices, y_test)) + train = np.flatnonzero(np.in1d(self.label_indices, label_train)) + test = np.flatnonzero(np.in1d(self.label_indices, label_test)) yield train, test From a0a27648f129ecd42c5a658030d33ed8a8e1a4a4 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Tue, 12 May 2015 19:46:43 -0400 Subject: [PATCH 06/21] added length and repr tests to ShuffleLabelsOut --- sklearn/tests/test_cross_validation.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 1297bc3c3b2fc..757ed301101fd 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -491,9 +491,16 @@ def test_shuffle_labels_out(): ] for y in ys: - slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33, + n_iter = 6 + slo = cval.ShuffleLabelsOut(y, n_iter, test_size=0.33, random_state=0) + # Make sure the repr works + repr(slo) + + # Test that the length is correct + assert_equal(len(slo), n_iter) + for train, test in slo: # First test: no train label is in the test set and vice versa assert_false(np.any(np.in1d(y[train], np.unique(y[test])))) From 0030d32ffaf78b92034e8714fb1fab1ec74ca6b0 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Tue, 23 Jun 2015 13:56:36 -0400 Subject: [PATCH 07/21] added documentation for ShuffleLabelsOut --- doc/modules/cross_validation.rst | 34 ++++++++++++++++++++++++++++++++ doc/whats_new.rst | 6 ++++++ sklearn/cross_validation.py | 7 +++++++ 3 files changed, 47 insertions(+) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 0aa6bf1e3b692..0a4592e7020bd 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -430,6 +430,40 @@ validation that allows a finer control on the number of iterations and the proportion of samples in on each side of the train / test split. +Shuffle-Labels-Out +------------------ + +:class:`ShuffleLabelsOut` + +The :class:`ShuffleLabelsOut` iterator behaves as a combination of +:class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a +sequence of randomized partitions in which a subset of labels are held +out for each split. + +Here is a usage example:: + + >>> from sklearn.cross_validation import ShuffleLabelsOut + + >>> labels = [1, 1, 2, 2, 3, 3, 4, 4] + >>> slo = ShuffleLabelsOut(labels, n_iter=4, test_size=0.5, + ... random_state=0) + >>> for train, test in slo: + ... print("%s %s" % (train, test)) + ... + [0 1 2 3] [4 5 6 7] + [2 3 6 7] [0 1 4 5] + [2 3 4 5] [0 1 6 7] + [4 5 6 7] [0 1 2 3] + +This class is useful when the behavior of :class:`LeavePLabelsOut` is +desired, but the number of labels is large enough that generating all +possible partitions with :math:`P` labels withheld would be prohibitively +expensive. In such a scenario, :class:`ShuffleLabelsOut` provides +a random sample (with replacement) of the train / test splits +generated by :class:`LeavePLabelsOut`. + + + Predefined Fold-Splits / Validation-Sets ---------------------------------------- diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 41ba47868fa00..634a66619cca3 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -13,6 +13,10 @@ Changelog New features ............ + - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits, + similer to `ShuffleSplit`, except that the splits are conditioned on a label array. + By `Brian McFee`_. + Enhancements ............ @@ -3445,3 +3449,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Eric Martin: http://ericmart.in .. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241 + +.. _Brian McFee: https://bmcfee.github.io diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 85e0229a400bb..18d7740ef2e1a 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -960,6 +960,13 @@ class ShuffleLabelsOut(ShuffleSplit): whereas ShuffleLabelsOut generates a user-determined number of random test splits, each with a user-determined fraction of unique labels. + For example, a less computationally intensive alternative to + `LeavePLabelOut(labels, p=10)` would be + `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`. + + Note: The parameters `test_size` and `train_size` refer to labels, and not + to samples, as in ShuffleSplit. + Parameters ---------- From 13c15fcda493143e5daa05172d226d994b20c9a4 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Thu, 13 Aug 2015 09:45:59 -0400 Subject: [PATCH 08/21] updated ShuffleLabelsOut in whats_new --- doc/whats_new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 634a66619cca3..a72ce275e9f9d 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -14,8 +14,8 @@ New features ............ - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits, - similer to `ShuffleSplit`, except that the splits are conditioned on a label array. - By `Brian McFee`_. + similar to :class:`cross_validation.ShuffleSplit`, except that the splits are + conditioned on a label array. By `Brian McFee`_. Enhancements ............ From 56698920ff467c19acc56074c3ac3bc07ddf5f2b Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Thu, 13 Aug 2015 09:47:38 -0400 Subject: [PATCH 09/21] ShuffleLabelsOut updated docstring for double-backtick --- sklearn/cross_validation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 18d7740ef2e1a..46a0f1409ecdb 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -956,16 +956,16 @@ class ShuffleLabelsOut(ShuffleSplit): and thus allow for cross-validation against time-based splits. The difference between LeavePLabelOut and ShuffleLabelsOut is that - the former generates splits using all subsets of size `p` unique labels, + the former generates splits using all subsets of size ``p`` unique labels, whereas ShuffleLabelsOut generates a user-determined number of random test splits, each with a user-determined fraction of unique labels. For example, a less computationally intensive alternative to - `LeavePLabelOut(labels, p=10)` would be - `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`. + ``LeavePLabelOut(labels, p=10)`` would be + ``ShuffleLabelsOut(labels, test_size=10, n_iter=100)``. - Note: The parameters `test_size` and `train_size` refer to labels, and not - to samples, as in ShuffleSplit. + Note: The parameters ``test_size`` and ``train_size`` refer to labels, and + not to samples, as in ShuffleSplit. Parameters From f0c4a7507e18def166f419b02ea33d7b0e382c8c Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Mon, 13 Apr 2015 11:12:53 -0400 Subject: [PATCH 10/21] added ShuffleLabelsOut cv iterator --- sklearn/cross_validation.py | 77 +++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index fa7c7f210bc05..9d5a8c65fb84f 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -41,6 +41,7 @@ 'StratifiedKFold', 'StratifiedShuffleSplit', 'PredefinedSplit', + 'ShuffleLabelsOut', 'check_cv', 'cross_val_score', 'cross_val_predict', @@ -962,6 +963,82 @@ def __len__(self): return len(self.unique_folds) +class ShuffleLabelsOut(ShuffleSplit): + '''Shuffle-Labels-Out cross-validation iterator + + Provides randomized train/test indices to split data according to a + third-party provided label. This label information can be used to encode + arbitrary domain specific stratifications of the samples as integers. + + For instance the labels could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between LeavePLabelOut and ShuffleLabelsOut is that + the former generates splits using all subsets of size `p` unique labels, + whereas ShuffleLabelsOut generates a user-determined number of random + test splits, each with `p` unique labels. + + + Parameters + ---------- + y : array, [n_samples] + Labels of samples + + n_iter : int (default 5) + Number of re-shuffling & splitting iterations. + + test_size : float (default 0.2), int, or None + If float, should be between 0.0 and 1.0 and represent the + proportion of the labels to include in the test split. If + int, represents the absolute number of test labels. If None, + the value is automatically set to the complement of the train size. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the labels to include in the train split. If + int, represents the absolute number of train labels. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. + ''' + + def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, + random_state=None): + + classes, y_indices = np.unique(y, return_inverse=True) + + super(ShuffleLabelsOut, self).__init__( + len(classes), n_iter, test_size, train_size, random_state) + + self.classes = classes + self.y_indices = y_indices + + def __repr__(self): + return ('%s(labels=%s, n_iter=%d, test_size=%s, ' + 'random_state=%s)' % ( + self.__class__.__name__, + self.y, + self.n_iter, + str(self.test_size), + self.random_state, + )) + + def __len__(self): + return self.n_iter + + def _iter_indices(self): + + for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices(): + # these are the indices of classes in the partition + # invert them into data indices + + train = np.flatnonzero(np.in1d(self.y_indices, y_train)) + test = np.flatnonzero(np.in1d(self.y_indices, y_test)) + + yield train, test + + ############################################################################## def _index_param_value(X, v, indices): """Private helper function for parameter value indexing.""" From d668f3b2660ed9cf72c2e9974a26acc1318686d4 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Mon, 13 Apr 2015 11:37:54 -0400 Subject: [PATCH 11/21] fixed tests for shufflelabelsout --- sklearn/tests/test_cross_validation.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index b33e2b4c279d5..550816f530641 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -483,6 +483,29 @@ def test_predefinedsplit_with_kfold_split(): assert_array_equal(ps_test, kf_test) +def test_shuffle_labels_out(): + ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + ] + + for y in ys: + slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33, + random_state=0) + + for train, test in slo: + # First test: no train label is in the test set and vice versa + assert_false(np.any(np.in1d(y[train], np.unique(y[test])))) + assert_false(np.any(np.in1d(y[test], np.unique(y[train])))) + + # Second test: train and test add up to all the data + assert_equal(y[train].size + y[test].size, y.size) + + # Third test: train and test are disjoint + assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) + + def test_leave_label_out_changing_labels(): # Check that LeaveOneLabelOut and LeavePLabelOut work normally if # the labels variable is changed before calling __iter__ From 068cb38a8aba7b13a583c7d9c13edd2e585aaf05 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Mon, 13 Apr 2015 15:39:09 -0400 Subject: [PATCH 12/21] updated docstring --- sklearn/cross_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 9d5a8c65fb84f..10d024e9c8ae1 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -976,7 +976,7 @@ class ShuffleLabelsOut(ShuffleSplit): The difference between LeavePLabelOut and ShuffleLabelsOut is that the former generates splits using all subsets of size `p` unique labels, whereas ShuffleLabelsOut generates a user-determined number of random - test splits, each with `p` unique labels. + test splits, each with a user-determined fraction of unique labels. Parameters From 74c265774a8c4e978020a19b70ec42765adda747 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Sat, 18 Apr 2015 11:48:44 -0400 Subject: [PATCH 13/21] Fixed an error in call to the super constructor --- sklearn/cross_validation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 10d024e9c8ae1..0fb7e5f6beb64 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1009,7 +1009,11 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, classes, y_indices = np.unique(y, return_inverse=True) super(ShuffleLabelsOut, self).__init__( - len(classes), n_iter, test_size, train_size, random_state) + len(classes), + n_iter=n_iter, + test_size=test_size, + train_size=train_size, + random_state=random_state) self.classes = classes self.y_indices = y_indices From 41d3704b4375ff880e1441ffdd421d09f0d230a7 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Tue, 12 May 2015 19:41:21 -0400 Subject: [PATCH 14/21] fixed repr, variable names in ShuffleLabelsOut --- sklearn/cross_validation.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 0fb7e5f6beb64..ebd24e9498419 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -981,7 +981,7 @@ class ShuffleLabelsOut(ShuffleSplit): Parameters ---------- - y : array, [n_samples] + labels : array, [n_samples] Labels of samples n_iter : int (default 5) @@ -1003,10 +1003,10 @@ class ShuffleLabelsOut(ShuffleSplit): Pseudo-random number generator state used for random sampling. ''' - def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, + def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None, random_state=None): - classes, y_indices = np.unique(y, return_inverse=True) + classes, label_indices = np.unique(labels, return_inverse=True) super(ShuffleLabelsOut, self).__init__( len(classes), @@ -1015,14 +1015,15 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None, train_size=train_size, random_state=random_state) + self.labels = labels self.classes = classes - self.y_indices = y_indices + self.label_indices = label_indices def __repr__(self): return ('%s(labels=%s, n_iter=%d, test_size=%s, ' 'random_state=%s)' % ( self.__class__.__name__, - self.y, + self.labels, self.n_iter, str(self.test_size), self.random_state, @@ -1033,12 +1034,12 @@ def __len__(self): def _iter_indices(self): - for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices(): + for label_train, label_test in super(ShuffleLabelsOut, self)._iter_indices(): # these are the indices of classes in the partition # invert them into data indices - train = np.flatnonzero(np.in1d(self.y_indices, y_train)) - test = np.flatnonzero(np.in1d(self.y_indices, y_test)) + train = np.flatnonzero(np.in1d(self.label_indices, label_train)) + test = np.flatnonzero(np.in1d(self.label_indices, label_test)) yield train, test From 085f38177d84554c3abd368f57d1b4fbd5a4e7df Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Tue, 12 May 2015 19:46:43 -0400 Subject: [PATCH 15/21] added length and repr tests to ShuffleLabelsOut --- sklearn/tests/test_cross_validation.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 550816f530641..814525a8b87e8 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -491,9 +491,16 @@ def test_shuffle_labels_out(): ] for y in ys: - slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33, + n_iter = 6 + slo = cval.ShuffleLabelsOut(y, n_iter, test_size=0.33, random_state=0) + # Make sure the repr works + repr(slo) + + # Test that the length is correct + assert_equal(len(slo), n_iter) + for train, test in slo: # First test: no train label is in the test set and vice versa assert_false(np.any(np.in1d(y[train], np.unique(y[test])))) From 915144a95dd7cd959e4cb9b10497a34e77a95756 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Tue, 23 Jun 2015 13:56:36 -0400 Subject: [PATCH 16/21] added documentation for ShuffleLabelsOut --- doc/modules/cross_validation.rst | 34 ++++++++++++++++++++++++++++++++ doc/whats_new.rst | 6 ++++++ sklearn/cross_validation.py | 7 +++++++ 3 files changed, 47 insertions(+) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 9c25cfb417fd4..093ecd9065c7e 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -430,6 +430,40 @@ validation that allows a finer control on the number of iterations and the proportion of samples in on each side of the train / test split. +Shuffle-Labels-Out +------------------ + +:class:`ShuffleLabelsOut` + +The :class:`ShuffleLabelsOut` iterator behaves as a combination of +:class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a +sequence of randomized partitions in which a subset of labels are held +out for each split. + +Here is a usage example:: + + >>> from sklearn.cross_validation import ShuffleLabelsOut + + >>> labels = [1, 1, 2, 2, 3, 3, 4, 4] + >>> slo = ShuffleLabelsOut(labels, n_iter=4, test_size=0.5, + ... random_state=0) + >>> for train, test in slo: + ... print("%s %s" % (train, test)) + ... + [0 1 2 3] [4 5 6 7] + [2 3 6 7] [0 1 4 5] + [2 3 4 5] [0 1 6 7] + [4 5 6 7] [0 1 2 3] + +This class is useful when the behavior of :class:`LeavePLabelsOut` is +desired, but the number of labels is large enough that generating all +possible partitions with :math:`P` labels withheld would be prohibitively +expensive. In such a scenario, :class:`ShuffleLabelsOut` provides +a random sample (with replacement) of the train / test splits +generated by :class:`LeavePLabelsOut`. + + + Predefined Fold-Splits / Validation-Sets ---------------------------------------- diff --git a/doc/whats_new.rst b/doc/whats_new.rst index e45e208890ee5..5d9b61e3f7f45 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -33,6 +33,10 @@ New features function into a ``Pipeline``-compatible transformer object. By Joe Jevnik. + - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits, + similer to `ShuffleSplit`, except that the splits are conditioned on a label array. + By `Brian McFee`_. + Enhancements ............ @@ -3574,3 +3578,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Thomas Unterthiner: https://github.com/untom .. _Loic Esteve: https://github.com/lesteve + +.. _Brian McFee: https://bmcfee.github.io diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index ebd24e9498419..a13be82ee59b0 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -978,6 +978,13 @@ class ShuffleLabelsOut(ShuffleSplit): whereas ShuffleLabelsOut generates a user-determined number of random test splits, each with a user-determined fraction of unique labels. + For example, a less computationally intensive alternative to + `LeavePLabelOut(labels, p=10)` would be + `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`. + + Note: The parameters `test_size` and `train_size` refer to labels, and not + to samples, as in ShuffleSplit. + Parameters ---------- From e2a5ad736d5ba24ecd24583de104a6bc35500060 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Thu, 13 Aug 2015 09:45:59 -0400 Subject: [PATCH 17/21] updated ShuffleLabelsOut in whats_new --- doc/whats_new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 5d9b61e3f7f45..602c5258541c1 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -34,8 +34,8 @@ New features By Joe Jevnik. - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits, - similer to `ShuffleSplit`, except that the splits are conditioned on a label array. - By `Brian McFee`_. + similar to :class:`cross_validation.ShuffleSplit`, except that the splits are + conditioned on a label array. By `Brian McFee`_. Enhancements ............ From e2b2a6e82307fbc00b6e8f8772ff2199d9cdf947 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Thu, 13 Aug 2015 09:47:38 -0400 Subject: [PATCH 18/21] ShuffleLabelsOut updated docstring for double-backtick --- sklearn/cross_validation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index a13be82ee59b0..64c795700fa35 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -974,16 +974,16 @@ class ShuffleLabelsOut(ShuffleSplit): and thus allow for cross-validation against time-based splits. The difference between LeavePLabelOut and ShuffleLabelsOut is that - the former generates splits using all subsets of size `p` unique labels, + the former generates splits using all subsets of size ``p`` unique labels, whereas ShuffleLabelsOut generates a user-determined number of random test splits, each with a user-determined fraction of unique labels. For example, a less computationally intensive alternative to - `LeavePLabelOut(labels, p=10)` would be - `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`. + ``LeavePLabelOut(labels, p=10)`` would be + ``ShuffleLabelsOut(labels, test_size=10, n_iter=100)``. - Note: The parameters `test_size` and `train_size` refer to labels, and not - to samples, as in ShuffleSplit. + Note: The parameters ``test_size`` and ``train_size`` refer to labels, and + not to samples, as in ShuffleSplit. Parameters From ab74b1dba890698b39289e659b4ecab48619a5cd Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Thu, 13 Aug 2015 14:14:55 -0400 Subject: [PATCH 19/21] renamed ShuffleLabelsOut to LabelShuffleSplit --- doc/modules/cross_validation.rst | 10 +++++----- doc/whats_new.rst | 2 +- sklearn/cross_validation.py | 14 +++++++------- sklearn/tests/test_cross_validation.py | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 093ecd9065c7e..1b45da9de9771 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -433,19 +433,19 @@ the proportion of samples in on each side of the train / test split. Shuffle-Labels-Out ------------------ -:class:`ShuffleLabelsOut` +:class:`LabelShuffleSplit` -The :class:`ShuffleLabelsOut` iterator behaves as a combination of +The :class:`LabelShuffleSplit` iterator behaves as a combination of :class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a sequence of randomized partitions in which a subset of labels are held out for each split. Here is a usage example:: - >>> from sklearn.cross_validation import ShuffleLabelsOut + >>> from sklearn.cross_validation import LabelShuffleSplit >>> labels = [1, 1, 2, 2, 3, 3, 4, 4] - >>> slo = ShuffleLabelsOut(labels, n_iter=4, test_size=0.5, + >>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5, ... random_state=0) >>> for train, test in slo: ... print("%s %s" % (train, test)) @@ -458,7 +458,7 @@ Here is a usage example:: This class is useful when the behavior of :class:`LeavePLabelsOut` is desired, but the number of labels is large enough that generating all possible partitions with :math:`P` labels withheld would be prohibitively -expensive. In such a scenario, :class:`ShuffleLabelsOut` provides +expensive. In such a scenario, :class:`LabelShuffleSplit` provides a random sample (with replacement) of the train / test splits generated by :class:`LeavePLabelsOut`. diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 602c5258541c1..bb719018c8e82 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -33,7 +33,7 @@ New features function into a ``Pipeline``-compatible transformer object. By Joe Jevnik. - - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits, + - :class:`cross_validation.LabelShuffleSplit` generates random train-test splits, similar to :class:`cross_validation.ShuffleSplit`, except that the splits are conditioned on a label array. By `Brian McFee`_. diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 64c795700fa35..011bf4d559cff 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -41,7 +41,7 @@ 'StratifiedKFold', 'StratifiedShuffleSplit', 'PredefinedSplit', - 'ShuffleLabelsOut', + 'LabelShuffleSplit', 'check_cv', 'cross_val_score', 'cross_val_predict', @@ -963,7 +963,7 @@ def __len__(self): return len(self.unique_folds) -class ShuffleLabelsOut(ShuffleSplit): +class LabelShuffleSplit(ShuffleSplit): '''Shuffle-Labels-Out cross-validation iterator Provides randomized train/test indices to split data according to a @@ -973,14 +973,14 @@ class ShuffleLabelsOut(ShuffleSplit): For instance the labels could be the year of collection of the samples and thus allow for cross-validation against time-based splits. - The difference between LeavePLabelOut and ShuffleLabelsOut is that + The difference between LeavePLabelOut and LabelShuffleSplit is that the former generates splits using all subsets of size ``p`` unique labels, - whereas ShuffleLabelsOut generates a user-determined number of random + whereas LabelShuffleSplit generates a user-determined number of random test splits, each with a user-determined fraction of unique labels. For example, a less computationally intensive alternative to ``LeavePLabelOut(labels, p=10)`` would be - ``ShuffleLabelsOut(labels, test_size=10, n_iter=100)``. + ``LabelShuffleSplit(labels, test_size=10, n_iter=100)``. Note: The parameters ``test_size`` and ``train_size`` refer to labels, and not to samples, as in ShuffleSplit. @@ -1015,7 +1015,7 @@ def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None, classes, label_indices = np.unique(labels, return_inverse=True) - super(ShuffleLabelsOut, self).__init__( + super(LabelShuffleSplit, self).__init__( len(classes), n_iter=n_iter, test_size=test_size, @@ -1041,7 +1041,7 @@ def __len__(self): def _iter_indices(self): - for label_train, label_test in super(ShuffleLabelsOut, self)._iter_indices(): + for label_train, label_test in super(LabelShuffleSplit, self)._iter_indices(): # these are the indices of classes in the partition # invert them into data indices diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 814525a8b87e8..1cf67128eb94e 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -483,7 +483,7 @@ def test_predefinedsplit_with_kfold_split(): assert_array_equal(ps_test, kf_test) -def test_shuffle_labels_out(): +def test_label_shuffle_split(): ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), @@ -492,8 +492,8 @@ def test_shuffle_labels_out(): for y in ys: n_iter = 6 - slo = cval.ShuffleLabelsOut(y, n_iter, test_size=0.33, - random_state=0) + slo = cval.LabelShuffleSplit(y, n_iter, test_size=0.33, + random_state=0) # Make sure the repr works repr(slo) From 99c66f0b093e27dc8ac86eae935b0adbf5529e8e Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Thu, 13 Aug 2015 14:30:27 -0400 Subject: [PATCH 20/21] LabelShuffleSplit test checks for proper ratios --- sklearn/tests/test_cross_validation.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 1cf67128eb94e..5d068b0782837 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -492,7 +492,8 @@ def test_label_shuffle_split(): for y in ys: n_iter = 6 - slo = cval.LabelShuffleSplit(y, n_iter, test_size=0.33, + test_size = 1./3 + slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size, random_state=0) # Make sure the repr works @@ -501,10 +502,14 @@ def test_label_shuffle_split(): # Test that the length is correct assert_equal(len(slo), n_iter) + y_unique = np.unique(y) + for train, test in slo: # First test: no train label is in the test set and vice versa - assert_false(np.any(np.in1d(y[train], np.unique(y[test])))) - assert_false(np.any(np.in1d(y[test], np.unique(y[train])))) + y_train_unique = np.unique(y[train]) + y_test_unique = np.unique(y[test]) + assert_false(np.any(np.in1d(y[train], y_test_unique))) + assert_false(np.any(np.in1d(y[test], y_train_unique))) # Second test: train and test add up to all the data assert_equal(y[train].size + y[test].size, y.size) @@ -512,6 +517,10 @@ def test_label_shuffle_split(): # Third test: train and test are disjoint assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) + # Fourth test: # unique train and test labels are correct, +- 1 for rounding error + assert_true(abs(len(y_test_unique) - round(test_size * len(y_unique))) <= 1) + assert_true(abs(len(y_train_unique) - round((1.0 - test_size) * len(y_unique))) <= 1) + def test_leave_label_out_changing_labels(): # Check that LeaveOneLabelOut and LeavePLabelOut work normally if From 98906df9be0771d0ca16b3032fc25da278a7e0d4 Mon Sep 17 00:00:00 2001 From: Brian McFee Date: Thu, 13 Aug 2015 17:23:46 -0400 Subject: [PATCH 21/21] LabelShuffleSplit documentation header fix --- doc/modules/cross_validation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 1b45da9de9771..53afdf53550b1 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -430,8 +430,8 @@ validation that allows a finer control on the number of iterations and the proportion of samples in on each side of the train / test split. -Shuffle-Labels-Out ------------------- +Label-Shuffle-Split +------------------- :class:`LabelShuffleSplit`