From ebfea02267c763b4ecda92af5f65e56bb2db8157 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Mon, 13 Apr 2015 11:12:53 -0400
Subject: [PATCH 01/21] added ShuffleLabelsOut cv iterator

---
 sklearn/cross_validation.py | 77 +++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 5e51f68cf117d..4a369c4d6fc9d 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -41,6 +41,7 @@
            'StratifiedKFold',
            'StratifiedShuffleSplit',
            'PredefinedSplit',
+           'ShuffleLabelsOut',
            'check_cv',
            'cross_val_score',
            'cross_val_predict',
@@ -944,6 +945,82 @@ def __len__(self):
         return len(self.unique_folds)
 
 
+class ShuffleLabelsOut(ShuffleSplit):
+    '''Shuffle-Labels-Out cross-validation iterator
+
+    Provides randomized train/test indices to split data according to a
+    third-party provided label. This label information can be used to encode
+    arbitrary domain specific stratifications of the samples as integers.
+
+    For instance the labels could be the year of collection of the samples
+    and thus allow for cross-validation against time-based splits.
+
+    The difference between LeavePLabelOut and ShuffleLabelsOut is that
+    the former generates splits using all subsets of size `p` unique labels,
+    whereas ShuffleLabelsOut generates a user-determined number of random
+    test splits, each with `p` unique labels.
+
+
+    Parameters
+    ----------
+    y :  array, [n_samples]
+        Labels of samples
+
+    n_iter : int (default 5)
+        Number of re-shuffling & splitting iterations.
+
+    test_size : float (default 0.2), int, or None
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the test split. If
+        int, represents the absolute number of test labels. If None,
+        the value is automatically set to the complement of the train size.
+
+    train_size : float, int, or None (default is None)
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the train split. If
+        int, represents the absolute number of train labels. If None,
+        the value is automatically set to the complement of the test size.
+
+    random_state : int or RandomState
+        Pseudo-random number generator state used for random sampling.
+    '''
+
+    def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
+                 random_state=None):
+
+        classes, y_indices = np.unique(y, return_inverse=True)
+
+        super(ShuffleLabelsOut, self).__init__(
+            len(classes), n_iter, test_size, train_size, random_state)
+
+        self.classes = classes
+        self.y_indices = y_indices
+
+    def __repr__(self):
+        return ('%s(labels=%s, n_iter=%d, test_size=%s, '
+                'random_state=%s)' % (
+                    self.__class__.__name__,
+                    self.y,
+                    self.n_iter,
+                    str(self.test_size),
+                    self.random_state,
+                ))
+
+    def __len__(self):
+        return self.n_iter
+
+    def _iter_indices(self):
+
+        for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices():
+            # these are the indices of classes in the partition
+            # invert them into data indices
+
+            train = np.flatnonzero(np.in1d(self.y_indices, y_train))
+            test = np.flatnonzero(np.in1d(self.y_indices, y_test))
+
+            yield train, test
+
+
 ##############################################################################
 def _index_param_value(X, v, indices):
     """Private helper function for parameter value indexing."""

From 36c9c3def8d060ef1150bad16ec18ef250349db5 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Mon, 13 Apr 2015 11:37:54 -0400
Subject: [PATCH 02/21] fixed tests for shufflelabelsout

---
 sklearn/tests/test_cross_validation.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 786bf561ec5e2..1297bc3c3b2fc 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -483,6 +483,29 @@ def test_predefinedsplit_with_kfold_split():
     assert_array_equal(ps_test, kf_test)
 
 
+def test_shuffle_labels_out():
+    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
+          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
+          ]
+
+    for y in ys:
+        slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33,
+                                    random_state=0)
+
+        for train, test in slo:
+            # First test: no train label is in the test set and vice versa
+            assert_false(np.any(np.in1d(y[train], np.unique(y[test]))))
+            assert_false(np.any(np.in1d(y[test], np.unique(y[train]))))
+
+            # Second test: train and test add up to all the data
+            assert_equal(y[train].size + y[test].size, y.size)
+
+            # Third test: train and test are disjoint
+            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
+
+
 def test_leave_label_out_changing_labels():
     # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
     # the labels variable is changed before calling __iter__

From 3493ec7d77ee805d0f535914a16a7658d9dd31e2 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Mon, 13 Apr 2015 15:39:09 -0400
Subject: [PATCH 03/21] updated docstring

---
 sklearn/cross_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 4a369c4d6fc9d..45b37fb387c4a 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -958,7 +958,7 @@ class ShuffleLabelsOut(ShuffleSplit):
     The difference between LeavePLabelOut and ShuffleLabelsOut is that
     the former generates splits using all subsets of size `p` unique labels,
     whereas ShuffleLabelsOut generates a user-determined number of random
-    test splits, each with `p` unique labels.
+    test splits, each with a user-determined fraction of unique labels.
 
 
     Parameters

From 62e1996b366af4b975d18a0d816f83e532268ce7 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Sat, 18 Apr 2015 11:48:44 -0400
Subject: [PATCH 04/21] Fixed an error in call to the super constructor

---
 sklearn/cross_validation.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 45b37fb387c4a..3a247bdb14659 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -991,7 +991,11 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
         classes, y_indices = np.unique(y, return_inverse=True)
 
         super(ShuffleLabelsOut, self).__init__(
-            len(classes), n_iter, test_size, train_size, random_state)
+            len(classes),
+            n_iter=n_iter,
+            test_size=test_size,
+            train_size=train_size,
+            random_state=random_state)
 
         self.classes = classes
         self.y_indices = y_indices

From ba7f81e648a419427c7ed6cb0d70e94ff2310d12 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Tue, 12 May 2015 19:41:21 -0400
Subject: [PATCH 05/21] fixed repr, variable names in ShuffleLabelsOut

---
 sklearn/cross_validation.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 3a247bdb14659..85e0229a400bb 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -963,7 +963,7 @@ class ShuffleLabelsOut(ShuffleSplit):
 
     Parameters
     ----------
-    y :  array, [n_samples]
+    labels :  array, [n_samples]
         Labels of samples
 
     n_iter : int (default 5)
@@ -985,10 +985,10 @@ class ShuffleLabelsOut(ShuffleSplit):
         Pseudo-random number generator state used for random sampling.
     '''
 
-    def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
+    def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
                  random_state=None):
 
-        classes, y_indices = np.unique(y, return_inverse=True)
+        classes, label_indices = np.unique(labels, return_inverse=True)
 
         super(ShuffleLabelsOut, self).__init__(
             len(classes),
@@ -997,14 +997,15 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
             train_size=train_size,
             random_state=random_state)
 
+        self.labels = labels
         self.classes = classes
-        self.y_indices = y_indices
+        self.label_indices = label_indices
 
     def __repr__(self):
         return ('%s(labels=%s, n_iter=%d, test_size=%s, '
                 'random_state=%s)' % (
                     self.__class__.__name__,
-                    self.y,
+                    self.labels,
                     self.n_iter,
                     str(self.test_size),
                     self.random_state,
@@ -1015,12 +1016,12 @@ def __len__(self):
 
     def _iter_indices(self):
 
-        for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices():
+        for label_train, label_test in super(ShuffleLabelsOut, self)._iter_indices():
             # these are the indices of classes in the partition
             # invert them into data indices
 
-            train = np.flatnonzero(np.in1d(self.y_indices, y_train))
-            test = np.flatnonzero(np.in1d(self.y_indices, y_test))
+            train = np.flatnonzero(np.in1d(self.label_indices, label_train))
+            test = np.flatnonzero(np.in1d(self.label_indices, label_test))
 
             yield train, test
 

From a0a27648f129ecd42c5a658030d33ed8a8e1a4a4 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Tue, 12 May 2015 19:46:43 -0400
Subject: [PATCH 06/21] added length and repr tests to ShuffleLabelsOut

---
 sklearn/tests/test_cross_validation.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 1297bc3c3b2fc..757ed301101fd 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -491,9 +491,16 @@ def test_shuffle_labels_out():
           ]
 
     for y in ys:
-        slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33,
+        n_iter = 6
+        slo = cval.ShuffleLabelsOut(y, n_iter, test_size=0.33,
                                     random_state=0)
 
+        # Make sure the repr works
+        repr(slo)
+
+        # Test that the length is correct
+        assert_equal(len(slo), n_iter)
+
         for train, test in slo:
             # First test: no train label is in the test set and vice versa
             assert_false(np.any(np.in1d(y[train], np.unique(y[test]))))

From 0030d32ffaf78b92034e8714fb1fab1ec74ca6b0 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Tue, 23 Jun 2015 13:56:36 -0400
Subject: [PATCH 07/21] added documentation for ShuffleLabelsOut

---
 doc/modules/cross_validation.rst | 34 ++++++++++++++++++++++++++++++++
 doc/whats_new.rst                |  6 ++++++
 sklearn/cross_validation.py      |  7 +++++++
 3 files changed, 47 insertions(+)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 0aa6bf1e3b692..0a4592e7020bd 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -430,6 +430,40 @@ validation that allows a finer control on the number of iterations and
 the proportion of samples in on each side of the train / test split.
 
 
+Shuffle-Labels-Out
+------------------
+
+:class:`ShuffleLabelsOut`
+
+The :class:`ShuffleLabelsOut` iterator behaves as a combination of
+:class:`ShuffleSplit` and :class:`LeavePLabelOut`, and generates a
+sequence of randomized partitions in which a subset of labels are held
+out for each split.
+
+Here is a usage example::
+
+  >>> from sklearn.cross_validation import ShuffleLabelsOut
+  
+  >>> labels = [1, 1, 2, 2, 3, 3, 4, 4]
+  >>> slo = ShuffleLabelsOut(labels, n_iter=4, test_size=0.5,
+  ...                        random_state=0)
+  >>> for train, test in slo:
+  ...     print("%s %s" % (train, test))
+  ...
+  [0 1 2 3] [4 5 6 7]
+  [2 3 6 7] [0 1 4 5]
+  [2 3 4 5] [0 1 6 7]
+  [4 5 6 7] [0 1 2 3]
+
+This class is useful when the behavior of :class:`LeavePLabelOut` is
+desired, but the number of labels is large enough that generating all
+possible partitions with :math:`P` labels withheld would be prohibitively
+expensive.  In such a scenario, :class:`ShuffleLabelsOut` provides
+a random sample (with replacement) of the train / test splits
+generated by :class:`LeavePLabelOut`.
+
+
+
 Predefined Fold-Splits / Validation-Sets
 ----------------------------------------
 
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 41ba47868fa00..634a66619cca3 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -13,6 +13,10 @@ Changelog
 New features
 ............
 
+   - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits,
+     similer to `ShuffleSplit`, except that the splits are conditioned on a label array.
+     By `Brian McFee`_.
+
 Enhancements
 ............
 
@@ -3445,3 +3449,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Eric Martin: http://ericmart.in
 
 .. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241
+
+.. _Brian McFee: https://bmcfee.github.io
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 85e0229a400bb..18d7740ef2e1a 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -960,6 +960,13 @@ class ShuffleLabelsOut(ShuffleSplit):
     whereas ShuffleLabelsOut generates a user-determined number of random
     test splits, each with a user-determined fraction of unique labels.
 
+    For example, a less computationally intensive alternative to
+    `LeavePLabelOut(labels, p=10)` would be
+    `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`.
+
+    Note: The parameters `test_size` and `train_size` refer to labels, and not
+    to samples, as in ShuffleSplit.
+
 
     Parameters
     ----------

From 13c15fcda493143e5daa05172d226d994b20c9a4 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Thu, 13 Aug 2015 09:45:59 -0400
Subject: [PATCH 08/21] updated ShuffleLabelsOut in whats_new

---
 doc/whats_new.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 634a66619cca3..a72ce275e9f9d 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -14,8 +14,8 @@ New features
 ............
 
    - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits,
-     similer to `ShuffleSplit`, except that the splits are conditioned on a label array.
-     By `Brian McFee`_.
+     similar to :class:`cross_validation.ShuffleSplit`, except that the splits are 
+     conditioned on a label array. By `Brian McFee`_.
 
 Enhancements
 ............

From 56698920ff467c19acc56074c3ac3bc07ddf5f2b Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Thu, 13 Aug 2015 09:47:38 -0400
Subject: [PATCH 09/21] ShuffleLabelsOut updated docstring for double-backtick

---
 sklearn/cross_validation.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 18d7740ef2e1a..46a0f1409ecdb 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -956,16 +956,16 @@ class ShuffleLabelsOut(ShuffleSplit):
     and thus allow for cross-validation against time-based splits.
 
     The difference between LeavePLabelOut and ShuffleLabelsOut is that
-    the former generates splits using all subsets of size `p` unique labels,
+    the former generates splits using all subsets of size ``p`` unique labels,
     whereas ShuffleLabelsOut generates a user-determined number of random
     test splits, each with a user-determined fraction of unique labels.
 
     For example, a less computationally intensive alternative to
-    `LeavePLabelOut(labels, p=10)` would be
-    `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`.
+    ``LeavePLabelOut(labels, p=10)`` would be
+    ``ShuffleLabelsOut(labels, test_size=10, n_iter=100)``.
 
-    Note: The parameters `test_size` and `train_size` refer to labels, and not
-    to samples, as in ShuffleSplit.
+    Note: The parameters ``test_size`` and ``train_size`` refer to labels,
+    not to samples as they do in ShuffleSplit.
 
 
     Parameters

From f0c4a7507e18def166f419b02ea33d7b0e382c8c Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Mon, 13 Apr 2015 11:12:53 -0400
Subject: [PATCH 10/21] added ShuffleLabelsOut cv iterator

---
 sklearn/cross_validation.py | 77 +++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index fa7c7f210bc05..9d5a8c65fb84f 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -41,6 +41,7 @@
            'StratifiedKFold',
            'StratifiedShuffleSplit',
            'PredefinedSplit',
+           'ShuffleLabelsOut',
            'check_cv',
            'cross_val_score',
            'cross_val_predict',
@@ -962,6 +963,82 @@ def __len__(self):
         return len(self.unique_folds)
 
 
+class ShuffleLabelsOut(ShuffleSplit):
+    '''Shuffle-Labels-Out cross-validation iterator
+
+    Provides randomized train/test indices to split data according to a
+    third-party provided label. This label information can be used to encode
+    arbitrary domain specific stratifications of the samples as integers.
+
+    For instance the labels could be the year of collection of the samples
+    and thus allow for cross-validation against time-based splits.
+
+    The difference between LeavePLabelOut and ShuffleLabelsOut is that
+    the former generates splits using all subsets of size `p` unique labels,
+    whereas ShuffleLabelsOut generates a user-determined number of random
+    test splits, each with `p` unique labels.
+
+
+    Parameters
+    ----------
+    y :  array, [n_samples]
+        Labels of samples
+
+    n_iter : int (default 5)
+        Number of re-shuffling & splitting iterations.
+
+    test_size : float (default 0.2), int, or None
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the test split. If
+        int, represents the absolute number of test labels. If None,
+        the value is automatically set to the complement of the train size.
+
+    train_size : float, int, or None (default is None)
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the labels to include in the train split. If
+        int, represents the absolute number of train labels. If None,
+        the value is automatically set to the complement of the test size.
+
+    random_state : int or RandomState
+        Pseudo-random number generator state used for random sampling.
+    '''
+
+    def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
+                 random_state=None):
+
+        classes, y_indices = np.unique(y, return_inverse=True)
+
+        super(ShuffleLabelsOut, self).__init__(
+            len(classes), n_iter, test_size, train_size, random_state)
+
+        self.classes = classes
+        self.y_indices = y_indices
+
+    def __repr__(self):
+        return ('%s(labels=%s, n_iter=%d, test_size=%s, '
+                'random_state=%s)' % (
+                    self.__class__.__name__,
+                    self.y,
+                    self.n_iter,
+                    str(self.test_size),
+                    self.random_state,
+                ))
+
+    def __len__(self):
+        return self.n_iter
+
+    def _iter_indices(self):
+
+        for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices():
+            # these are the indices of classes in the partition
+            # invert them into data indices
+
+            train = np.flatnonzero(np.in1d(self.y_indices, y_train))
+            test = np.flatnonzero(np.in1d(self.y_indices, y_test))
+
+            yield train, test
+
+
 ##############################################################################
 def _index_param_value(X, v, indices):
     """Private helper function for parameter value indexing."""

From d668f3b2660ed9cf72c2e9974a26acc1318686d4 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Mon, 13 Apr 2015 11:37:54 -0400
Subject: [PATCH 11/21] fixed tests for shufflelabelsout

---
 sklearn/tests/test_cross_validation.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index b33e2b4c279d5..550816f530641 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -483,6 +483,29 @@ def test_predefinedsplit_with_kfold_split():
     assert_array_equal(ps_test, kf_test)
 
 
+def test_shuffle_labels_out():
+    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
+          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
+          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
+          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
+          ]
+
+    for y in ys:
+        slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33,
+                                    random_state=0)
+
+        for train, test in slo:
+            # First test: no train label is in the test set and vice versa
+            assert_false(np.any(np.in1d(y[train], np.unique(y[test]))))
+            assert_false(np.any(np.in1d(y[test], np.unique(y[train]))))
+
+            # Second test: train and test add up to all the data
+            assert_equal(y[train].size + y[test].size, y.size)
+
+            # Third test: train and test are disjoint
+            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
+
+
 def test_leave_label_out_changing_labels():
     # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
     # the labels variable is changed before calling __iter__

From 068cb38a8aba7b13a583c7d9c13edd2e585aaf05 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Mon, 13 Apr 2015 15:39:09 -0400
Subject: [PATCH 12/21] updated docstring

---
 sklearn/cross_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 9d5a8c65fb84f..10d024e9c8ae1 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -976,7 +976,7 @@ class ShuffleLabelsOut(ShuffleSplit):
     The difference between LeavePLabelOut and ShuffleLabelsOut is that
     the former generates splits using all subsets of size `p` unique labels,
     whereas ShuffleLabelsOut generates a user-determined number of random
-    test splits, each with `p` unique labels.
+    test splits, each with a user-determined fraction of unique labels.
 
 
     Parameters

From 74c265774a8c4e978020a19b70ec42765adda747 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Sat, 18 Apr 2015 11:48:44 -0400
Subject: [PATCH 13/21] Fixed an error in call to the super constructor

---
 sklearn/cross_validation.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 10d024e9c8ae1..0fb7e5f6beb64 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -1009,7 +1009,11 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
         classes, y_indices = np.unique(y, return_inverse=True)
 
         super(ShuffleLabelsOut, self).__init__(
-            len(classes), n_iter, test_size, train_size, random_state)
+            len(classes),
+            n_iter=n_iter,
+            test_size=test_size,
+            train_size=train_size,
+            random_state=random_state)
 
         self.classes = classes
         self.y_indices = y_indices

From 41d3704b4375ff880e1441ffdd421d09f0d230a7 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Tue, 12 May 2015 19:41:21 -0400
Subject: [PATCH 14/21] fixed repr, variable names in ShuffleLabelsOut

---
 sklearn/cross_validation.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 0fb7e5f6beb64..ebd24e9498419 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -981,7 +981,7 @@ class ShuffleLabelsOut(ShuffleSplit):
 
     Parameters
     ----------
-    y :  array, [n_samples]
+    labels :  array, [n_samples]
         Labels of samples
 
     n_iter : int (default 5)
@@ -1003,10 +1003,10 @@ class ShuffleLabelsOut(ShuffleSplit):
         Pseudo-random number generator state used for random sampling.
     '''
 
-    def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
+    def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
                  random_state=None):
 
-        classes, y_indices = np.unique(y, return_inverse=True)
+        classes, label_indices = np.unique(labels, return_inverse=True)
 
         super(ShuffleLabelsOut, self).__init__(
             len(classes),
@@ -1015,14 +1015,15 @@ def __init__(self, y, n_iter=5, test_size=0.2, train_size=None,
             train_size=train_size,
             random_state=random_state)
 
+        self.labels = labels
         self.classes = classes
-        self.y_indices = y_indices
+        self.label_indices = label_indices
 
     def __repr__(self):
         return ('%s(labels=%s, n_iter=%d, test_size=%s, '
                 'random_state=%s)' % (
                     self.__class__.__name__,
-                    self.y,
+                    self.labels,
                     self.n_iter,
                     str(self.test_size),
                     self.random_state,
@@ -1033,12 +1034,12 @@ def __len__(self):
 
     def _iter_indices(self):
 
-        for y_train, y_test in super(ShuffleLabelsOut, self)._iter_indices():
+        for label_train, label_test in super(ShuffleLabelsOut, self)._iter_indices():
             # these are the indices of classes in the partition
             # invert them into data indices
 
-            train = np.flatnonzero(np.in1d(self.y_indices, y_train))
-            test = np.flatnonzero(np.in1d(self.y_indices, y_test))
+            train = np.flatnonzero(np.in1d(self.label_indices, label_train))
+            test = np.flatnonzero(np.in1d(self.label_indices, label_test))
 
             yield train, test
 

From 085f38177d84554c3abd368f57d1b4fbd5a4e7df Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Tue, 12 May 2015 19:46:43 -0400
Subject: [PATCH 15/21] added length and repr tests to ShuffleLabelsOut

---
 sklearn/tests/test_cross_validation.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 550816f530641..814525a8b87e8 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -491,9 +491,16 @@ def test_shuffle_labels_out():
           ]
 
     for y in ys:
-        slo = cval.ShuffleLabelsOut(y, 6, test_size=0.33,
+        n_iter = 6
+        slo = cval.ShuffleLabelsOut(y, n_iter, test_size=0.33,
                                     random_state=0)
 
+        # Make sure the repr works
+        repr(slo)
+
+        # Test that the length is correct
+        assert_equal(len(slo), n_iter)
+
         for train, test in slo:
             # First test: no train label is in the test set and vice versa
             assert_false(np.any(np.in1d(y[train], np.unique(y[test]))))

From 915144a95dd7cd959e4cb9b10497a34e77a95756 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Tue, 23 Jun 2015 13:56:36 -0400
Subject: [PATCH 16/21] added documentation for ShuffleLabelsOut

---
 doc/modules/cross_validation.rst | 34 ++++++++++++++++++++++++++++++++
 doc/whats_new.rst                |  6 ++++++
 sklearn/cross_validation.py      |  7 +++++++
 3 files changed, 47 insertions(+)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 9c25cfb417fd4..093ecd9065c7e 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -430,6 +430,40 @@ validation that allows a finer control on the number of iterations and
 the proportion of samples in on each side of the train / test split.
 
 
+Shuffle-Labels-Out
+------------------
+
+:class:`ShuffleLabelsOut`
+
+The :class:`ShuffleLabelsOut` iterator behaves as a combination of
+:class:`ShuffleSplit` and :class:`LeavePLabelOut`, and generates a
+sequence of randomized partitions in which a subset of labels are held
+out for each split.
+
+Here is a usage example::
+
+  >>> from sklearn.cross_validation import ShuffleLabelsOut
+  
+  >>> labels = [1, 1, 2, 2, 3, 3, 4, 4]
+  >>> slo = ShuffleLabelsOut(labels, n_iter=4, test_size=0.5,
+  ...                        random_state=0)
+  >>> for train, test in slo:
+  ...     print("%s %s" % (train, test))
+  ...
+  [0 1 2 3] [4 5 6 7]
+  [2 3 6 7] [0 1 4 5]
+  [2 3 4 5] [0 1 6 7]
+  [4 5 6 7] [0 1 2 3]
+
+This class is useful when the behavior of :class:`LeavePLabelOut` is
+desired, but the number of labels is large enough that generating all
+possible partitions with :math:`P` labels withheld would be prohibitively
+expensive.  In such a scenario, :class:`ShuffleLabelsOut` provides
+a random sample (with replacement) of the train / test splits
+generated by :class:`LeavePLabelOut`.
+
+
+
 Predefined Fold-Splits / Validation-Sets
 ----------------------------------------
 
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index e45e208890ee5..5d9b61e3f7f45 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -33,6 +33,10 @@ New features
      function into a ``Pipeline``-compatible transformer object.
      By Joe Jevnik.
 
+   - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits,
+     similer to `ShuffleSplit`, except that the splits are conditioned on a label array.
+     By `Brian McFee`_.
+
 Enhancements
 ............
 
@@ -3574,3 +3578,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Thomas Unterthiner: https://github.com/untom
 
 .. _Loic Esteve: https://github.com/lesteve
+
+.. _Brian McFee: https://bmcfee.github.io
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index ebd24e9498419..a13be82ee59b0 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -978,6 +978,13 @@ class ShuffleLabelsOut(ShuffleSplit):
     whereas ShuffleLabelsOut generates a user-determined number of random
     test splits, each with a user-determined fraction of unique labels.
 
+    For example, a less computationally intensive alternative to
+    `LeavePLabelOut(labels, p=10)` would be
+    `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`.
+
+    Note: The parameters `test_size` and `train_size` refer to labels, and not
+    to samples, as in ShuffleSplit.
+
 
     Parameters
     ----------

From e2a5ad736d5ba24ecd24583de104a6bc35500060 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Thu, 13 Aug 2015 09:45:59 -0400
Subject: [PATCH 17/21] updated ShuffleLabelsOut in whats_new

---
 doc/whats_new.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 5d9b61e3f7f45..602c5258541c1 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -34,8 +34,8 @@ New features
      By Joe Jevnik.
 
    - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits,
-     similer to `ShuffleSplit`, except that the splits are conditioned on a label array.
-     By `Brian McFee`_.
+     similar to :class:`cross_validation.ShuffleSplit`, except that the splits are 
+     conditioned on a label array. By `Brian McFee`_.
 
 Enhancements
 ............

From e2b2a6e82307fbc00b6e8f8772ff2199d9cdf947 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Thu, 13 Aug 2015 09:47:38 -0400
Subject: [PATCH 18/21] ShuffleLabelsOut updated docstring for double-backtick

---
 sklearn/cross_validation.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index a13be82ee59b0..64c795700fa35 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -974,16 +974,16 @@ class ShuffleLabelsOut(ShuffleSplit):
     and thus allow for cross-validation against time-based splits.
 
     The difference between LeavePLabelOut and ShuffleLabelsOut is that
-    the former generates splits using all subsets of size `p` unique labels,
+    the former generates splits using all subsets of size ``p`` unique labels,
     whereas ShuffleLabelsOut generates a user-determined number of random
     test splits, each with a user-determined fraction of unique labels.
 
     For example, a less computationally intensive alternative to
-    `LeavePLabelOut(labels, p=10)` would be
-    `ShuffleLabelsOut(labels, test_size=10, n_iter=100)`.
+    ``LeavePLabelOut(labels, p=10)`` would be
+    ``ShuffleLabelsOut(labels, test_size=10, n_iter=100)``.
 
-    Note: The parameters `test_size` and `train_size` refer to labels, and not
-    to samples, as in ShuffleSplit.
+    Note: The parameters ``test_size`` and ``train_size`` refer to labels, and 
+    not to samples, as in ShuffleSplit.
 
 
     Parameters

From ab74b1dba890698b39289e659b4ecab48619a5cd Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Thu, 13 Aug 2015 14:14:55 -0400
Subject: [PATCH 19/21] renamed ShuffleLabelsOut to LabelShuffleSplit

---
 doc/modules/cross_validation.rst       | 10 +++++-----
 doc/whats_new.rst                      |  2 +-
 sklearn/cross_validation.py            | 14 +++++++-------
 sklearn/tests/test_cross_validation.py |  6 +++---
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 093ecd9065c7e..1b45da9de9771 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -433,19 +433,19 @@ the proportion of samples in on each side of the train / test split.
 Shuffle-Labels-Out
 ------------------
 
-:class:`ShuffleLabelsOut`
+:class:`LabelShuffleSplit`
 
-The :class:`ShuffleLabelsOut` iterator behaves as a combination of 
+The :class:`LabelShuffleSplit` iterator behaves as a combination of 
 :class:`ShuffleSplit` and :class:`LeavePLabelsOut`, and generates a 
 sequence of randomized partitions in which a subset of labels are held
 out for each split.
 
 Here is a usage example::
 
-  >>> from sklearn.cross_validation import ShuffleLabelsOut
+  >>> from sklearn.cross_validation import LabelShuffleSplit
   
   >>> labels = [1, 1, 2, 2, 3, 3, 4, 4]
-  >>> slo = ShuffleLabelsOut(labels, n_iter=4, test_size=0.5,
+  >>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5,
   ...                        random_state=0)
   >>> for train, test in slo:
   ...     print("%s %s" % (train, test))
@@ -458,7 +458,7 @@ Here is a usage example::
 This class is useful when the behavior of :class:`LeavePLabelsOut` is
 desired, but the number of labels is large enough that generating all
 possible partitions with :math:`P` labels withheld would be prohibitively
-expensive.  In such a scenario, :class:`ShuffleLabelsOut` provides
+expensive.  In such a scenario, :class:`LabelShuffleSplit` provides
 a random sample (with replacement) of the train / test splits
 generated by :class:`LeavePLabelsOut`.
 
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 602c5258541c1..bb719018c8e82 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -33,7 +33,7 @@ New features
      function into a ``Pipeline``-compatible transformer object.
      By Joe Jevnik.
 
-   - :class:`cross_validation.ShuffleLabelsOut` generates random train-test splits,
+   - :class:`cross_validation.LabelShuffleSplit` generates random train-test splits,
      similar to :class:`cross_validation.ShuffleSplit`, except that the splits are 
      conditioned on a label array. By `Brian McFee`_.
 
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 64c795700fa35..011bf4d559cff 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -41,7 +41,7 @@
            'StratifiedKFold',
            'StratifiedShuffleSplit',
            'PredefinedSplit',
-           'ShuffleLabelsOut',
+           'LabelShuffleSplit',
            'check_cv',
            'cross_val_score',
            'cross_val_predict',
@@ -963,7 +963,7 @@ def __len__(self):
         return len(self.unique_folds)
 
 
-class ShuffleLabelsOut(ShuffleSplit):
+class LabelShuffleSplit(ShuffleSplit):
     '''Shuffle-Labels-Out cross-validation iterator
 
     Provides randomized train/test indices to split data according to a
@@ -973,14 +973,14 @@ class ShuffleLabelsOut(ShuffleSplit):
     For instance the labels could be the year of collection of the samples
     and thus allow for cross-validation against time-based splits.
 
-    The difference between LeavePLabelOut and ShuffleLabelsOut is that
+    The difference between LeavePLabelOut and LabelShuffleSplit is that
     the former generates splits using all subsets of size ``p`` unique labels,
-    whereas ShuffleLabelsOut generates a user-determined number of random
+    whereas LabelShuffleSplit generates a user-determined number of random
     test splits, each with a user-determined fraction of unique labels.
 
     For example, a less computationally intensive alternative to
     ``LeavePLabelOut(labels, p=10)`` would be
-    ``ShuffleLabelsOut(labels, test_size=10, n_iter=100)``.
+    ``LabelShuffleSplit(labels, test_size=10, n_iter=100)``.
 
     Note: The parameters ``test_size`` and ``train_size`` refer to labels, and 
     not to samples, as in ShuffleSplit.
@@ -1015,7 +1015,7 @@ def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
 
         classes, label_indices = np.unique(labels, return_inverse=True)
 
-        super(ShuffleLabelsOut, self).__init__(
+        super(LabelShuffleSplit, self).__init__(
             len(classes),
             n_iter=n_iter,
             test_size=test_size,
@@ -1041,7 +1041,7 @@ def __len__(self):
 
     def _iter_indices(self):
 
-        for label_train, label_test in super(ShuffleLabelsOut, self)._iter_indices():
+        for label_train, label_test in super(LabelShuffleSplit, self)._iter_indices():
             # these are the indices of classes in the partition
             # invert them into data indices
 
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 814525a8b87e8..1cf67128eb94e 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -483,7 +483,7 @@ def test_predefinedsplit_with_kfold_split():
     assert_array_equal(ps_test, kf_test)
 
 
-def test_shuffle_labels_out():
+def test_label_shuffle_split():
     ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
           np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
           np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
@@ -492,8 +492,8 @@ def test_shuffle_labels_out():
 
     for y in ys:
         n_iter = 6
-        slo = cval.ShuffleLabelsOut(y, n_iter, test_size=0.33,
-                                    random_state=0)
+        slo = cval.LabelShuffleSplit(y, n_iter, test_size=0.33,
+                                     random_state=0)
 
         # Make sure the repr works
         repr(slo)

From 99c66f0b093e27dc8ac86eae935b0adbf5529e8e Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Thu, 13 Aug 2015 14:30:27 -0400
Subject: [PATCH 20/21] LabelShuffleSplit test checks for proper ratios

---
 sklearn/tests/test_cross_validation.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index 1cf67128eb94e..5d068b0782837 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -492,7 +492,8 @@ def test_label_shuffle_split():
 
     for y in ys:
         n_iter = 6
-        slo = cval.LabelShuffleSplit(y, n_iter, test_size=0.33,
+        test_size = 1./3
+        slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size,
                                      random_state=0)
 
         # Make sure the repr works
@@ -501,10 +502,14 @@ def test_label_shuffle_split():
         # Test that the length is correct
         assert_equal(len(slo), n_iter)
 
+        y_unique = np.unique(y)
+
         for train, test in slo:
             # First test: no train label is in the test set and vice versa
-            assert_false(np.any(np.in1d(y[train], np.unique(y[test]))))
-            assert_false(np.any(np.in1d(y[test], np.unique(y[train]))))
+            y_train_unique = np.unique(y[train])
+            y_test_unique = np.unique(y[test])
+            assert_false(np.any(np.in1d(y[train], y_test_unique)))
+            assert_false(np.any(np.in1d(y[test], y_train_unique)))
 
             # Second test: train and test add up to all the data
             assert_equal(y[train].size + y[test].size, y.size)
@@ -512,6 +517,10 @@ def test_label_shuffle_split():
             # Third test: train and test are disjoint
             assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
 
+            # Fourth test: unique train/test label counts are correct, +-1 for rounding error
+            assert_true(abs(len(y_test_unique) - round(test_size * len(y_unique))) <= 1)
+            assert_true(abs(len(y_train_unique) - round((1.0 - test_size) * len(y_unique))) <= 1)
+
 
 def test_leave_label_out_changing_labels():
     # Check that LeaveOneLabelOut and LeavePLabelOut work normally if

From 98906df9be0771d0ca16b3032fc25da278a7e0d4 Mon Sep 17 00:00:00 2001
From: Brian McFee <brian.mcfee@nyu.edu>
Date: Thu, 13 Aug 2015 17:23:46 -0400
Subject: [PATCH 21/21] LabelShuffleSplit documentation header fix

---
 doc/modules/cross_validation.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 1b45da9de9771..53afdf53550b1 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -430,8 +430,8 @@ validation that allows a finer control on the number of iterations and
 the proportion of samples in on each side of the train / test split.
 
 
-Shuffle-Labels-Out
-------------------
+Label-Shuffle-Split
+-------------------
 
 :class:`LabelShuffleSplit`