scikit-learn · andreasvc · Oct 13, 2015 · Oct 13, 2015 · Oct 13, 2015 · Oct 13, 2015
diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
@@ -279,9 +279,9 @@ Imagine you have three subjects, each with an associated number from 1 to 3::
   >>> lkf = LabelKFold(labels, n_folds=3)
   >>> for train, test in lkf:
   ...     print("%s %s" % (train, test))
-  [0 1 2 3 4 5] [6 7 8 9]
-  [0 1 2 6 7 8 9] [3 4 5]
   [3 4 5 6 7 8 9] [0 1 2]
+  [0 1 2 6 7 8 9] [3 4 5]
+  [0 1 2 3 4 5] [6 7 8 9]
 
 Each subject is in a different testing fold, and the same subject is never in
 both testing and training. Notice that the folds do not have exactly the same

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
@@ -351,6 +351,11 @@ class LabelKFold(_BaseKFold):
     The folds are approximately balanced in the sense that the number of
     distinct labels is approximately the same in each fold.
 
+    When ``shuffle`` is ``False``, the labels are distributed over folds
+    according to the order in which they first appear in ``labels``. This makes
+    it possible to get approximately stratified folds by sorting the samples on
+    an attribute beforehand.
+
     Parameters
     ----------
     labels : array-like with shape (n_samples, )
@@ -385,14 +390,14 @@ class LabelKFold(_BaseKFold):
     ...     y_train, y_test = y[train_index], y[test_index]
     ...     print(X_train, X_test, y_train, y_test)
     ...
-    TRAIN: [0 1] TEST: [2 3]
-    [[1 2]
-     [3 4]] [[5 6]
-     [7 8]] [1 2] [3 4]
     TRAIN: [2 3] TEST: [0 1]
     [[5 6]
      [7 8]] [[1 2]
      [3 4]] [3 4] [1 2]
+    TRAIN: [0 1] TEST: [2 3]
+    [[1 2]
+     [3 4]] [[5 6]
+     [7 8]] [1 2] [3 4]
 
     See also
     --------
@@ -403,7 +408,12 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
         super(LabelKFold, self).__init__(len(labels), n_folds, shuffle,
                                          random_state)
 
-        unique_labels, labels = np.unique(labels, return_inverse=True)
+        unique_labels, unique_inverse = np.unique(
+                labels, return_inverse=True)
+        # separate call to get unique_indices to work around bug in Numpy 1.6.2
+        # https://github.com/numpy/numpy/issues/2785
+        _unique_labels, unique_indices = np.unique(
+                unique_inverse, return_index=True)
         n_labels = len(unique_labels)
 
         if n_folds > n_labels:
@@ -412,36 +422,33 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
                      " than the number of labels: {1}.").format(n_folds,
                                                                 n_labels))
 
+        # np.unique gives labels in sorted order; this maps the
+        # indices of labels to their order of first occurrence
+        ordering = np.argsort(unique_indices)
+
         if shuffle:
-            # In case of ties in label weights, label names are indirectly
-            # used to assign samples to folds. When shuffle=True, label names
-            # are randomized to obtain random fold assigments.
+            # When shuffle=True, the order of labels is randomized to obtain
+            # random fold assigments.
             rng = check_random_state(self.random_state)
-            unique_labels = np.arange(n_labels, dtype=np.int)
-            rng.shuffle(unique_labels)
-            labels = unique_labels[labels]
-            unique_labels, labels = np.unique(labels, return_inverse=True)
+            rng.shuffle(ordering)
 
         # Weight labels by their number of occurences
-        n_samples_per_label = np.bincount(labels)
-
-        # Distribute the most frequent labels first
-        indices = np.argsort(n_samples_per_label)[::-1]
-        n_samples_per_label = n_samples_per_label[indices]
+        n_samples_per_label = np.bincount(unique_inverse)
 
         # Total weight of each fold
-        n_samples_per_fold = np.zeros(n_folds)
+        n_samples_per_fold = np.zeros(n_folds, dtype=np.intp)
 
         # Mapping from label index to fold index
-        label_to_fold = np.zeros(len(unique_labels))
+        label_to_fold = np.zeros(n_labels, dtype=np.intp)
 
-        # Distribute samples by adding the largest weight to the lightest fold
-        for label_index, weight in enumerate(n_samples_per_label):
-            lightest_fold = np.argmin(n_samples_per_fold)
-            n_samples_per_fold[lightest_fold] += weight
-            label_to_fold[indices[label_index]] = lightest_fold
+        # Distribute samples by adding labels to the fold with the least number
+        # of samples at each iteration
+        for n in ordering:
+            fold = np.argmin(n_samples_per_fold)
+            n_samples_per_fold[fold] += n_samples_per_label[n]
+            label_to_fold[n] = fold
 
-        self.idxs = label_to_fold[labels]
+        self.idxs = label_to_fold[unique_inverse]
 
     def _iter_test_indices(self):
         for f in range(self.n_folds):

diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
@@ -359,8 +359,7 @@ def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
     assert_greater(mean_score, 0.85)
 
 
-def check_label_kfold(shuffle):
-    rng = np.random.RandomState(0)
+def check_label_kfold(shuffle, rng):
 
     # Parameters of the test
     n_labels = 15
@@ -369,7 +368,7 @@ def check_label_kfold(shuffle):
 
     # Construct the test data
     tolerance = 0.05 * n_samples  # 5 percent error allowed
-    labels = rng.randint(0, n_labels, n_samples)
+    labels = np.random.RandomState(rng).randint(0, n_labels, n_samples)
     folds = cval.LabelKFold(labels,
                             n_folds=n_folds,
                             shuffle=shuffle,
@@ -407,13 +406,20 @@ def check_label_kfold(shuffle):
     n_labels = len(np.unique(labels))
     n_samples = len(labels)
     n_folds = 5
-    tolerance = 0.05 * n_samples  # 5 percent error allowed
+    tolerance = 0.1 * n_samples  # 10 percent error allowed
     folds = cval.LabelKFold(labels,
                             n_folds=n_folds,
                             shuffle=shuffle,
                             random_state=rng).idxs
     ideal_n_labels_per_fold = n_samples // n_folds
 
+    # Shuffle should have an effect
+    otherfolds = cval.LabelKFold(labels,
+                            n_folds=n_folds,
+                            shuffle=not shuffle,
+                            random_state=rng).idxs
+    assert_not_equal(list(folds), list(otherfolds))
+
     # Check that folds have approximately the same size
     assert_equal(len(folds), len(labels))
     for i in np.unique(folds):
@@ -437,8 +443,9 @@ def check_label_kfold(shuffle):
 
 
 def test_label_kfold():
-    for shuffle in [False, True]:
-        yield check_label_kfold, shuffle
+    yield check_label_kfold, False, 0
+    for random_state in range(3):
+        yield check_label_kfold, True, random_state
 
 
 def test_shuffle_split():