From 8960b60463881be58f8aa6dc2b1c1ec4401448bd Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Tue, 13 Oct 2015 23:52:38 +0200 Subject: [PATCH 1/5] LabelKFold: balance folds without sorting This changes LabelKFold so that the original or shuffled order of samples is reflected in the folds. Instead of sorting the labels by frequency, balance is achieved just by looking at the smallest fold at each iteration. This means shuffling has an effect beyond tie breaking, and the order of samples can be used as a simple way of achieving stratification. Closes #5390; see also #5300 --- sklearn/cross_validation.py | 53 ++++++++++++++------------ sklearn/tests/test_cross_validation.py | 15 ++++++-- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 18b55ac69ce85..afd08f1c18550 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -351,6 +351,11 @@ class LabelKFold(_BaseKFold): The folds are approximately balanced in the sense that the number of distinct labels is approximately the same in each fold. + When ``shuffle`` is ``False``, the labels are distributed over folds + according to the order in which they first appear in ``labels``. This makes + it possible to get approximately stratified folds by sorting the samples on + an attribute beforehand. + Parameters ---------- labels : array-like with shape (n_samples, ) @@ -385,14 +390,14 @@ class LabelKFold(_BaseKFold): ... y_train, y_test = y[train_index], y[test_index] ... print(X_train, X_test, y_train, y_test) ... 
- TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] TRAIN: [2 3] TEST: [0 1] [[5 6] [7 8]] [[1 2] [3 4]] [3 4] [1 2] + TRAIN: [0 1] TEST: [2 3] + [[1 2] + [3 4]] [[5 6] + [7 8]] [1 2] [3 4] See also -------- @@ -403,7 +408,8 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None): super(LabelKFold, self).__init__(len(labels), n_folds, shuffle, random_state) - unique_labels, labels = np.unique(labels, return_inverse=True) + unique_labels, unique_indices, unique_inverse = np.unique( + labels, return_index=True, return_inverse=True) n_labels = len(unique_labels) if n_folds > n_labels: @@ -412,36 +418,33 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None): " than the number of labels: {1}.").format(n_folds, n_labels)) + # np.unique gives labels in sorted order; this maps the + # indices of labels to their order of first occurrence + ordering = np.argsort(unique_indices) + if shuffle: - # In case of ties in label weights, label names are indirectly - # used to assign samples to folds. When shuffle=True, label names - # are randomized to obtain random fold assigments. + # When shuffle=True, the order of labels is randomized to obtain + # random fold assignments. 
rng = check_random_state(self.random_state) - unique_labels = np.arange(n_labels, dtype=np.int) - rng.shuffle(unique_labels) - labels = unique_labels[labels] - unique_labels, labels = np.unique(labels, return_inverse=True) + rng.shuffle(ordering) # Weight labels by their number of occurences - n_samples_per_label = np.bincount(labels) - - # Distribute the most frequent labels first - indices = np.argsort(n_samples_per_label)[::-1] - n_samples_per_label = n_samples_per_label[indices] + n_samples_per_label = np.bincount(unique_inverse) # Total weight of each fold - n_samples_per_fold = np.zeros(n_folds) + n_samples_per_fold = np.zeros(n_folds, dtype=np.intp) # Mapping from label index to fold index - label_to_fold = np.zeros(len(unique_labels)) + label_to_fold = np.zeros(n_labels, dtype=np.intp) - # Distribute samples by adding the largest weight to the lightest fold - for label_index, weight in enumerate(n_samples_per_label): - lightest_fold = np.argmin(n_samples_per_fold) - n_samples_per_fold[lightest_fold] += weight - label_to_fold[indices[label_index]] = lightest_fold + # Distribute samples by adding labels to the fold with the least number + # of samples at each iteration + for n in ordering: + fold = np.argmin(n_samples_per_fold) + n_samples_per_fold[fold] += n_samples_per_label[n] + label_to_fold[n] = fold - self.idxs = label_to_fold[labels] + self.idxs = label_to_fold[unique_inverse] def _iter_test_indices(self): for f in range(self.n_folds): diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 2f6cf3142be30..ad413d73ebdc8 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -359,8 +359,7 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 assert_greater(mean_score, 0.85) -def check_label_kfold(shuffle): - rng = np.random.RandomState(0) +def check_label_kfold(shuffle, rng): # Parameters of the test n_labels = 15 @@ -435,10 +434,18 @@ def 
check_label_kfold(shuffle): labels = np.array([1, 1, 1, 2, 2]) assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3) + # shuffle should have an effect + otherfolds = cval.LabelKFold(labels, + n_folds=n_folds, + shuffle=not shuffle, + random_state=rng).idxs + assert_not_equal(folds, otherfolds) + def test_label_kfold(): - for shuffle in [False, True]: - yield check_label_kfold, shuffle + yield check_label_kfold, False, 0 + for random_state in range(3): + yield check_label_kfold, True, random_state def test_shuffle_split(): From a233315bf3709fed8a96c91418e6051928bc2643 Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Wed, 14 Oct 2015 00:20:36 +0200 Subject: [PATCH 2/5] fix random state --- sklearn/tests/test_cross_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index ad413d73ebdc8..c8cc7e425ae26 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -368,7 +368,7 @@ def check_label_kfold(shuffle, rng): # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed - labels = rng.randint(0, n_labels, n_samples) + labels = np.random.RandomState(rng).randint(0, n_labels, n_samples) folds = cval.LabelKFold(labels, n_folds=n_folds, shuffle=shuffle, From 2f214deea282176bfe9507276c1d910228418382 Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Wed, 14 Oct 2015 00:47:05 +0200 Subject: [PATCH 3/5] make tests pass --- sklearn/tests/test_cross_validation.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index c8cc7e425ae26..ef01d3bc7360f 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -406,13 +406,20 @@ def check_label_kfold(shuffle, rng): n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 - 
tolerance = 0.05 * n_samples # 5 percent error allowed + tolerance = 0.1 * n_samples # 10 percent error allowed folds = cval.LabelKFold(labels, n_folds=n_folds, shuffle=shuffle, random_state=rng).idxs ideal_n_labels_per_fold = n_samples // n_folds + # Shuffle should have an effect + otherfolds = cval.LabelKFold(labels, + n_folds=n_folds, + shuffle=not shuffle, + random_state=rng).idxs + assert_not_equal(list(folds), list(otherfolds)) + # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): @@ -434,13 +441,6 @@ def check_label_kfold(shuffle, rng): labels = np.array([1, 1, 1, 2, 2]) assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3) - # shuffle should have an effect - otherfolds = cval.LabelKFold(labels, - n_folds=n_folds, - shuffle=not shuffle, - random_state=rng).idxs - assert_not_equal(folds, otherfolds) - def test_label_kfold(): yield check_label_kfold, False, 0 From 2c9d20211c63d96f29c3c21b6e0cb2a62a27a14b Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Wed, 14 Oct 2015 01:02:00 +0200 Subject: [PATCH 4/5] fix doctest --- doc/modules/cross_validation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index e76b95c6e48be..829b7773ea3c3 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -279,9 +279,9 @@ Imagine you have three subjects, each with an associated number from 1 to 3:: >>> lkf = LabelKFold(labels, n_folds=3) >>> for train, test in lkf: ... print("%s %s" % (train, test)) - [0 1 2 3 4 5] [6 7 8 9] - [0 1 2 6 7 8 9] [3 4 5] [3 4 5 6 7 8 9] [0 1 2] + [0 1 2 6 7 8 9] [3 4 5] + [0 1 2 3 4 5] [6 7 8 9] Each subject is in a different testing fold, and the same subject is never in both testing and training. 
Notice that the folds do not have exactly the same From 026909b7716c3d4fd9beaa7eb65f1cb4484fb332 Mon Sep 17 00:00:00 2001 From: Andreas van Cranenburgh Date: Wed, 14 Oct 2015 01:38:49 +0200 Subject: [PATCH 5/5] work around bug in Numpy 1.6.2 --- sklearn/cross_validation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index afd08f1c18550..c2d077cbc9fa0 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -408,8 +408,12 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None): super(LabelKFold, self).__init__(len(labels), n_folds, shuffle, random_state) - unique_labels, unique_indices, unique_inverse = np.unique( - labels, return_index=True, return_inverse=True) + unique_labels, unique_inverse = np.unique( + labels, return_inverse=True) + # separate call to get unique_indices to work around bug in Numpy 1.6.2 + # https://github.com/numpy/numpy/issues/2785 + _unique_labels, unique_indices = np.unique( + unique_inverse, return_index=True) n_labels = len(unique_labels) if n_folds > n_labels: