8000 [WIP] LabelKFold: balance folds without sorting by andreasvc · Pull Request #5396 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[WIP] LabelKFold: balance folds without sorting #5396

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/modules/cross_validation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,9 @@ Imagine you have three subjects, each with an associated number from 1 to 3::
>>> lkf = LabelKFold(labels, n_folds=3)
>>> for train, test in lkf:
... print("%s %s" % (train, test))
[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]
[0 1 2 6 7 8 9] [3 4 5]
[0 1 2 3 4 5] [6 7 8 9]

Each subject is in a different testing fold, and the same subject is never in
both testing and training. Notice that the folds do not have exactly the same
Expand Down
57 changes: 32 additions & 25 deletions sklearn/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,11 @@ class LabelKFold(_BaseKFold):
The folds are approximately balanced in the sense that the number of
distinct labels is approximately the same in each fold.

When ``shuffle`` is ``False``, the labels are distributed over folds
according to the order in which they first appear in ``labels``. This makes
it possible to get approximately stratified folds by sorting the samples on
an attribute beforehand.

Parameters
----------
labels : array-like with shape (n_samples, )
Expand Down Expand Up @@ -385,14 +390,14 @@ class LabelKFold(_BaseKFold):
... y_train, y_test = y[train_index], y[test_index]
... print(X_train, X_test, y_train, y_test)
...
TRAIN: [0 1] TEST: [2 3]
[[1 2]
[3 4]] [[5 6]
[7 8]] [1 2] [3 4]
TRAIN: [2 3] TEST: [0 1]
[[5 6]
[7 8]] [[1 2]
[3 4]] [3 4] [1 2]
TRAIN: [0 1] TEST: [2 3]
[[1 2]
[3 4]] [[5 6]
[7 8]] [1 2] [3 4]

See also
--------
Expand All @@ -403,7 +408,12 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
super(LabelKFold, self).__init__(len(labels), n_folds, shuffle,
random_state)

unique_labels, labels = np.unique(labels, return_inverse=True)
unique_labels, unique_inverse = np.unique(
labels, return_inverse=True)
# separate call to get unique_indices to work around bug in Numpy 1.6.2
# https://github.com/numpy/numpy/issues/2785
_unique_labels, unique_indices = np.unique(
unique_inverse, return_index=True)
n_labels = len(unique_labels)

if n_folds > n_labels:
Expand All @@ -412,36 +422,33 @@ def __init__(self, labels, n_folds=3, shuffle=False, random_state=None):
" than the number of labels: {1}.").format(n_folds,
n_labels))

# np.unique gives labels in sorted order; this maps the
# indices of labels to their order of first occurrence
ordering = np.argsort(unique_indices)

if shuffle:
# In case of ties in label weights, label names are indirectly
# used to assign samples to folds. When shuffle=True, label names
# are randomized to obtain random fold assigments.
# When shuffle=True, the order of labels is randomized to obtain
# random fold assigments.
rng = check_random_state(self.random_state)
unique_labels = np.arange(n_labels, dtype=np.int)
rng.shuffle(unique_labels)
labels = unique_labels[labels]
unique_labels, labels = np.unique(labels, return_inverse=True)
rng.shuffle(ordering)

# Weight labels by their number of occurences
n_samples_per_label = np.bincount(labels)

# Distribute the most frequent labels first
indices = np.argsort(n_samples_per_label)[::-1]
n_samples_per_label = n_samples_per_label[indices]
n_samples_per_label = np.bincount(unique_inverse)

# Total weight of each fold
n_samples_per_fold = np.zeros(n_folds)
n_samples_per_fold = np.zeros(n_folds, dtype=np.intp)

# Mapping from label index to fold index
label_to_fold = np.zeros(len(unique_labels))
label_to_fold = np.zeros(n_labels, dtype=np.intp)

# Distribute samples by adding the largest weight to the lightest fold
for label_index, weight in enumerate(n_samples_per_label):
lightest_fold = np.argmin(n_samples_per_fold)
n_samples_per_fold[lightest_fold] += weight
label_to_fold[indices[label_index]] = lightest_fold
# Distribute samples by adding labels to the fold with the least number
# of samples at each iteration
for n in ordering:
fold = np.argmin(n_samples_per_fold)
n_samples_per_fold[fold] += n_samples_per_label[n]
label_to_fold[n] = fold

self.idxs = label_to_fold[labels]
self.idxs = label_to_fold[unique_inverse]

def _iter_test_indices(self):
for f in range(self.n_folds):
Expand Down
19 changes: 13 additions & 6 deletions sklearn/tests/test_cross_validation.py
5728
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,7 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372
assert_greater(mean_score, 0.85)


def check_label_kfold(shuffle):
rng = np.random.RandomState(0)
def check_label_kfold(shuffle, rng):

# Parameters of the test
n_labels = 15
Expand All @@ -369,7 +368,7 @@ def check_label_kfold(shuffle):

# Construct the test data
tolerance = 0.05 * n_samples # 5 percent error allowed
labels = rng.randint(0, n_labels, n_samples)
labels = np.random.RandomState(rng).randint(0, n_labels, n_samples)
folds = cval.LabelKFold(labels,
n_folds=n_folds,
shuffle=shuffle,
Expand Down Expand Up @@ -407,13 +406,20 @@ def check_label_kfold(shuffle):
n_labels = len(np.unique(labels))
n_samples = len(labels)
n_folds = 5
tolerance = 0.05 * n_samples # 5 percent error allowed
tolerance = 0.1 * n_samples # 10 percent error allowed
folds = cval.LabelKFold(labels,
n_folds=n_folds,
shuffle=shuffle,
random_state=rng).idxs
ideal_n_labels_per_fold = n_samples // n_folds

# Shuffle should have an effect
otherfolds = cval.LabelKFold(labels,
n_folds=n_folds,
shuffle=not shuffle,
random_state=rng).idxs
assert_not_equal(list(folds), list(otherfolds))

# Check that folds have approximately the same size
assert_equal(len(folds), len(labels))
for i in np.unique(folds):
Expand All @@ -437,8 +443,9 @@ def check_label_kfold(shuffle):


def test_label_kfold():
for shuffle in [False, True]:
yield check_label_kfold, shuffle
yield check_label_kfold, False, 0
for random_state in range(3):
yield check_label_kfold, True, random_state


def test_shuffle_split():
Expand Down
0