10000 ENH: added LabelShuffleSplit cv iterator · scikit-learn/scikit-learn@21a966a · GitHub
[go: up one dir, main page]

Skip to content

Commit 21a966a

Browse files
bmcfeeglouppe
authored andcommitted
ENH: added LabelShuffleSplit cv iterator
1 parent c779569 commit 21a966a

File tree

4 files changed

+173
-2
lines changed

4 files changed

+173
-2
lines changed

doc/modules/cross_validation.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,40 @@ validation that allows a finer control on the number of iterations and
430430
the proportion of samples on each side of the train / test split.
431431

432432

433+
Label-Shuffle-Split
434+
-------------------
435+
436+
:class:`LabelShuffleSplit`
437+
438+
The :class:`LabelShuffleSplit` iterator behaves as a combination of
439+
:class:`ShuffleSplit` and :class:`LeavePLabelOut`, and generates a
440+
sequence of randomized partitions in which a subset of labels are held
441+
out for each split.
442+
443+
Here is a usage example::
444+
445+
>>> from sklearn.cross_validation import LabelShuffleSplit
446+
447+
>>> labels = [1, 1, 2, 2, 3, 3, 4, 4]
448+
>>> slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5,
449+
... random_state=0)
450+
>>> for train, test in slo:
451+
... print("%s %s" % (train, test))
452+
...
453+
[0 1 2 3] [4 5 6 7]
454+
[2 3 6 7] [0 1 4 5]
455+
[2 3 4 5] [0 1 6 7]
456+
[4 5 6 7] [0 1 2 3]
457+
458+
This class is useful when the behavior of :class:`LeavePLabelOut` is
459+
desired, but the number of labels is large enough that generating all
460+
possible partitions with :math:`P` labels withheld would be prohibitively
461+
expensive. In such a scenario, :class:`LabelShuffleSplit` provides
462+
a random sample (with replacement) of the train / test splits
463+
generated by :class:`LeavePLabelsOut`.
464+
465+
466+
433467
Predefined Fold-Splits / Validation-Sets
434468
----------------------------------------
435469

doc/whats_new.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ New features
2222

2323
- The new class :class:`preprocessing.RobustScaler` provides an
2424
alternative to :class:`preprocessing.StandardScaler` for feature-wise
25-
centering and range normalization that is robust to outliers. By `Thomas Unterthiner`_.
25+
centering and range normalization that is robust to outliers.
26+
By `Thomas Unterthiner`_.
2627

2728
- The new class :class:`preprocessing.MaxAbsScaler` provides an
2829
alternative to :class:`preprocessing.MinMaxScaler` for feature-wise
@@ -33,6 +34,11 @@ New features
3334
function into a ``Pipeline``-compatible transformer object.
3435
By Joe Jevnik.
3536

37+
- :class:`cross_validation.LabelShuffleSplit` generates random train-test
38+
splits, similar to :class:`cross_validation.ShuffleSplit`, except that
39+
the splits are conditioned on a label array. By `Brian McFee`_.
40+
41+
3642
Enhancements
3743
............
3844

@@ -3581,3 +3587,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
35813587
.. _Thomas Unterthiner: https://github.com/untom
35823588

35833589
.. _Loic Esteve: https://github.com/lesteve
3590+
3591+
.. _Brian McFee: https://bmcfee.github.io

sklearn/cross_validation.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
'StratifiedKFold',
4242
'StratifiedShuffleSplit',
4343
'PredefinedSplit',
44+
'LabelShuffleSplit',
4445
'check_cv',
4546
'cross_val_score',
4647
'cross_val_predict',
@@ -962,6 +963,92 @@ def __len__(self):
962963
return len(self.unique_folds)
963964

964965

966+
class LabelShuffleSplit(ShuffleSplit):
    '''Shuffle-Labels-Out cross-validation iterator.

    Provides randomized train/test indices to split data according to a
    third-party provided label. The label array can encode arbitrary
    domain-specific stratifications of the samples as integers — for
    instance, the year in which each sample was collected, enabling
    cross-validation against time-based splits.

    Whereas LeavePLabelOut enumerates every subset of ``p`` unique labels,
    LabelShuffleSplit draws a user-determined number of random test splits,
    each holding out a user-determined fraction of the unique labels. For
    example, ``LabelShuffleSplit(labels, test_size=10, n_iter=100)`` is a
    less computationally intensive alternative to
    ``LeavePLabelOut(labels, p=10)``.

    Note: unlike ShuffleSplit, the ``test_size`` and ``train_size``
    parameters here refer to labels, not to samples.

    Parameters
    ----------
    labels : array, [n_samples]
        Labels of samples

    n_iter : int (default 5)
        Number of re-shuffling & splitting iterations.

    test_size : float (default 0.2), int, or None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the labels to include in the test split. If
        int, represents the absolute number of test labels. If None,
        the value is automatically set to the complement of the train size.

    train_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the labels to include in the train split. If
        int, represents the absolute number of train labels. If None,
        the value is automatically set to the complement of the test size.

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    '''
    def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None,
                 random_state=None):

        # Collapse the label array to its unique values; the parent
        # ShuffleSplit then shuffles *classes*, not samples.
        unique_labels, inverse = np.unique(labels, return_inverse=True)

        super(LabelShuffleSplit, self).__init__(
            len(unique_labels),
            n_iter=n_iter,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)

        self.labels = labels
        self.classes = unique_labels
        # inverse[i] gives the index into ``classes`` for sample i
        self.label_indices = inverse

    def __repr__(self):
        template = ('%s(labels=%s, n_iter=%d, test_size=%s, '
                    'random_state=%s)')
        return template % (self.__class__.__name__,
                           self.labels,
                           self.n_iter,
                           str(self.test_size),
                           self.random_state)

    def __len__(self):
        # One (train, test) partition per requested iteration.
        return self.n_iter

    def _iter_indices(self):
        parent_splits = super(LabelShuffleSplit, self)._iter_indices()
        for class_train, class_test in parent_splits:
            # The parent yields indices into the unique-class array;
            # invert them back into sample indices via the mapping
            # stored in ``label_indices``.
            train_mask = np.in1d(self.label_indices, class_train)
            test_mask = np.in1d(self.label_indices, class_test)
            yield np.flatnonzero(train_mask), np.flatnonzero(test_mask)
9651052
##############################################################################
9661053
def _index_param_value(X, v, indices):
9671054
"""Private helper function for parameter value indexing."""

sklearn/tests/test_cross_validation.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ def test_stratified_shuffle_split_iter():
415415
/ float(len(y[test])))
416416
assert_array_almost_equal(p_train, p_test, 1)
417417
assert_equal(y[train].size + y[test].size, y.size)
418-
assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
418+
assert_array_equal(np.intersect1d(train, test), [])
419419

420420

421421
def test_stratified_shuffle_split_even():
@@ -485,6 +485,48 @@ def test_predefinedsplit_with_kfold_split():
485485
assert_array_equal(ps_test, kf_test)
486486

487487

488+
def test_label_shuffle_split():
    # Several label layouts: unbalanced, balanced, interleaved, skewed.
    label_arrays = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
                    np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
                    np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
                    np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
                    ]

    n_iter = 6
    test_size = 1. / 3

    for y in label_arrays:
        slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size,
                                     random_state=0)

        # The repr should not raise
        repr(slo)

        # The iterator reports the requested number of splits
        assert_equal(len(slo), n_iter)

        y_unique = np.unique(y)

        for train, test in slo:
            y_train_unique = np.unique(y[train])
            y_test_unique = np.unique(y[test])

            # 1) no train label leaks into the test set, and vice versa
            assert_false(np.any(np.in1d(y[train], y_test_unique)))
            assert_false(np.any(np.in1d(y[test], y_train_unique)))

            # 2) train and test together cover all the data
            assert_equal(y[train].size + y[test].size, y.size)

            # 3) train and test indices are disjoint
            assert_array_equal(np.intersect1d(train, test), [])

            # 4) the number of unique train/test labels matches the
            #    requested proportions, +- 1 for rounding error
            assert_true(abs(len(y_test_unique) -
                            round(test_size * len(y_unique))) <= 1)
            assert_true(abs(len(y_train_unique) -
                            round((1.0 - test_size) * len(y_unique))) <= 1)
488530
def test_leave_label_out_changing_labels():
489531
# Check that LeaveOneLabelOut and LeavePLabelOut work normally if
490532
# the labels variable is changed before calling __iter__

0 commit comments

Comments
 (0)
0