10000 BinnedStratifiedKFold ported to model_selection · scikit-learn/scikit-learn@567e9a8 · GitHub
[go: up one dir, main page]

Skip to content

Commit 567e9a8

Browse files
committed
BinnedStratifiedKFold ported to model_selection
1 parent c0af740 commit 567e9a8

File tree

3 files changed

+285
-25
lines changed

3 files changed

+285
-25
lines changed

sklearn/model_selection/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from ._split import KFold
33
from ._split import LabelKFold
44
from ._split import StratifiedKFold
5+
from ._split import BinnedStratifiedKFold
56
from ._split import LeaveOneLabelOut
67
from ._split import LeaveOneOut
78
from ._split import LeavePLabelOut
@@ -40,6 +41,7 @@
4041
'RandomizedSearchCV',
4142
'ShuffleSplit',
4243
'StratifiedKFold',
44+
'BinnedStratifiedKFold',
4345
'StratifiedShuffleSplit',
4446
'check_cv',
4547
'cross_val_predict',

sklearn/model_selection/_split.py

Lines changed: 162 additions & 2 deletions
787
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
'ShuffleSplit',
4545
'LabelShuffleSplit',
4646
'StratifiedKFold',
47+
'BinnedStratifiedKFold',
4748
'StratifiedShuffleSplit',
4849
'PredefinedSplit',
4950
'train_test_split',
@@ -635,6 +636,165 @@ def split(self, X, y, labels=None):
635636
"""
636637
return super(StratifiedKFold, self).split(X, y, labels)
637638

639+
640+
class BinnedStratifiedKFold(_BaseKFold):
    """Binned stratified K-Folds cross-validator for continuous targets.

    Provides train/test indices to split data in train/test sets.

    This cross-validation object is a variation of KFold intended for a
    continuous target ``y``. The samples are ordered by ``y`` and cut into
    consecutive bins of ``n_folds`` samples; every fold receives one sample
    from each complete bin, so each test fold spans the whole range of ``y``.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_folds : int, default=3
        Number of folds. Must be at least 2.

    shuffle : boolean, optional
        Whether to randomly permute the order in which the folds are
        returned.

    random_state : None, int or RandomState
        Pseudo-random number generator state used to distribute the samples
        of the incomplete middle bin among the folds and, when
        ``shuffle=True``, to permute the folds. If None, use default numpy
        RNG.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import BinnedStratifiedKFold
    >>> y = np.arange(11.0)
    >>> X = y.reshape(-1, 1)
    >>> bskf = BinnedStratifiedKFold(n_folds=3)
    >>> bskf.get_n_splits(X, y)
    3
    >>> sorted(len(test_index) for _, test_index in bskf.split(X, y))
    [3, 4, 4]

    Notes
    -----
    All the folds have size floor(n_samples / n_folds) or
    floor(n_samples / n_folds) + 1.
    When ``n_samples >= 2 * n_folds`` the folds that receive the extra
    samples are drawn at random (even if no shuffling is requested)
    to balance the variance between folds.

    See also
    --------
    StratifiedKFold -- stratified k-fold generator for classification data
    """

    def __init__(self, n_folds=3, shuffle=False, random_state=None):
        super(BinnedStratifiedKFold, self).__init__(n_folds, shuffle,
                                                    random_state)

    @staticmethod
    def _as_rows(indices, n_folds):
        """Return ``indices`` as an (n_rows, n_folds) array, padded with -1."""
        n_missing = (-len(indices)) % n_folds
        padded = np.hstack([indices, np.full(n_missing, -1, dtype=int)])
        return padded.reshape(-1, n_folds)

    def _make_test_folds(self, X, y=None, labels=None):
        """Assign a test-fold number to every sample.

        Parameters
        ----------
        X : array-like
            Only used as the target when ``y`` is None, in which case it
            must expose a ``shape`` that is 1-d or has singleton trailing
            dimensions.

        y : array-like, shape (n_samples,), optional
            Continuous target used to order and bin the samples.

        labels : object
            Ignored, present for signature compatibility.

        Returns
        -------
        test_folds : ndarray of int, shape (n_samples,)
            ``test_folds[i]`` is the index of the fold whose test set
            contains sample ``i``.

        Raises
        ------
        ValueError
            If no usable target is available, if the target is not
            one-dimensional, or if there are fewer samples than folds.
        """
        if y is None:
            # Support ``split(y)``: accept the first argument as the target
            # when it is a vector (possibly a column vector).
            shape = getattr(X, "shape", None)
            if shape is None or not (
                    len(shape) == 1 or all(dim == 1 for dim in shape[1:])):
                raise ValueError("no y has been supplied; "
                                 "first argument is not a valid y")
            y = X

        y = np.asarray(y)
        if y.ndim > 1:
            if any(dim != 1 for dim in y.shape[1:]):
                raise ValueError("y must be one-dimensional to be binned; "
                                 "got shape %r" % (y.shape,))
            # argsort must run over samples, not over a singleton axis
            y = y.reshape(-1)

        n_samples = len(y)
        n_folds = self.n_folds
        if n_samples < n_folds:
            raise ValueError(
                "Cannot have number of folds n_folds=%d greater than the "
                "number of samples: %d." % (n_folds, n_samples))

        # Kept as attributes for backward compatibility.
        self.n_samples = n_samples
        self.n_classes = n_samples // n_folds + int(n_samples % n_folds != 0)

        # Bound up front so that both layouts below can shuffle.
        rng = check_random_state(self.random_state)

        # Sample indices ordered by increasing ``y``.
        order = np.argsort(y)

        n_full_bins = n_samples // n_folds
        if n_full_bins > 1:
            # The lowest and highest ``n_edge`` samples form complete bins.
            n_edge = n_folds * (n_full_bins // 2)
            lower = order[:n_edge].reshape(-1, n_folds)
            upper = order[-n_edge:].reshape(-1, n_folds)
            # The remainder forms the (possibly incomplete) middle bins;
            # -1 marks an empty slot. Shuffling the transposed view permutes
            # the columns in place, i.e. which fold gets the empty slots.
            middle = self._as_rows(order[n_edge:-n_edge], n_folds)
            rng.shuffle(middle.T)
            bins = [lower, middle, upper]
        else:
            # Fewer than two complete bins: one full bin followed by a
            # -1 padded bin holding the remaining samples.
            bins = [order[:n_folds].reshape(-1, n_folds),
                    self._as_rows(order[n_folds:], n_folds)]

        # One row per fold after transposition.
        fold_table = np.vstack(bins).T

        if self.shuffle:
            # NOTE(review): this permutes whole folds, it does not reshuffle
            # the samples inside each bin.
            rng.shuffle(fold_table)

        # Drop the -1 fillers.
        self._test_masks = [fold[fold != -1] for fold in fold_table]

        test_folds = np.empty(n_samples, dtype=int)
        for fold_idx, members in enumerate(self._test_masks):
            test_folds[members] = fold_idx
        return test_folds

    def _iter_test_masks(self, X, y=None, labels=None):
        """Yield one boolean test mask of length n_samples per fold."""
        test_folds = self._make_test_folds(X, y, labels)
        for i in range(self.n_folds):
            yield test_folds == i

    def split(self, X, y=None, labels=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features. When ``y`` is None
            and ``X`` is a vector, ``X`` itself is used as the target.

        y : array-like, shape (n_samples,)
            The continuous target variable used to build the bins.

        labels : array-like, with shape (n_samples,), optional
            Group labels for the samples; not used by the binning.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        return super(BinnedStratifiedKFold, self).split(X, y, labels)
796+
797+
638798
class LeaveOneLabelOut(BaseCrossValidator):
639799
"""Leave One Label Out cross-validator
640800
@@ -1193,8 +1353,8 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
11931353
Validation helper to check if the test/test sizes are meaningful wrt to the
11941354
size of the data (n_samples)
11951355
"""
1196-
if (test_size is not None and np.asarray(test_size).dtype.kind == 'i'
1197-
and test_size >= n_samples):
1356+
if (test_size is not None and np.asarray(test_size).dtype.kind == 'i' and
1357+
test_size >= n_samples):
11981358
raise ValueError('test_size=%d should be smaller than the number of '
11991359
'samples %d' % (test_size, n_samples))
12001360

sklearn/model_selection/tests/test_split.py

Lines changed: 121 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from sklearn.model_selection import cross_val_score
3030
from sklearn.model_selection import KFold
3131
from sklearn.model_selection import StratifiedKFold
32+
from sklearn.model_selection import BinnedStratifiedKFold
3233
from sklearn.model_selection import LabelKFold
3334
from sklearn.model_selection import LeaveOneOut
3435
from sklearn.model_selection import LeaveOneLabelOut
@@ -140,34 +141,27 @@ def test_cross_validator_with_default_params():
140141
X_1d = np.array([1, 2, 3, 4])
141142
y = np.array([1, 1, 2, 2])
142143
labels = np.array([1, 2, 3, 4])
143-
loo = LeaveOneOut()
144-
lpo = LeavePOut(p)
145-
kf = KFold(n_folds)
146-
skf = StratifiedKFold(n_folds)
147-
lolo = LeaveOneLabelOut()
148-
lopo = LeavePLabelOut(p)
149-
ss = ShuffleSplit(random_state=0)
150-
ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2
151-
152-
loo_repr = "LeaveOneOut()"
153-
lpo_repr = "LeavePOut(p=2)"
154-
kf_repr = "KFold(n_folds=2, random_state=None, shuffle=False)"
155-
skf_repr = "StratifiedKFold(n_folds=2, random_state=None, shuffle=False)"
156-
lolo_repr = "LeaveOneLabelOut()"
157-
lopo_repr = "LeavePLabelOut(n_labels=2)"
158-
ss_repr = ("ShuffleSplit(n_iter=10, random_state=0, test_size=0.1, "
159-
"train_size=None)")
160-
ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"
144+
cvs = [
145+
(LeaveOneOut(), "LeaveOneOut()", n_samples),
146+
(LeavePOut(p), "LeavePOut(p=%u)" % p, comb(n_samples, p) ),
147+
(KFold(n_folds), "KFold(n_folds=2, random_state=None, shuffle=False)", n_folds),
148+
(StratifiedKFold(n_folds), ("StratifiedKFold(n_folds=2, "
149+
"random_state=None, shuffle=False)"), n_folds),
150+
(LeaveOneLabelOut(), "LeaveOneLabelOut()", n_unique_labels),
151+
(LeavePLabelOut(p), "LeavePLabelOut(n_labels=%u)" % p, comb(n_unique_labels, p) ),
152+
(ShuffleSplit(random_state=0), ("ShuffleSplit(n_iter=10, random_state=0, test_size=0.1, "
153+
"train_size=None)"), n_iter),
154+
(PredefinedSplit([1, 1, 2, 2]), "PredefinedSplit(test_fold=array([1, 1, 2, 2]))", 2),
155+
]
156+
# n_splits = np of unique folds = 2
161157

162158
n_splits = [n_samples, comb(n_samples, p), n_folds, n_folds,
163159
n_unique_labels, comb(n_unique_labels, p), n_iter, 2]
164160

165-
for i, (cv, cv_repr) in enumerate(zip(
166-
[loo, lpo, kf, skf, lolo, lopo, ss, ps],
167-
[loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
168-
ss_repr, ps_repr])):
161+
for i, (cv, cv_repr, n_splits_ ) in enumerate(cvs):
162+
print( cv, cv_repr, n_splits_ )
169163
# Test if get_n_splits works correctly
170-
assert_equal(n_splits[i], cv.get_n_splits(X, y, labels))
164+
assert_equal(n_splits_, cv.get_n_splits(X, y, labels))
171165

172166
# Test if the cross-validator works as expected even if
173167
# the data is 1d
@@ -379,6 +373,110 @@ def test_stratifiedkfold_balance():
379373
assert_equal(np.sum(sizes), i)
380374

381375

376+
def test_binnedstratifiedkfold_balance():
    # Test-fold sizes must differ by at most one sample and together cover
    # every sample exactly once.
    rng = np.random.RandomState(0)  # seeded so that failures are reproducible
    for n_samples in range(11, 17):
        n_folds = rng.randint(2, 12)  # 2..11, never more than n_samples
        y = rng.randn(n_samples)

        cv = BinnedStratifiedKFold(n_folds, shuffle=False, random_state=rng)
        sizes = [len(test_index) for _, test_index in cv.split(y)]

        assert_true((np.max(sizes) - np.min(sizes)) <= 1)
        assert_equal(np.sum(sizes), n_samples)
394+
395+
396+
def test_binnedstratifiedkfold_bin_spacing():
    "check if the binned `y` falls into bins of equal size (+/- 1)"
    rng = np.random.RandomState(0)  # seeded so that failures are reproducible
    for _ in range(10):
        n_folds = rng.randint(2, 12)
        y = rng.randn(30)

        cv = BinnedStratifiedKFold(n_folds=n_folds, shuffle=False,
                                   random_state=rng)

        # NOTE(review): these edges are the 0th..(n_folds - 1)th percentiles,
        # not evenly spaced quantiles, so only the lowest few samples fall
        # inside the histogram range. Confirm whether
        # np.linspace(0, 100, n_folds + 1) was intended before relying on
        # this check.
        bins = np.array([np.percentile(y, q) for q in range(n_folds)])

        for train_index, test_index in cv.split(y):
            hist_test, _ = np.histogram(y[test_index], bins=bins)
            assert_true(all(abs(hist_test - np.mean(hist_test)) <= 1),
                        msg="y_test falls into bins of too ragged sizes")

            # The training histogram must be built from the training fold.
            hist_train, _ = np.histogram(y[train_index], bins=bins)
            assert_true(all(abs(hist_train - np.mean(hist_train)) <= 1),
                        msg="y_train falls into bins of too ragged sizes")
419+
420+
421+
def test_binnedstratifiedkfold_has_more_stable_distribution_moments_between_folds():
    """Compare BinnedStratifiedKFold against a shuffled KFold.

    Over 100 random targets, the between-fold spread of mean(y_test) and of
    std(y_test) must be smaller for BinnedStratifiedKFold in more than half
    of the trials.
    """
    std_is_steadier = []
    mean_is_steadier = []

    for _ in range(100):
        n_folds = 2 + int(10 * np.random.rand())
        y = np.random.randn(30)
        np.random.shuffle(y)

        # Both split generators are created up front and consumed in this
        # order, binned first, so the global RNG is drawn from in a fixed
        # sequence.
        binned_splits = BinnedStratifiedKFold(
            n_folds=n_folds, shuffle=False, random_state=None).split(y)
        plain_splits = KFold(
            n_folds=n_folds, shuffle=True, random_state=None).split(y)

        bins = np.array([np.percentile(y, q) for q in range(n_folds)])

        binned_folds = [y[test_index] for _, test_index in binned_splits]
        for fold in binned_folds:
            counts, _ = np.histogram(fold, bins=bins)
            assert_true(all(abs(counts - np.mean(counts)) <= 1),
                        msg="too ragged bins")
        binned_means = [fold.mean() for fold in binned_folds]
        binned_stds = [fold.std() for fold in binned_folds]

        plain_folds = [y[test_index] for _, test_index in plain_splits]
        plain_means = [fold.mean() for fold in plain_folds]
        plain_stds = [fold.std() for fold in plain_folds]

        std_is_steadier.append(np.std(plain_stds) > np.std(binned_stds))
        mean_is_steadier.append(np.std(plain_means) > np.std(binned_means))

    std_fraction = np.mean(std_is_steadier)
    mean_fraction = np.mean(mean_is_steadier)

    assert_greater(std_fraction, 0.5)
    assert_greater(mean_fraction, 0.5)
    print(" std(y_test) of BinnedStratifiedKFold was more stable than "
          "one of KFold in\t%.2f%% cases" % (100.0 * std_fraction))
    print("mean(y_test) of BinnedStratifiedKFold was more stable than "
          "one of KFold in\t%.2f%% cases" % (100.0 * mean_fraction))
478+
479+
382480
def test_shuffle_kfold():
383481
# Check the indices are shuffled properly
384482
kf = KFold(3)

0 commit comments

Comments
 (0)
0