BinnedStratifiedKFold for continuous variables · scikit-learn/scikit-learn@354e02d · GitHub

Commit 354e02d

BinnedStratifiedKFold for continuous variables
1 parent ea9896e commit 354e02d

File tree

2 files changed: +245 -2 lines


sklearn/cross_validation.py

Lines changed: 147 additions & 2 deletions
@@ -51,6 +51,7 @@
            'LeavePOut',
            'ShuffleSplit',
            'StratifiedKFold',
+           'BinnedStratifiedKFold',
            'StratifiedShuffleSplit',
            'PredefinedSplit',
            'LabelShuffleSplit',
@@ -230,8 +231,8 @@ def __repr__(self):
         )
 
     def __len__(self):
-        return int(factorial(self.n) / factorial(self.n - self.p)
-                   / factorial(self.p))
+        return int(factorial(self.n) / factorial(self.n - self.p) /
+                   factorial(self.p))
 
 
 class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)):
@@ -577,6 +578,150 @@ def __len__(self):
         return self.n_folds
 
 
+class BinnedStratifiedKFold(_BaseKFold):
+    """Binned Stratified K-Folds cross validation iterator for continuous data
+
+    Provides train/test indices to split data in train/test sets
+    based on continuous input `y` of length `len_y`.
+    The input is binned into `ceil(len_y / n_folds)` classes
+    with an equal number of members, except the middle class,
+    which receives the remainder of labels (of length `len_y % n_folds`).
+
+    This cross-validation object is a variation of KFold that
+    returns binned stratified folds. The folds are made by preserving
+    the percentage of samples for each class.
+
+    Read more in the :ref:`User Guide <cross_validation>`.
+
+    Parameters
+    ----------
+    y : array-like, [n_samples]
+        Samples to split in K folds.
+
+    n_folds : int, default=3
+        Number of folds. Must be at least 2.
+
+    shuffle : boolean, optional
+        Whether to shuffle each stratification of the data before splitting
+        into batches.
+
+    random_state : None, int or RandomState
+        When shuffle=True, pseudo-random number generator state used for
+        shuffling. If None, use default numpy RNG for shuffling.
+
+    Examples
+    --------
+    >>> from sklearn.cross_validation import BinnedStratifiedKFold
+    >>> y = np.arange(11.0)
+    >>> np.random.seed(0)
+    >>> np.random.shuffle(y)
+    >>> X = y + 0.1 * np.random.randn(len(y))
+    >>> skf = BinnedStratifiedKFold(y, n_folds=3)
+    >>> len(skf)
+    3
+    >>> print(skf)  # doctest: +NORMALIZE_WHITESPACE
+    sklearn.cross_validation.BinnedStratifiedKFold(n=11, n_folds=3,
+    shuffle=False, random_state=None)
+    >>> indarr = np.zeros(len(y), dtype=bool)
+    >>> for train_index, test_index in skf:
+    ...     print("TRAIN:", train_index, "TEST:", test_index)
+    ...     X_train, X_test = X[train_index], X[test_index]
+    ...     y_train, y_test = y[train_index], y[test_index]
+    TRAIN: [ 1  2  3  4  5  8 10] TEST: [0 6 7 9]
+    TRAIN: [0 2 3 4 6 7 8 9] TEST: [ 1  5 10]
+    TRAIN: [ 0  1  5  6  7  9 10] TEST: [2 3 4 8]
+
+    Notes
+    -----
+    All folds have size floor(n_samples / n_folds) or
+    floor(n_samples / n_folds) + 1; which folds receive the extra sample
+    is assigned randomly (even if no shuffling is requested) to balance
+    the variance between folds.
+
+    See also
+    --------
+    StratifiedKFold -- stratified k-fold generator for classification data
+    """
+
+    def __init__(self, y, n_folds=3, shuffle=False,
+                 random_state=None):
+        self.random_state = random_state
+        super(BinnedStratifiedKFold, self).__init__(
+            len(y),
+            n_folds=n_folds, shuffle=shuffle, random_state=random_state
+        )
+        len_y = len(y)
+        # reorder the sample indices according to the ordering of `y`
+        yinds = np.arange(len_y)
+        sorter0 = np.argsort(y)
+        yinds = yinds[sorter0]
+
+        self.n_classes = len_y // n_folds + int(len_y % n_folds != 0)
+        rng = check_random_state(self.random_state)
+
+        if len_y // n_folds > 1:
+            n_items_boundary_cls = n_folds * (len_y // n_folds // 2)
+            # assign the lower `n_folds * (n_classes // 2)` labels to the
+            # lower classes and the upper ones to the upper classes
+            lowerclasses = yinds[:n_items_boundary_cls].reshape(-1, n_folds)
+            upperclasses = yinds[-n_items_boundary_cls:].reshape(-1, n_folds)
+            # assign the remaining labels to the middle class;
+            # add -1 as a filling value, then shuffle within the rows
+            middleclasses = yinds[n_items_boundary_cls:-n_items_boundary_cls]
+            middleclasses = np.hstack([
+                middleclasses,
+                -np.ones(n_folds - len(middleclasses) % n_folds, dtype=int)
+            ])
+            middleclasses = middleclasses.reshape(-1, n_folds)
+
+            rng.shuffle(middleclasses.T)
+            middleclasses = middleclasses.reshape(-1, n_folds)
+            self._test_masks = np.vstack([
+                lowerclasses,
+                middleclasses,
+                upperclasses]).T
+            # TODO: middle class rebalancing
+        elif len_y > self.n_classes:
+            # put the lower part in one piece and the rest into a ragged
+            # array; the central values will remain unpaired
+            lowerclasses = yinds[:n_folds].reshape(-1, n_folds)
+            upperclasses = yinds[n_folds:]
+            upperclasses = np.hstack([
+                upperclasses,
+                -np.ones(n_folds - len(upperclasses) % n_folds, dtype=int)
+            ])
+
+            self._test_masks = np.vstack([lowerclasses, upperclasses]).T
+
+        if shuffle:
+            rng.shuffle(self._test_masks)
+
+        # drop the -1 filling values from the test index sets
+        self._test_masks = [mask[mask != -1] for mask in self._test_masks]
+        return
+
+    def _iter_test_masks(self):
+        indarr = np.zeros(self.n, dtype=bool)
+        for mask in self._test_masks:
+            indarr[:] = False
+            indarr[mask] = True
+            yield indarr
+
+    def __repr__(self):
+        return '%s.%s(n=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.n,
+            self.n_folds,
+            self.shuffle,
+            self.random_state,
+        )
+
+    def __len__(self):
+        return self.n_folds
+
+
 class LeaveOneLabelOut(_PartitionIterator):
     """Leave-One-Label_Out cross-validation iterator

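For orientation (not part of the commit), here is a small sketch of the bin-assignment arithmetic that `__init__` above performs, worked through for the docstring example of 11 samples and 3 folds. The names `lower`, `middle`, `upper`, `test_folds` and the toy data are illustrative only:

import numpy as np

len_y, n_folds = 11, 3
yinds = np.argsort(np.random.randn(len_y))         # sample indices, sorted by y

n_boundary = n_folds * (len_y // n_folds // 2)      # 3 * (3 // 2) = 3
lower = yinds[:n_boundary].reshape(-1, n_folds)     # one row: the 3 smallest y
upper = yinds[-n_boundary:].reshape(-1, n_folds)    # one row: the 3 largest y
middle = yinds[n_boundary:-n_boundary]              # 5 middle samples remain
# pad the middle with -1 so it also reshapes into rows of length n_folds
middle = np.hstack([middle,
                    -np.ones(n_folds - len(middle) % n_folds, dtype=int)])
middle = middle.reshape(-1, n_folds)                # two rows of 3 (one slot is -1)

# stacking gives 4 rows of 3; after the transpose each row of test_folds is one
# test fold, and the fold that drew the -1 filler is one sample short,
# hence fold sizes 4, 4 and 3 for 11 samples
test_folds = np.vstack([lower, middle, upper]).T
print([len(fold[fold != -1]) for fold in test_folds])

In the class itself the middle rows are additionally shuffled column-wise, so which fold ends up short is random, and the -1 fillers are dropped before the test index sets are stored.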
sklearn/tests/test_cross_validation.py

Lines changed: 98 additions & 0 deletions
@@ -438,6 +438,104 @@ def test_label_kfold():
     assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
 
 
+def test_binnedstratifiedkfold_balance():
+    for _ in range(10):
+        n_folds = 2 + int(10 * np.random.rand())
+        y = np.random.randn(30)
+        np.random.shuffle(y)
+        sizes = []
+
+        bskf = cval.BinnedStratifiedKFold(y, n_folds=n_folds,
+                                          shuffle=False, random_state=None)
+
+        for train_index, test_index in bskf:
+            sizes.append(len(test_index))
+        assert_true((np.max(sizes) - np.min(sizes)) <= 1)
+        assert_equal(np.sum(sizes), bskf.n)
+
+
+def test_binnedstratifiedkfold_bin_spacing():
+    "check that the binned `y` falls into bins of equal size (+/- 1)"
+    for _ in range(10):
+        n_folds = 2 + int(10 * np.random.rand())
+        y = np.random.randn(30)
+        np.random.shuffle(y)
+
+        skf = cval.BinnedStratifiedKFold(y, n_folds=n_folds,
+                                         shuffle=False, random_state=None)
+
+        bins = np.array([np.percentile(y, q) for q in range(n_folds)])
+
+        for train_index, test_index in skf:
+            y_test = y[test_index]
+            hist_test, _ = np.histogram(y_test, bins=bins)
+            assert_true(all(abs(hist_test - np.mean(hist_test)) <= 1),
+                        msg="y_test falls into bins of too ragged sizes")
+
+            y_train = y[train_index]
+            hist_train, _ = np.histogram(y_train, bins=bins)
+            assert_true(all(abs(hist_train - np.mean(hist_train)) <= 1),
+                        msg="y_train falls into bins of too ragged sizes")
+
+
+def test_binnedstratifiedkfold_has_more_stable_distribution_moments_between_folds():
+    """check that BinnedStratifiedKFold performs on average better than KFold
+    in terms of lower between-fold variance of fold mean(y_test) and
+    fold std(y_test)
+    """
+    binned_has_more_stable_std_list = []
+    binned_has_more_stable_mean_list = []
+
+    for _ in range(100):
+        n_folds = 2 + int(10 * np.random.rand())
+        y = np.random.randn(30)
+        np.random.shuffle(y)
+        ymeans_binned = []
+        ystds_binned = []
+
+        skf = cval.BinnedStratifiedKFold(y, n_folds=n_folds,
+                                         shuffle=False, random_state=None)
+
+        kf = cval.KFold(len(y), n_folds=n_folds,
+                        shuffle=True, random_state=None)
+
+        bins = np.array([np.percentile(y, q) for q in range(n_folds)])
+
+        for train_index, test_index in skf:
+            y_test = y[test_index]
+            ymeans_binned.append(y_test.mean())
+            ystds_binned.append(y_test.std())
+            hist_, _ = np.histogram(y_test, bins=bins)
+
+            assert_true(all(abs(hist_ - np.mean(hist_)) <= 1),
+                        msg="too ragged bins")
+
+        ymeans_regular = []
+        ystds_regular = []
+        for train_index_reg, test_index_reg in kf:
+            ymeans_regular.append(y[test_index_reg].mean())
+            ystds_regular.append(y[test_index_reg].std())
+
+        binned_has_more_stable_std = np.std(ystds_regular) > np.std(ystds_binned)
+        binned_has_more_stable_std_list.append(binned_has_more_stable_std)
+
+        binned_has_more_stable_mean = (np.std(ymeans_regular) >
+                                       np.std(ymeans_binned))
+        binned_has_more_stable_mean_list.append(binned_has_more_stable_mean)
+
+    binned_has_more_stable_std_fraction = np.mean(binned_has_more_stable_std_list)
+    binned_has_more_stable_mean_fraction = np.mean(binned_has_more_stable_mean_list)
+
+    assert_greater(binned_has_more_stable_std_fraction, 0.5)
+    assert_greater(binned_has_more_stable_mean_fraction, 0.5)
+    print(" std(y_test) of BinnedStratifiedKFold was more stable than that of "
+          "KFold in\t%.2f%% cases" % (100.0 * binned_has_more_stable_std_fraction))
+    print("mean(y_test) of BinnedStratifiedKFold was more stable than that of "
+          "KFold in\t%.2f%% cases" % (100.0 * binned_has_more_stable_mean_fraction))
+
+
 def test_shuffle_split():
     ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
     ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)

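For context, a minimal usage sketch (not part of this commit) of how the new iterator would plug into model evaluation, assuming this branch's `sklearn.cross_validation` module where the class is added; the Ridge estimator and the toy data are arbitrary stand-ins:

import numpy as np
from sklearn.cross_validation import BinnedStratifiedKFold, cross_val_score
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = X.dot(rng.randn(5)) + 0.1 * rng.randn(100)   # continuous regression target

# bin the continuous y and stratify the folds on those bins
cv = BinnedStratifiedKFold(y, n_folds=5, shuffle=True, random_state=0)
scores = cross_val_score(Ridge(), X, y, cv=cv)
print(scores.mean(), scores.std())

The point of the binning is what the tests above assert: compared with a plain KFold, the per-fold mean and standard deviation of y_test vary less between folds.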