|
51 | 51 | 'LeavePOut',
|
52 | 52 | 'ShuffleSplit',
|
53 | 53 | 'StratifiedKFold',
|
| 54 | + 'BinnedStratifiedKFold', |
54 | 55 | 'StratifiedShuffleSplit',
|
55 | 56 | 'PredefinedSplit',
|
56 | 57 | 'LabelShuffleSplit',
|
@@ -230,8 +231,8 @@ def __repr__(self):
|
230 | 231 | )
|
231 | 232 |
|
def __len__(self):
    """Return the binomial coefficient ``C(self.n, self.p)``.

    This is ``n! / ((n - p)! * p!)``, i.e. the number of distinct ways to
    pick ``self.p`` items out of ``self.n`` (presumably the number of
    train/test splits the iterator produces -- confirm against the
    enclosing class, which is not visible here).

    Floor division is used instead of chained true divisions so that the
    computation stays in exact integer arithmetic: the quotient is always
    a whole number, and going through floats loses precision above 2**53
    and can overflow for large counts.
    """
    return int(factorial(self.n) //
               (factorial(self.n - self.p) * factorial(self.p)))
235 | 236 |
|
236 | 237 |
|
237 | 238 | class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)):
|
@@ -577,6 +578,150 @@ def __len__(self):
|
577 | 578 | return self.n_folds
|
578 | 579 |
|
579 | 580 |
|
class BinnedStratifiedKFold(_BaseKFold):
    """Binned Stratified K-Folds cross validation iterator for continuous data

    Provides train/test indices to split data in train test sets
    based on continuous input `y` of length `len_y`.
    The samples are ranked by `y` and cut into bins of `n_folds`
    consecutive ranks; every fold receives one sample from each full bin.
    The samples that do not fill a whole bin (`len_y % n_folds` of them,
    taken from the middle of the ranking when there are at least
    `2 * n_folds` samples) form an incomplete bin whose members are
    handed to only some of the folds.

    This cross-validation object is a variation of KFold that
    returns binned stratified folds, so that each fold covers the whole
    range of `y`.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    y : array-like, [n_samples]
        Continuous target values used to rank and bin the samples.

    n_folds : int, default=3
        Number of folds. Must be at least 2.

    shuffle : boolean, optional
        Whether to randomly permute the order in which the folds are
        returned. Independently of this flag, the assignment of the
        incomplete middle bin to folds is always randomised when there are
        at least `2 * n_folds` samples.

    random_state : None, int or RandomState
        Pseudo-random number generator state used for the random
        assignments described above. If None, use default numpy RNG.

    Attributes
    ----------
    n_classes : int
        Number of bins, `ceil(n_samples / n_folds)`.

    Examples
    --------
    >>> from sklearn.cross_validation import BinnedStratifiedKFold
    >>> y = np.arange(11.0)
    >>> np.random.seed(0)
    >>> np.random.shuffle(y)
    >>> X = y + 0.1 * np.random.randn(len(y))
    >>> skf = BinnedStratifiedKFold(y, n_folds=3)
    >>> len(skf)
    3
    >>> print(skf) # doctest: +NORMALIZE_WHITESPACE
    sklearn.cross_validation.BinnedStratifiedKFold(n=11, n_folds=3,
    shuffle=False, random_state=None)
    >>> for train_index, test_index in skf:
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [ 1 2 3 4 5 8 10] TEST: [0 6 7 9]
    TRAIN: [0 2 3 4 6 7 8 9] TEST: [ 1 5 10]
    TRAIN: [ 0 1 5 6 7 9 10] TEST: [2 3 4 8]

    Notes
    -----
    All the folds have size floor(n_samples / n_folds) or
    floor(n_samples / n_folds) + 1,
    the length is assigned randomly (even if no shuffling is requested)
    to balance the variance between folds.

    See also
    --------
    StratifiedKFold -- stratified k-fold generator for classification data
    """

    def __init__(self, y, n_folds=3, shuffle=False,
                 random_state=None):
        self.random_state = random_state
        super(BinnedStratifiedKFold, self).__init__(
            len(y),
            n_folds=n_folds, shuffle=shuffle, random_state=random_state
        )
        len_y = len(y)
        # Sample indices ordered by increasing value of `y`.
        yinds = np.argsort(y)

        self.n_classes = len_y // n_folds + int(len_y % n_folds != 0)

        # Build the generator before branching: it is needed by the middle
        # bin of the first branch and by the optional fold shuffle that
        # follows *both* branches.
        rng = check_random_state(self.random_state)

        if len_y // n_folds > 1:
            n_items_boundary_cls = n_folds * (len_y // n_folds // 2)
            # Lowest-ranked samples: one row per bin, one column per fold.
            lowerclasses = yinds[:n_items_boundary_cls].reshape(-1, n_folds)
            # Highest-ranked samples, laid out the same way.
            upperclasses = yinds[-n_items_boundary_cls:].reshape(-1, n_folds)
            # Whatever is left sits in the middle of the ranking; pad it
            # with -1 so that it also forms rows of `n_folds` entries.
            middleclasses = self._pad_to_full_bins(
                yinds[n_items_boundary_cls:-n_items_boundary_cls], n_folds
            ).reshape(-1, n_folds)
            # Permute the columns (in place, through the transposed view)
            # so that the folds receiving the padding are chosen at random.
            rng.shuffle(middleclasses.T)
            # TODO: middle class rebalancing
            fold_table = np.vstack([
                lowerclasses,
                middleclasses,
                upperclasses]).T
        elif len_y > self.n_classes:
            # Fewer than two full bins: the lowest `n_folds` samples form
            # one complete bin, the remaining ones a padded, incomplete bin.
            lowerclasses = yinds[:n_folds].reshape(-1, n_folds)
            upperclasses = self._pad_to_full_bins(yinds[n_folds:], n_folds)
            fold_table = np.vstack([lowerclasses, upperclasses]).T
        else:
            raise ValueError(
                "Cannot build %d folds from %d samples."
                % (n_folds, len_y))

        # `fold_table` has one row per fold; permuting rows reorders folds.
        if shuffle:
            rng.shuffle(fold_table)

        # Drop the -1 padding: each entry is the index array of one fold.
        self._test_masks = [fold[fold != -1] for fold in fold_table]

    @staticmethod
    def _pad_to_full_bins(indices, n_folds):
        """Append -1 fillers so that `indices` can form rows of `n_folds`.

        A whole row of fillers is appended when `len(indices)` is already a
        multiple of `n_folds`; fillers are removed again before use.
        """
        n_fill = n_folds - len(indices) % n_folds
        return np.hstack([indices, -np.ones(n_fill, dtype=int)])

    def _iter_test_masks(self):
        """Yield one boolean mask of length `self.n` per fold.

        A fresh array is allocated for every fold so that masks collected
        by the caller are not overwritten by later iterations.
        """
        for test_indices in self._test_masks:
            mask = np.zeros(self.n, dtype=bool)
            mask[test_indices] = True
            yield mask

    def __repr__(self):
        return '%s.%s(n=%s, n_folds=%i, shuffle=%s, random_state=%s)' % (
            self.__class__.__module__,
            self.__class__.__name__,
            self.n,
            self.n_folds,
            self.shuffle,
            self.random_state,
        )

    def __len__(self):
        """Return the number of folds."""
        return self.n_folds
| 723 | + |
| 724 | + |
580 | 725 | class LeaveOneLabelOut(_PartitionIterator):
|
581 | 726 | """Leave-One-Label_Out cross-validation iterator
|
582 | 727 |
|
|
0 commit comments