From 4df86c60be44e17821adc07127a43baff83e9a19 Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Sun, 13 Oct 2019 16:26:25 -0400 Subject: [PATCH 1/8] Initial implementation --- sklearn/model_selection/__init__.py | 2 + sklearn/model_selection/_split.py | 105 ++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 82a9b9371710d..6a023a4091a63 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -12,6 +12,7 @@ from ._split import ShuffleSplit from ._split import GroupShuffleSplit from ._split import StratifiedShuffleSplit +from ._split import StratifiedGroupShuffleSplit from ._split import PredefinedSplit from ._split import train_test_split from ._split import check_cv @@ -48,6 +49,7 @@ 'ShuffleSplit', 'StratifiedKFold', 'StratifiedShuffleSplit', + 'StratifiedGroupShuffleSplit', 'check_cv', 'cross_val_predict', 'cross_val_score', diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 993694ae4ab4b..f8f6765126479 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1727,6 +1727,111 @@ def split(self, X, y, groups=None): return super().split(X, y, groups) +class StratifiedGroupShuffleSplit(StratifiedShuffleSplit): + """Stratified GroupShuffleSplit cross-validator + + Provides randomized train/test indices to split data according to a + third-party provided group. This group information can be used to encode + arbitrary domain specific stratifications of the samples as integers. + + This cross-validation object is a merge of GroupShuffleSplit and + StratifiedShuffleSplit, which returns stratified randomized folds. The + folds are made by preserving the percentage of groups for each class. 
+ + Note: like the StratifiedShuffleSplit strategy, stratified random group + splits do not guarantee that all folds will be different, although this is + still very likely for sizeable datasets. + + Read more in the :ref:`User Guide <cross_validation>`. + + Parameters + ---------- + n_splits : int (default 5) + Number of re-shuffling & splitting iterations. + + test_size : float, int, None, optional (default=None) + If float, should be between 0.0 and 1.0 and represent the proportion + of groups to include in the test split (rounded up). If int, + represents the absolute number of test groups. If None, the value is + set to the complement of the train size. By default, the value is set + to 0.2. + + train_size : float, int, or None, default is None + If float, should be between 0.0 and 1.0 and represent the + proportion of the groups to include in the train split. If + int, represents the absolute number of train groups. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. 
+ + Examples + -------- + """ + + def __init__(self, n_splits=5, test_size=None, train_size=None, + random_state=None): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state) + self._default_test_size = 0.2 + + def _iter_indices(self, X, y, groups): + y = check_array(y, ensure_2d=False, dtype=None) + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, ensure_2d=False, dtype=None) + (unique_groups, unique_groups_y), group_indices = np.unique( + np.stack((groups, y)), axis=1, return_inverse=True) + if unique_groups.shape[0] != np.unique(groups).shape[0]: + raise ValueError("Members of each group must all be of the same " + "class.") + for group_train, group_test in super()._iter_indices( + X=unique_groups, y=unique_groups_y): + # these are the indices of unique_groups in the partition invert + # them into data indices + train = np.flatnonzero(np.in1d(group_indices, group_train)) + test = np.flatnonzero(np.in1d(group_indices, group_test)) + yield train, test + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + + y : array-like, shape (n_samples,), optional + The target variable for supervised learning problems. + + groups : array-like, with shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. 
+ """ + return super().split(X, y, groups) + + def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): """ From 6be3594be5337c17b45f790ba71f743e1dbae8bc Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Sun, 13 Oct 2019 16:34:09 -0400 Subject: [PATCH 2/8] Forgot to add to second __all__ list --- sklearn/model_selection/_split.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index f8f6765126479..12416c28c4ae9 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -40,6 +40,7 @@ 'GroupShuffleSplit', 'StratifiedKFold', 'StratifiedShuffleSplit', + 'StratifiedGroupShuffleSplit', 'PredefinedSplit', 'train_test_split', 'check_cv'] From 2f286739bb233f90119638edd26f37d721aa4d01 Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Sun, 13 Oct 2019 16:52:17 -0400 Subject: [PATCH 3/8] Update split method parameter doc --- sklearn/model_selection/_split.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 12416c28c4ae9..a08908751a374 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1809,8 +1809,13 @@ def split(self, X, y=None, groups=None): Training data, where n_samples is the number of samples and n_features is the number of features. - y : array-like, shape (n_samples,), optional + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like, shape (n_samples,) The target variable for supervised learning problems. + Stratification is done based on the y labels. 
groups : array-like, with shape (n_samples,) Group labels for the samples used while splitting the dataset into From 23657353d3c78bcd0bba0710be2781c898926944 Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Mon, 14 Oct 2019 07:01:49 -0400 Subject: [PATCH 4/8] Added example; changed default test_size to 0.1; added to author list --- sklearn/model_selection/_split.py | 37 +++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index a08908751a374..1acf900305c43 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -3,10 +3,11 @@ functions to split the data based on a preset strategy. """ -# Author: Alexandre Gramfort , -# Gael Varoquaux , +# Author: Alexandre Gramfort +# Gael Varoquaux # Olivier Grisel # Raghav RV +# Leandro Hermida # License: BSD 3 clause from collections.abc import Iterable @@ -1755,7 +1756,7 @@ class StratifiedGroupShuffleSplit(StratifiedShuffleSplit): of groups to include in the test split (rounded up). If int, represents the absolute number of test groups. If None, the value is set to the complement of the train size. By default, the value is set - to 0.2. + to 0.1. train_size : float, int, or None, default is None If float, should be between 0.0 and 1.0 and represent the @@ -1771,6 +1772,34 @@ class StratifiedGroupShuffleSplit(StratifiedShuffleSplit): Examples -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedGroupShuffleSplit + >>> X = np.ones(shape=(15, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6]) + >>> print(groups.shape) + (15,) + >>> sgss = StratifiedGroupShuffleSplit(n_splits=3, train_size=.7, + ... random_state=43) + >>> sgss.get_n_splits() + 3 + >>> for train_idx, test_idx in sgss.split(X, y, groups): + ... print("TRAIN:", groups[train_idx]) + ... 
print(" ", y[train_idx]) + ... print(" TEST:", groups[test_idx]) + ... print(" ", y[test_idx]) + TRAIN: [2 2 2 4 5 5 5 5 6 6] + [1 1 1 0 1 1 1 1 0 0] + TEST: [1 1 3 3 3] + [0 0 1 1 1] + TRAIN: [1 1 2 2 2 3 3 3 4] + [0 0 1 1 1 1 1 1 0] + TEST: [5 5 5 5 6 6] + [1 1 1 1 0 0] + TRAIN: [1 1 2 2 2 3 3 3 6 6] + [0 0 1 1 1 1 1 1 0 0] + TEST: [4 5 5 5 5] + [0 1 1 1 1] """ def __init__(self, n_splits=5, test_size=None, train_size=None, @@ -1780,7 +1809,7 @@ def __init__(self, n_splits=5, test_size=None, train_size=None, test_size=test_size, train_size=train_size, random_state=random_state) - self._default_test_size = 0.2 + self._default_test_size = 0.1 def _iter_indices(self, X, y, groups): y = check_array(y, ensure_2d=False, dtype=None) From aa8f288e2b10192952f97dc2121c17d3b1ccc2f4 Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Wed, 18 Mar 2020 15:49:53 -0400 Subject: [PATCH 5/8] StratifiedGroupKFold impl and other improvements --- sklearn/model_selection/_split.py | 155 +++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 11 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index afc58a58a9ea3..07d2fd582a144 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -419,10 +419,9 @@ class KFold(_BaseKFold): See also -------- - StratifiedKFold - Takes group information into account to avoid building folds with - imbalanced class distributions (for binary or multiclass - classification tasks). + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). GroupKFold: K-fold iterator variant with non-overlapping groups. @@ -735,6 +734,133 @@ def split(self, X, y, groups=None): return super().split(X, y, groups) +class StratifiedGroupKFold(StratifiedKFold): + """Stratified K-Folds iterator variant with non-overlapping groups. 
+ + This cross-validation object is a variation of StratifiedKFold that returns + folds stratified by group class. The folds are made by preserving the + percentage of groups for each class. + + The same group will not appear in two different folds (the number of + distinct groups has to be at least equal to the number of folds). + + The difference between GroupKFold and StratifiedGroupKFold is that + the former attempts to create balanced folds such that the number of + distinct groups is approximately the same in each fold, whereas + StratifiedGroupKFold attempts to create folds which preserve the + percentage of groups for each class. + + Read more in the :ref:`User Guide <cross_validation>`. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int or RandomState instance, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary <random_state>`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupKFold, StratifiedGroupKFold + >>> X = np.ones((17, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) + >>> cv = StratifiedGroupKFold(n_splits=3, random_state=777) + >>> for train_idxs, test_idxs in cv.split(X, y, groups): + ... print("TRAIN:", groups[train_idxs]) + ... print(" ", y[train_idxs]) + ... print(" TEST:", groups[test_idxs]) + ... 
print(" ", y[test_idxs]) + TRAIN: [3 3 3 4 6 6 7 8 8] + [1 1 1 1 0 0 0 0 0] + TEST: [1 1 2 2 5 5 5 5] + [0 0 1 1 0 0 0 0] + TRAIN: [1 1 2 2 4 5 5 5 5 8 8] + [0 0 1 1 1 0 0 0 0 0 0] + TEST: [3 3 3 6 6 7] + [1 1 1 0 0 0] + TRAIN: [1 1 2 2 3 3 3 5 5 5 5 6 6 7] + [0 0 1 1 1 1 1 0 0 0 0 0 0 0] + TEST: [4 8 8] + [1 0 0] + >>> cv = GroupKFold(n_splits=3) + >>> for train_idxs, test_idxs in cv.split(X, y, groups): + ... print("TRAIN:", groups[train_idxs]) + ... print(" ", y[train_idxs]) + ... print(" TEST:", groups[test_idxs]) + ... print(" ", y[test_idxs]) + TRAIN: [2 2 3 3 3 4 6 6 7 8 8] + [1 1 1 1 1 1 0 0 0 0 0] + TEST: [1 1 5 5 5 5] + [0 0 0 0 0 0] + TRAIN: [1 1 5 5 5 5 6 6 7 8 8] + [0 0 0 0 0 0 0 0 0 0 0] + TEST: [2 2 3 3 3 4] + [1 1 1 1 1 1] + TRAIN: [1 1 2 2 3 3 3 4 5 5 5 5] + [0 0 1 1 1 1 1 1 0 0 0 0] + TEST: [6 6 7 8 8] + [0 0 0 0 0] + + Notes + ----- + The implementation is designed to: + + * Generate test sets such that all contain the same distribution of + group classes, or as close as possible. + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Preserve order dependencies in the dataset ordering, when + ``shuffle=False``: all samples from class k in some test set were + contiguous in y, or separated in y by samples from classes other than k. + * Generate test sets where the smallest and largest differ by at most one + group. + + See also + -------- + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). + + GroupKFold: K-fold iterator variant with non-overlapping groups. 
+ """ + + def __init__(self, n_splits=5, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) + + def _iter_test_masks(self, X, y, groups): + y = check_array(y, ensure_2d=False, dtype=None) + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, ensure_2d=False, dtype=None) + (unique_groups, unique_groups_y), group_indices = np.unique( + np.stack((groups, y)), axis=1, return_inverse=True) + n_groups = len(unique_groups) + if self.n_splits > n_groups: + raise ValueError("Cannot have number of splits n_splits=%d greater" + " than the number of groups: %d." + % (self.n_splits, n_groups)) + if unique_groups.shape[0] != np.unique(groups).shape[0]: + raise ValueError("Members of each group must all be of the same " + "class.") + for group_test in super()._iter_test_masks(X=unique_groups, + y=unique_groups_y): + # this is the mask of unique_groups in the partition invert it into + # a data mask + yield np.in1d(group_indices, np.where(group_test)) + + class TimeSeriesSplit(_BaseKFold): """Time Series cross-validator @@ -1745,8 +1871,9 @@ class StratifiedGroupShuffleSplit(StratifiedShuffleSplit): arbitrary domain specific stratifications of the samples as integers. This cross-validation object is a merge of GroupShuffleSplit and - StratifiedShuffleSplit, which returns stratified randomized folds. The - folds are made by preserving the percentage of groups for each class. + StratifiedShuffleSplit, which returns randomized folds stratified by group + class. The folds are made by preserving the percentage of groups for each + class. 
Note: like the StratifiedShuffleSplit strategy, stratified random group splits do not guarantee that all folds will be different, although this is @@ -1756,23 +1883,23 @@ class StratifiedGroupShuffleSplit(StratifiedShuffleSplit): Parameters ---------- - n_splits : int (default 5) + n_splits : int, default=5 Number of re-shuffling & splitting iterations. - test_size : float, int, None, optional (default=None) + test_size : float, int, None, default=None If float, should be between 0.0 and 1.0 and represent the proportion of groups to include in the test split (rounded up). If int, represents the absolute number of test groups. If None, the value is set to the complement of the train size. By default, the value is set to 0.1. - train_size : float, int, or None, default is None + train_size : float, int, or None, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the groups to include in the train split. If int, represents the absolute number of train groups. If None, the value is automatically set to the complement of the test size. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance or None, default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -1808,6 +1935,12 @@ class StratifiedGroupShuffleSplit(StratifiedShuffleSplit): [0 0 1 1 1 1 1 1 0 0] TEST: [4 5 5 5 5] [0 1 1 1 1] + + See also + -------- + GroupShuffleSplit: Shuffle-Group(s)-Out iterator. + + StratifiedShuffleSplit: Stratified ShuffleSplit iterator. 
""" def __init__(self, n_splits=5, test_size=None, train_size=None, @@ -1837,7 +1970,7 @@ def _iter_indices(self, X, y, groups): test = np.flatnonzero(np.in1d(group_indices, group_test)) yield train, test - def split(self, X, y=None, groups=None): + def split(self, X, y, groups=None): """Generate indices to split data into training and test set. Parameters From 647a97e0b0f380975e7452729c7c8b03a4809c12 Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Wed, 18 Mar 2020 16:21:39 -0400 Subject: [PATCH 6/8] Add class to __all__ spec --- sklearn/model_selection/_split.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 07d2fd582a144..228789b785c7c 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -40,6 +40,7 @@ 'ShuffleSplit', 'GroupShuffleSplit', 'StratifiedKFold', + 'StratifiedGroupKFold', 'StratifiedShuffleSplit', 'StratifiedGroupShuffleSplit', 'PredefinedSplit', From 36babe5a71bdcc46e08a8f08e460e36133f7d5ba Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Wed, 18 Mar 2020 16:57:54 -0400 Subject: [PATCH 7/8] Remove random_state when no shuffle --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 228789b785c7c..7f0b9a29dc3ca 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -776,7 +776,7 @@ class StratifiedGroupKFold(StratifiedKFold): >>> X = np.ones((17, 2)) >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) - >>> cv = StratifiedGroupKFold(n_splits=3, random_state=777) + >>> cv = StratifiedGroupKFold(n_splits=3) >>> for train_idxs, test_idxs in cv.split(X, y, groups): ... print("TRAIN:", groups[train_idxs]) ... 
print(" ", y[train_idxs]) From 32e502ac8530cfe8c0a81bf43f17f8d8c972d9f4 Mon Sep 17 00:00:00 2001 From: Leandro Hermida Date: Wed, 18 Mar 2020 18:46:55 -0400 Subject: [PATCH 8/8] Tighter formatting --- sklearn/model_selection/_split.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 7f0b9a29dc3ca..a0f374df40a92 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1946,11 +1946,8 @@ class StratifiedGroupShuffleSplit(StratifiedShuffleSplit): def __init__(self, n_splits=5, test_size=None, train_size=None, random_state=None): - super().__init__( - n_splits=n_splits, - test_size=test_size, - train_size=train_size, - random_state=random_state) + super().__init__(n_splits=n_splits, test_size=test_size, + train_size=train_size, random_state=random_state) self._default_test_size = 0.1 def _iter_indices(self, X, y, groups):