From b3ebf3737dbcdc52a159948e2a4ab2d9f3846449 Mon Sep 17 00:00:00 2001
From: Soso Song
Date: Tue, 20 Apr 2021 01:51:42 -0400
Subject: [PATCH 01/11] Support gap and test_size in both code and document

---
 doc/modules/cross_validation.rst            | 140 +++--
 doc/modules/group_time_series_split.rst     | 121 ++++
 sklearn/model_selection/_split.py           | 393 ++++++-------
 sklearn/model_selection/tests/test_split.py | 581 ++++++++++++--------
 4 files changed, 758 insertions(+), 477 deletions(-)
 create mode 100644 doc/modules/group_time_series_split.rst

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 98b3c41ee5c72..7b77f333e3b56 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -219,7 +219,7 @@ following keys -
 ``['test_', 'test_', 'test_', 'fit_time', 'score_time']``

 ``return_train_score`` is set to ``False`` by default to save computation
 time.
-To evaluate the scores on the training set as well you need to
+To evaluate the scores on the training set as well you need to set it to
 ``True``.

 You may also retain the estimator fitted on each training set by setting
@@ -353,7 +353,7 @@ Example of 2-fold cross-validation on a dataset with 4 samples::
 Here is a visualization of the cross-validation behavior. Note that
 :class:`KFold` is not affected by classes or groups.

-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_004.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -509,7 +509,7 @@ Here is a usage example::

 Here is a visualization of the cross-validation behavior. Note that
 :class:`ShuffleSplit` is not affected by classes or groups.

-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -566,7 +566,7 @@ We can see that :class:`StratifiedKFold` preserves the class ratios

 Here is a visualization of the cross-validation behavior.

-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -585,7 +585,7 @@ percentage for each target class as in the complete set.

 Here is a visualization of the cross-validation behavior.

-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_012.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -645,58 +645,6 @@ size due to the imbalance in the data.

 Here is a visualization of the cross-validation behavior.

-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png
-   :target: ../auto_examples/model_selection/plot_cv_indices.html
-   :align: center
-   :scale: 75%
-
-.. _stratified_group_k_fold:
-
-StratifiedGroupKFold
-^^^^^^^^^^^^^^^^^^^^
-
-:class:`StratifiedGroupKFold` is a cross-validation scheme that combines both
-:class:`StratifiedKFold` and :class:`GroupKFold`. The idea is to try to
-preserve the distribution of classes in each split while keeping each group
-within a single split. That might be useful when you have an unbalanced
-dataset so that using just :class:`GroupKFold` might produce skewed splits.
-
-Example::
-
-  >>> from sklearn.model_selection import StratifiedGroupKFold
-  >>> X = list(range(18))
-  >>> y = [1] * 6 + [0] * 12
-  >>> groups = [1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]
-  >>> sgkf = StratifiedGroupKFold(n_splits=3)
-  >>> for train, test in sgkf.split(X, y, groups=groups):
-  ...     print("%s %s" % (train, test))
-  [ 0  2  3  4  5  6  7 10 11 15 16 17] [ 1  8  9 12 13 14]
-  [ 0  1  4  5  6  7  8  9 11 12 13 14] [ 2  3 10 15 16 17]
-  [ 1  2  3  8  9 10 12 13 14 15 16 17] [ 0  4  5  6  7 11]
-
-Implementation notes:
-
-- With the current implementation full shuffle is not possible in most
-  scenarios. When shuffle=True, the following happens:
-
-  1. All groups are shuffled.
-  2. Groups are sorted by standard deviation of classes using stable sort.
-  3. Sorted groups are iterated over and assigned to folds.
-
-  That means that only groups with the same standard deviation of class
-  distribution will be shuffled, which might be useful when each group has only
-  a single class.
-- The algorithm greedily assigns each group to one of n_splits test sets,
-  choosing the test set that minimises the variance in class distribution
-  across test sets. Group assignment proceeds from groups with highest to
-  lowest variance in class frequency, i.e. large groups peaked on one or few
-  classes are assigned first.
-- This split is suboptimal in a sense that it might produce imbalanced splits
-  even if perfect stratification is possible. If you have relatively close
-  distribution of classes in each group, using :class:`GroupKFold` is better.
-
-Here is a visualization of cross-validation behavior for uneven groups:
-
 .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_005.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -785,7 +733,7 @@ Here is a usage example::

 Here is a visualization of the cross-validation behavior.

-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_011.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%
@@ -813,7 +761,7 @@ samples that are part of the validation set, and to -1 for all other samples.
 Using cross-validation iterators to split train and test
 --------------------------------------------------------

 The above group cross-validation functions may also be useful for splitting a
 dataset into training and testing subsets. Note that the convenience function
 :func:`train_test_split` is a wrapper around :func:`ShuffleSplit`
 and thus only allows for stratified splitting (using the class labels)
@@ -887,11 +835,83 @@ Example of 3-split time series cross-validation on a dataset with 6 samples::

 Here is a visualization of the cross-validation behavior.

-.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_013.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_010.png
    :target: ../auto_examples/model_selection/plot_cv_indices.html
    :align: center
    :scale: 75%

+Group Time Series Split
+^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`GroupTimeSeriesSplit` combines :class:`TimeSeriesSplit` with the group
+awareness of :class:`GroupKFold`. Like :class:`TimeSeriesSplit`, it returns
+the first :math:`k` folds as the train set and the :math:`(k+1)`-th fold as
+the test set.
+Successive training sets are supersets of those that come before them.
+It also adds all surplus data to the first training partition, which
+is always used to train the model.
+This class can be used to cross-validate time series data samples
+that are observed at fixed time intervals.
+
+The same group will not appear in two different folds (the number of
+distinct groups has to be at least equal to the number of folds).
+
+The groups should be continuous, like below::
+
+  ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd']
+
+Non-continuous groups like below will give an error::
+
+  ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'a', 'c', 'c', 'c', 'b', 'd', 'd']
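+
+For instance, splitting with the non-continuous groups above fails. A small
+sketch of the failure mode::
+
+  >>> import numpy as np
+  >>> from sklearn.model_selection import GroupTimeSeriesSplit
+  >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b',
+  ...                    'b', 'a', 'c', 'c', 'c', 'b', 'd', 'd'])
+  >>> next(GroupTimeSeriesSplit(n_splits=3).split(groups, groups=groups))
+  Traceback (most recent call last):
+  ...
+  ValueError: The groups should be continuous. Found a non-continuous group at index=11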
+
+:class:`GroupTimeSeriesSplit` is useful in cases where we have time series
+data for, say, multiple days with multiple data points within each day.
+During cross-validation we may not want the training days to be used in
+testing. Here the days can act as groups to keep the training and test
+splits separate.
+
+Example of 3-split time series cross-validation on a dataset with
+18 samples and 4 groups::
+
+  >>> import numpy as np
+  >>> from sklearn.model_selection import GroupTimeSeriesSplit
+  >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',
+  ...                    'b', 'b', 'b', 'b', 'b',
+  ...                    'c', 'c', 'c', 'c',
+  ...                    'd', 'd', 'd'])
+  >>> gtss = GroupTimeSeriesSplit(n_splits=3)
+  >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
+  ...     print("TRAIN:", train_idx, "TEST:", test_idx)
+  ...     print("TRAIN GROUP:", groups[train_idx],
+  ...           "TEST GROUP:", groups[test_idx])
+  TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
+  TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']
+  TEST GROUP: ['b' 'b' 'b' 'b' 'b']
+  TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
+  TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']
+  TEST GROUP: ['c' 'c' 'c' 'c']
+  TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+  TEST: [15, 16, 17]
+  TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']
+  TEST GROUP: ['d' 'd' 'd']
+
+Example of 2-split time series cross-validation on the same dataset of
+18 samples and 4 groups, with ``test_size=1``, ``max_train_size=3`` and a
+``gap`` of 1 group::
+
+  >>> import numpy as np
+  >>> from sklearn.model_selection import GroupTimeSeriesSplit
+  >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',
+  ...                    'b', 'b', 'b', 'b', 'b',
+  ...                    'c', 'c', 'c', 'c',
+  ...                    'd', 'd', 'd'])
+  >>> gtss = GroupTimeSeriesSplit(n_splits=2, test_size=1, gap=1,
+  ...                             max_train_size=3)
+  >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
+  ...     print("TRAIN:", train_idx, "TEST:", test_idx)
+  ...     print("TRAIN GROUP:", groups[train_idx],
+  ...           "TEST GROUP:", groups[test_idx])
+  TRAIN: [0, 1, 2, 3, 4, 5] TEST: [11, 12, 13, 14]
+  TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a'] TEST GROUP: ['c' 'c' 'c' 'c']
+  TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [15, 16, 17]
+  TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']
+  TEST GROUP: ['d' 'd' 'd']
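+
+Like the other splitters, a :class:`GroupTimeSeriesSplit` instance can be
+passed as ``cv`` to helpers such as :func:`cross_val_score`, together with
+the ``groups`` argument. A minimal sketch (assuming ``X``, ``y`` and
+``groups`` arrays of matching length are already defined)::
+
+  from sklearn.linear_model import Ridge
+  from sklearn.model_selection import cross_val_score
+
+  # cross_val_score forwards `groups` to the splitter's split()
+  scores = cross_val_score(Ridge(), X, y, groups=groups,
+                           cv=GroupTimeSeriesSplit(n_splits=3))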
+
 A note on shuffling
 ===================
diff --git a/doc/modules/group_time_series_split.rst b/doc/modules/group_time_series_split.rst
new file mode 100644
index 0000000000000..765ccd20ef4f1
--- /dev/null
+++ b/doc/modules/group_time_series_split.rst
@@ -0,0 +1,121 @@
+
+.. _GroupTimeSeriesSplit:
+
+=================================================
+sklearn.model_selection.GroupTimeSeriesSplit
+=================================================
+
+.. code-block:: python
+
+    class sklearn.model_selection.GroupTimeSeriesSplit(n_splits=5, *, max_train_size=None, test_size=None, gap=0)
+
+| *GroupTimeSeriesSplit* combines *TimeSeriesSplit* with the group awareness
+  of *GroupKFold*.
+|
+| Like *TimeSeriesSplit*, this also returns the first *k* folds as the train
+  set and the *(k+1)*-th fold as the test set.
+|
+| Since groups apply to this class, the same group will not appear in two
+  different folds (the number of distinct groups has to be at least equal to
+  the number of folds), which makes sure the i.i.d. assumption will not be
+  broken.
+
+| All operations of this CV strategy are done at the group level.
+| All parameters, including ``test_size``, ``gap`` and ``max_train_size``,
+  therefore represent constraints on the number of groups.
+
+Parameters:
+-----------
+| **n_splits: int, default=5**
+|
+| Number of splits. Must be at least 2.
+|
+| **max_train_size: int, default=None**
+|
+| Maximum number of groups for a single training set.
+|
+| **test_size: int, default=None**
+|
+| Used to limit the number of groups in the test set. Defaults to
+  ``n_groups // (n_splits + 1)``, which is the maximum allowed value
+  with ``gap=0``.
+|
+| **gap: int, default=0**
+|
+| Number of groups to exclude from the end of each train set before
+  the test set.
+
+Example 1:
+----------
+
+.. code-block:: python
+
+    >>> import numpy as np
+    >>> from sklearn.model_selection import GroupTimeSeriesSplit
+    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',
+    ...                    'b', 'b', 'b', 'b', 'b',
+    ...                    'c', 'c', 'c', 'c',
+    ...                    'd', 'd', 'd'])
+    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
+    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
+    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
+    ...     print("TRAIN GROUP:", groups[train_idx],
+    ...           "TEST GROUP:", groups[test_idx])
+    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']
+    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
+    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']
+    TEST GROUP: ['c' 'c' 'c' 'c']
+    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+    TEST: [15, 16, 17]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']
+    TEST GROUP: ['d' 'd' 'd']
+
+Example 2:
+----------
+
+.. code-block:: python
+
+    >>> import numpy as np
+    >>> from sklearn.model_selection import GroupTimeSeriesSplit
+    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',
+    ...                    'b', 'b', 'b', 'b', 'b',
+    ...                    'c', 'c', 'c', 'c',
+    ...                    'd', 'd', 'd'])
+    >>> gtss = GroupTimeSeriesSplit(n_splits=2, test_size=1, gap=1,
+    ...                             max_train_size=3)
+    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
+    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
+    ...     print("TRAIN GROUP:", groups[train_idx],
+    ...           "TEST GROUP:", groups[test_idx])
+    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [11, 12, 13, 14]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a'] TEST GROUP: ['c' 'c' 'c' 'c']
+    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [15, 16, 17]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']
+    TEST GROUP: ['d' 'd' 'd']
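+
+Example 3 (usage with a search):
+--------------------------------
+A sketch only: ``X``, ``y`` and ``groups`` are assumed to be defined
+elsewhere, and any estimator could stand in for ``SVC``.
+
+.. code-block:: python
+
+    from sklearn.model_selection import GridSearchCV
+    from sklearn.svm import SVC
+
+    # pass the splitter as cv; groups must be forwarded to fit()
+    search = GridSearchCV(SVC(), {'C': [1, 10]},
+                          cv=GroupTimeSeriesSplit(n_splits=3))
+    search.fit(X, y, groups=groups)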
+
+Methods:
+--------
+| **get_n_splits([X, y, groups])**
+|
+| Returns the number of splitting iterations in the cross-validator.
+| *Parameters:*
+| *X: object*
+| Always ignored, exists for compatibility.
+| *y: object* +| Always ignored, exists for compatibility. +| *groups: object* +| Always ignored, exists for compatibility. +| *Returns:* +| *n_splits: int* +| Returns the number of splitting iterations in the cross-validator. +| +| **split(X[groups, y])** +| +| Generate indices to split data into training and test set by group. +| *Parameters:* +| *X : array-like of shape (n_samples, n_features)* +| Training data, where n_samples is the number of samples +| and n_features is the number of features. +| *y : array-like of shape (n_samples,)* +| Always ignored, exists for compatibility. +| *groups : array-like of shape (n_samples,)* +| Group labels for the samples used while splitting the dataset into +| train/test set. +| *Yields:* +| *train : ndarray* +| The training set indices for that split. +| *test : ndarray* +| The testing set indices for that split. + diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 13edbeef071f5..2979a33b61785 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -3,16 +3,13 @@ functions to split the data based on a preset strategy. """ -# Author: Alexandre Gramfort -# Gael Varoquaux +# Author: Alexandre Gramfort , +# Gael Varoquaux , # Olivier Grisel # Raghav RV -# Leandro Hermida -# Rodion Martynov # License: BSD 3 clause from collections.abc import Iterable -from collections import defaultdict import warnings from itertools import chain, combinations from math import ceil, floor @@ -43,11 +40,11 @@ 'ShuffleSplit', 'GroupShuffleSplit', 'StratifiedKFold', - 'StratifiedGroupKFold', 'StratifiedShuffleSplit', 'PredefinedSplit', 'train_test_split', - 'check_cv'] + 'check_cv', + 'GroupTimeSeriesSplit'] class BaseCrossValidator(metaclass=ABCMeta): @@ -736,190 +733,6 @@ def split(self, X, y, groups=None): return super().split(X, y, groups) -class StratifiedGroupKFold(_BaseKFold): - """Stratified K-Folds iterator variant with non-overlapping groups. - - This cross-validation object is a variation of StratifiedKFold attempts to - return stratified folds with non-overlapping groups. The folds are made by - preserving the percentage of samples for each class. - - The same group will not appear in two different folds (the number of - distinct groups has to be at least equal to the number of folds). - - The difference between GroupKFold and StratifiedGroupKFold is that - the former attempts to create balanced folds such that the number of - distinct groups is approximately the same in each fold, whereas - StratifiedGroupKFold attempts to create folds which preserve the - percentage of samples for each class as much as possible given the - constraint of non-overlapping groups between splits. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_splits : int, default=5 - Number of folds. Must be at least 2. - - shuffle : bool, default=False - Whether to shuffle each class's samples before splitting into batches. - Note that the samples within each split will not be shuffled. - This implementation can only shuffle groups that have approximately the - same y distribution, no global shuffle will be performed. - - random_state : int or RandomState instance, default=None - When `shuffle` is True, `random_state` affects the ordering of the - indices, which controls the randomness of each fold for each class. - Otherwise, leave `random_state` as `None`. - Pass an int for reproducible output across multiple function calls. - See :term:`Glossary `. 
- - Examples - -------- - >>> import numpy as np - >>> from sklearn.model_selection import StratifiedGroupKFold - >>> X = np.ones((17, 2)) - >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) - >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) - >>> cv = StratifiedGroupKFold(n_splits=3) - >>> for train_idxs, test_idxs in cv.split(X, y, groups): - ... print("TRAIN:", groups[train_idxs]) - ... print(" ", y[train_idxs]) - ... print(" TEST:", groups[test_idxs]) - ... print(" ", y[test_idxs]) - TRAIN: [1 1 2 2 4 5 5 5 5 8 8] - [0 0 1 1 1 0 0 0 0 0 0] - TEST: [3 3 3 6 6 7] - [1 1 1 0 0 0] - TRAIN: [3 3 3 4 5 5 5 5 6 6 7] - [1 1 1 1 0 0 0 0 0 0 0] - TEST: [1 1 2 2 8 8] - [0 0 1 1 0 0] - TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8] - [0 0 1 1 1 1 1 0 0 0 0 0] - TEST: [4 5 5 5 5] - [1 0 0 0 0] - - Notes - ----- - The implementation is designed to: - - * Mimic the behavior of StratifiedKFold as much as possible for trivial - groups (e.g. when each group contains only one sample). - * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to - ``y = [1, 0]`` should not change the indices generated. - * Stratify based on samples as much as possible while keeping - non-overlapping groups constraint. That means that in some cases when - there is a small number of groups containing a large number of samples - the stratification will not be possible and the behavior will be close - to GroupKFold. - - See also - -------- - StratifiedKFold: Takes class information into account to build folds which - retain class distributions (for binary or multiclass classification - tasks). - - GroupKFold: K-fold iterator variant with non-overlapping groups. - """ - - def __init__(self, n_splits=5, shuffle=False, random_state=None): - super().__init__(n_splits=n_splits, shuffle=shuffle, - random_state=random_state) - - def _iter_test_indices(self, X, y, groups): - # Implementation is based on this kaggle kernel: - # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation - # and is a subject to Apache 2.0 License. You may obtain a copy of the - # License at http://www.apache.org/licenses/LICENSE-2.0 - # Changelist: - # - Refactored function to a class following scikit-learn KFold - # interface. - # - Added heuristic for assigning group to the least populated fold in - # cases when all other criteria are equal - # - Swtch from using python ``Counter`` to ``np.unique`` to get class - # distribution - # - Added scikit-learn checks for input: checking that target is binary - # or multiclass, checking passed random state, checking that number - # of splits is less than number of members in each class, checking - # that least populated class has more members than there are splits. - rng = check_random_state(self.random_state) - y = np.asarray(y) - type_of_target_y = type_of_target(y) - allowed_target_types = ('binary', 'multiclass') - if type_of_target_y not in allowed_target_types: - raise ValueError( - 'Supported target types are: {}. Got {!r} instead.'.format( - allowed_target_types, type_of_target_y)) - - y = column_or_1d(y) - _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) - if np.all(self.n_splits > y_cnt): - raise ValueError("n_splits=%d cannot be greater than the" - " number of members in each class." - % (self.n_splits)) - n_smallest_class = np.min(y_cnt) - if self.n_splits > n_smallest_class: - warnings.warn(("The least populated class in y has only %d" - " members, which is less than n_splits=%d." 
- % (n_smallest_class, self.n_splits)), UserWarning) - n_classes = len(y_cnt) - - _, groups_inv, groups_cnt = np.unique( - groups, return_inverse=True, return_counts=True) - y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) - for class_idx, group_idx in zip(y_inv, groups_inv): - y_counts_per_group[group_idx, class_idx] += 1 - - y_counts_per_fold = np.zeros((self.n_splits, n_classes)) - groups_per_fold = defaultdict(set) - - if self.shuffle: - rng.shuffle(y_counts_per_group) - - # Stable sort to keep shuffled order for groups with the same - # class distribution variance - sorted_groups_idx = np.argsort(-np.std(y_counts_per_group, axis=1), - kind='mergesort') - - for group_idx in sorted_groups_idx: - group_y_counts = y_counts_per_group[group_idx] - best_fold = self._find_best_fold( - y_counts_per_fold=y_counts_per_fold, y_cnt=y_cnt, - group_y_counts=group_y_counts) - y_counts_per_fold[best_fold] += group_y_counts - groups_per_fold[best_fold].add(group_idx) - - for i in range(self.n_splits): - test_indices = [idx for idx, group_idx in enumerate(groups_inv) - if group_idx in groups_per_fold[i]] - yield test_indices - - def _find_best_fold( - self, y_counts_per_fold, y_cnt, group_y_counts): - best_fold = None - min_eval = np.inf - min_samples_in_fold = np.inf - for i in range(self.n_splits): - y_counts_per_fold[i] += group_y_counts - # Summarise the distribution over classes in each proposed fold - std_per_class = np.std( - y_counts_per_fold / y_cnt.reshape(1, -1), - axis=0) - y_counts_per_fold[i] -= group_y_counts - fold_eval = np.mean(std_per_class) - samples_in_fold = np.sum(y_counts_per_fold[i]) - is_current_fold_better = ( - fold_eval < min_eval or - np.isclose(fold_eval, min_eval) - and samples_in_fold < min_samples_in_fold - ) - if is_current_fold_better: - min_eval = fold_eval - min_samples_in_fold = samples_in_fold - best_fold = i - return best_fold - - class TimeSeriesSplit(_BaseKFold): """Time Series cross-validator @@ -2429,6 +2242,204 @@ def _build_repr(self): return '%s(%s)' % (class_name, _pprint(params, offset=len(class_name))) +class GroupTimeSeriesSplit(_BaseKFold): + """Time Series cross-validator variant with non-overlapping groups. + + Provides train/test indices to split time series data samples + that are observed at fixed time intervals according to a + third-party provided group. + In each split, test indices must be higher than before, and thus shuffling + in cross validator is inappropriate. + + This cross-validation object is a variation of :class:`KFold`. + In the kth split, it returns first k folds as train set and the + (k+1)th fold as test set. + + The same group will not appear in two different folds (the number of + distinct groups has to be at least equal to the number of folds). + + Note that unlike standard cross-validation methods, successive + training sets are supersets of those that come before them. + + The groups should be continuous. For Example: + np.array(['a', 'a', 'a', 'a', 'a', 'a',\ + 'b', 'b', 'b', 'b', 'b',\ + 'c', 'c', 'c', 'c',\ + 'd', 'd', 'd']) + + Non-continuous groups like below will give an error. + np.array(['a', 'a', 'a', 'a', 'a', 'a',\ + 'b', 'b', 'b', 'b', 'b',\ + 'a', 'c', 'c', 'c',\ + 'b', 'd', 'd']) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of splits. Must be at least 2. + + max_train_size : int, default=None + Maximum number of groups for a single training set. + + test_size : int, default=None + Used to limit the number of groups in the test set. 
Defaults to
+        ``n_groups // (n_splits + 1)``, which is the maximum allowed value
+        with ``gap=0``.
+
+    gap : int, default=0
+        Number of groups to exclude from the end of each train set before
+        the test set.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.model_selection import GroupTimeSeriesSplit
+    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',
+    ...                    'b', 'b', 'b', 'b', 'b',
+    ...                    'c', 'c', 'c', 'c',
+    ...                    'd', 'd', 'd'])
+    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
+    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
+    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
+    ...     print("TRAIN GROUP:", groups[train_idx],
+    ...           "TEST GROUP:", groups[test_idx])
+    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']
+    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
+    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']
+    TEST GROUP: ['c' 'c' 'c' 'c']
+    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+    TEST: [15, 16, 17]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']
+    TEST GROUP: ['d' 'd' 'd']
+    >>> # Fix test_size to 1, max_train_size to 3, and add in a 1 period gap
+    >>> gtss = GroupTimeSeriesSplit(n_splits=2, test_size=1, gap=1,
+    ...                             max_train_size=3)
+    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
+    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
+    ...     print("TRAIN GROUP:", groups[train_idx],
+    ...           "TEST GROUP:", groups[test_idx])
+    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [11, 12, 13, 14]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a'] TEST GROUP: ['c' 'c' 'c' 'c']
+    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [15, 16, 17]
+    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']
+    TEST GROUP: ['d' 'd' 'd']
+    """
+    @_deprecate_positional_args
+    def __init__(self,
+                 n_splits=5,
+                 *,
+                 max_train_size=None,
+                 test_size=None,
+                 gap=0):
+        super().__init__(n_splits, shuffle=False, random_state=None)
+        self.max_train_size = max_train_size
+        self.test_size = test_size
+        self.gap = gap
+
+    def split(self, X, y=None, groups=None):
+        """Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, where n_samples is the number of samples
+            and n_features is the number of features.
+
+        y : array-like of shape (n_samples,)
+            Always ignored, exists for compatibility.
+
+        groups : array-like of shape (n_samples,)
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+
+        Yields
+        ------
+        train : list of int
+            The training set indices for that split (yielded as plain
+            Python lists of ints).
+
+        test : list of int
+            The testing set indices for that split.
+ """ + if groups is None: + raise ValueError( + "The 'groups' parameter should not be None") + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + n_splits = self.n_splits + n_folds = n_splits + 1 + gap = self.gap + max_train_size = self.max_train_size + group_dict = {} + u, ind = np.unique(groups, return_index=True) + unique_groups = u[np.argsort(ind)] + n_samples = _num_samples(X) + n_groups = _num_samples(unique_groups) + # test size is handled here + group_test_size = self.test_size if self.test_size is not None \ + else n_groups // n_folds + for idx in np.arange(n_samples): + if (groups[idx] in group_dict): + if (idx - group_dict[groups[idx]][-1] == 1): + group_dict[groups[idx]].append(idx) + else: + raise ValueError( + ("The groups should be continuous." + " Found a non-continuous group at" + " index={0}").format(idx)) + else: + group_dict[groups[idx]] = [idx] + if n_folds > n_groups: + raise ValueError( + (f"Cannot have number of folds={n_folds} greater" + f" than the number of groups={n_groups}.")) + if n_groups - gap - (group_test_size * n_splits) <= 0: + raise ValueError( + (f"Too many splits={n_splits} for number of groups" + f"={n_groups} with test_size={group_test_size} and gap={gap}.")) + + for group_test_start in range(n_groups - n_splits * group_test_size, + n_groups, group_test_size): + train_array = [] + test_array = [] + train_group_idxs = unique_groups[:group_test_start] + train_end = train_group_idxs.size + # handle gap: remove gap amount of groups from the end of + # train_group_idxs + if gap: + train_group_idxs = train_group_idxs[:train_end - gap] + train_end -= gap + # handle max_train_size: remove max_train_size amount of group + # from the beginning of train_group_idxs + if max_train_size and max_train_size < train_end: + train_group_idxs = train_group_idxs[train_end - + max_train_size:train_end] + for train_group_idx in train_group_idxs: + train_array_tmp = group_dict[train_group_idx] + train_array = np.sort(np.unique( + np.concatenate((train_array, + train_array_tmp)), + axis=None), axis=None) + for test_group_idx in unique_groups[group_test_start: + group_test_start + + group_test_size]: + test_array_tmp = group_dict[test_group_idx] + test_array = np.sort(np.unique( + np.concatenate((test_array, + test_array_tmp)), + axis=None), axis=None) + yield [int(i) for i in train_array], [int(i) for i in test_array] + + def _yields_constant_splits(cv): # Return True if calling cv.split() always returns the same splits # We assume that if a cv doesn't have a shuffle parameter, it shuffles by diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index c66d8e1836ac9..42744e6afb6ed 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1,7 +1,6 @@ """Test the split module""" import warnings import pytest -import re import numpy as np from scipy.sparse import coo_matrix, csc_matrix, csr_matrix from scipy import stats @@ -10,9 +9,10 @@ from itertools import combinations_with_replacement from itertools import permutations -from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_allclose, assert_raises_regexp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples from sklearn.utils._mocking import 
MockDataFrame @@ -35,7 +35,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RepeatedKFold from sklearn.model_selection import RepeatedStratifiedKFold -from sklearn.model_selection import StratifiedGroupKFold +from sklearn.model_selection import GroupTimeSeriesSplit from sklearn.linear_model import Ridge @@ -81,7 +81,6 @@ def test_cross_validator_with_default_params(): lopo = LeavePGroupsOut(p) ss = ShuffleSplit(random_state=0) ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 - sgkf = StratifiedGroupKFold(n_splits) loo_repr = "LeaveOneOut()" lpo_repr = "LeavePOut(p=2)" @@ -92,17 +91,15 @@ def test_cross_validator_with_default_params(): ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, " "test_size=None, train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" - sgkf_repr = ("StratifiedGroupKFold(n_splits=2, random_state=None, " - "shuffle=False)") n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits, n_unique_groups, comb(n_unique_groups, p), - n_shuffle_splits, 2, n_splits] + n_shuffle_splits, 2] for i, (cv, cv_repr) in enumerate(zip( - [loo, lpo, kf, skf, lolo, lopo, ss, ps, sgkf], + [loo, lpo, kf, skf, lolo, lopo, ss, ps], [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr, - ss_repr, ps_repr, sgkf_repr])): + ss_repr, ps_repr])): # Test if get_n_splits works correctly assert n_splits_expected[i] == cv.get_n_splits(X, y, groups) @@ -120,10 +117,10 @@ def test_cross_validator_with_default_params(): # ValueError for get_n_splits methods msg = "The 'X' parameter should not be None." - with pytest.raises(ValueError, match=msg): - loo.get_n_splits(None, y, groups) - with pytest.raises(ValueError, match=msg): - lpo.get_n_splits(None, y, groups) + assert_raise_message(ValueError, msg, + loo.get_n_splits, None, y, groups) + assert_raise_message(ValueError, msg, + lpo.get_n_splits, None, y, groups) def test_2d_y(): @@ -137,11 +134,10 @@ def test_2d_y(): groups = rng.randint(0, 3, size=(n_samples,)) splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(), RepeatedKFold(), RepeatedStratifiedKFold(), - StratifiedGroupKFold(), ShuffleSplit(), - StratifiedShuffleSplit(test_size=.5), GroupShuffleSplit(), - LeaveOneGroupOut(), LeavePGroupsOut(n_groups=2), - GroupKFold(n_splits=3), TimeSeriesSplit(), - PredefinedSplit(test_fold=groups)] + ShuffleSplit(), StratifiedShuffleSplit(test_size=.5), + GroupShuffleSplit(), LeaveOneGroupOut(), + LeavePGroupsOut(n_groups=2), GroupKFold(n_splits=3), + TimeSeriesSplit(), PredefinedSplit(test_fold=groups)] for splitter in splitters: list(splitter.split(X, y, groups)) list(splitter.split(X, y_2d, groups)) @@ -198,11 +194,6 @@ def test_kfold_valueerrors(): with pytest.warns(Warning, match="The least populated class"): next(skf_3.split(X2, y)) - sgkf_3 = StratifiedGroupKFold(3) - naive_groups = np.arange(len(y)) - with pytest.warns(Warning, match="The least populated class"): - next(sgkf_3.split(X2, y, naive_groups)) - # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each # side of the split at each split @@ -210,20 +201,12 @@ def test_kfold_valueerrors(): warnings.simplefilter("ignore") check_cv_coverage(skf_3, X2, y, groups=None, expected_n_splits=3) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - check_cv_coverage( - sgkf_3, X2, y, groups=naive_groups, expected_n_splits=3 - ) - # Check that errors are raised if all n_groups for individual 
# classes are less than n_splits. y = np.array([3, 3, -1, -1, 2]) with pytest.raises(ValueError): next(skf_3.split(X2, y)) - with pytest.raises(ValueError): - next(sgkf_3.split(X2, y)) # Error when number of folds is <= 1 with pytest.raises(ValueError): @@ -232,14 +215,10 @@ def test_kfold_valueerrors(): KFold(1) error_string = ("k-fold cross-validation requires at least one" " train/test split") - with pytest.raises(ValueError, match=error_string): - StratifiedKFold(0) - with pytest.raises(ValueError, match=error_string): - StratifiedKFold(1) - with pytest.raises(ValueError, match=error_string): - StratifiedGroupKFold(0) - with pytest.raises(ValueError, match=error_string): - StratifiedGroupKFold(1) + assert_raise_message(ValueError, error_string, + StratifiedKFold, 0) + assert_raise_message(ValueError, error_string, + StratifiedKFold, 1) # When n_splits is not integer: with pytest.raises(ValueError): @@ -250,10 +229,6 @@ def test_kfold_valueerrors(): StratifiedKFold(1.5) with pytest.raises(ValueError): StratifiedKFold(2.0) - with pytest.raises(ValueError): - StratifiedGroupKFold(1.5) - with pytest.raises(ValueError): - StratifiedGroupKFold(2.0) # When shuffle is not a bool: with pytest.raises(TypeError): @@ -344,8 +319,7 @@ def test_stratified_kfold_no_shuffle(): @pytest.mark.parametrize('shuffle', [False, True]) @pytest.mark.parametrize('k', [4, 5, 6, 7, 8, 9, 10]) -@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) -def test_stratified_kfold_ratios(k, shuffle, kfold): +def test_stratified_kfold_ratios(k, shuffle): # Check that stratified kfold preserves class ratios in individual splits # Repeat with shuffling turned off and on n_samples = 1000 @@ -353,14 +327,12 @@ def test_stratified_kfold_ratios(k, shuffle, kfold): y = np.array([4] * int(0.10 * n_samples) + [0] * int(0.89 * n_samples) + [1] * int(0.01 * n_samples)) - # ensure perfect stratification with StratifiedGroupKFold - groups = np.arange(len(y)) distr = np.bincount(y) / len(y) test_sizes = [] random_state = None if not shuffle else 0 - skf = kfold(k, random_state=random_state, shuffle=shuffle) - for train, test in skf.split(X, y, groups=groups): + skf = StratifiedKFold(k, random_state=random_state, shuffle=shuffle) + for train, test in skf.split(X, y): assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) test_sizes.append(len(test)) @@ -369,23 +341,20 @@ def test_stratified_kfold_ratios(k, shuffle, kfold): @pytest.mark.parametrize('shuffle', [False, True]) @pytest.mark.parametrize('k', [4, 6, 7]) -@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) -def test_stratified_kfold_label_invariance(k, shuffle, kfold): +def test_stratified_kfold_label_invariance(k, shuffle): # Check that stratified kfold gives the same indices regardless of labels n_samples = 100 y = np.array([2] * int(0.10 * n_samples) + [0] * int(0.89 * n_samples) + [1] * int(0.01 * n_samples)) X = np.ones(len(y)) - # ensure perfect stratification with StratifiedGroupKFold - groups = np.arange(len(y)) def get_splits(y): random_state = None if not shuffle else 0 return [(list(train), list(test)) for train, test - in kfold(k, random_state=random_state, - shuffle=shuffle).split(X, y, groups=groups)] + in StratifiedKFold(k, random_state=random_state, + shuffle=shuffle).split(X, y)] splits_base = get_splits(y) for perm in permutations([0, 1, 2]): @@ -404,20 +373,17 @@ def test_kfold_balance(): assert np.sum(sizes) == i 
-@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) -def test_stratifiedkfold_balance(kfold): +def test_stratifiedkfold_balance(): # Check that KFold returns folds with balanced sizes (only when # stratification is possible) # Repeat with shuffling turned off and on X = np.ones(17) y = [0] * 3 + [1] * 14 - # ensure perfect stratification with StratifiedGroupKFold - groups = np.arange(len(y)) for shuffle in (True, False): - cv = kfold(3, shuffle=shuffle) + cv = StratifiedKFold(3, shuffle=shuffle) for i in range(11, 17): - skf = cv.split(X[:i], y[:i], groups[:i]) + skf = cv.split(X[:i], y[:i]) sizes = [len(test) for _, test in skf] assert (np.max(sizes) - np.min(sizes)) <= 1 @@ -446,39 +412,39 @@ def test_shuffle_kfold(): assert sum(all_folds) == 300 -@pytest.mark.parametrize("kfold", - [KFold, StratifiedKFold, StratifiedGroupKFold]) -def test_shuffle_kfold_stratifiedkfold_reproducibility(kfold): +def test_shuffle_kfold_stratifiedkfold_reproducibility(): X = np.ones(15) # Divisible by 3 y = [0] * 7 + [1] * 8 - groups_1 = np.arange(len(y)) X2 = np.ones(16) # Not divisible by 3 y2 = [0] * 8 + [1] * 8 - groups_2 = np.arange(len(y2)) # Check that when the shuffle is True, multiple split calls produce the # same split when random_state is int - kf = kfold(3, shuffle=True, random_state=0) + kf = KFold(3, shuffle=True, random_state=0) + skf = StratifiedKFold(3, shuffle=True, random_state=0) - np.testing.assert_equal( - list(kf.split(X, y, groups_1)), - list(kf.split(X, y, groups_1)) - ) + for cv in (kf, skf): + np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) + np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) # Check that when the shuffle is True, multiple split calls often # (not always) produce different splits when random_state is # RandomState instance or None - kf = kfold(3, shuffle=True, random_state=np.random.RandomState(0)) - for data in zip((X, X2), (y, y2), (groups_1, groups_2)): - # Test if the two splits are different cv - for (_, test_a), (_, test_b) in zip(kf.split(*data), - kf.split(*data)): - # cv.split(...) returns an array of tuples, each tuple - # consisting of an array with train indices and test indices - # Ensure that the splits for data are not same - # when random state is not set - with pytest.raises(AssertionError): - np.testing.assert_array_equal(test_a, test_b) + kf = KFold(3, shuffle=True, random_state=np.random.RandomState(0)) + skf = StratifiedKFold(3, shuffle=True, + random_state=np.random.RandomState(0)) + + for cv in (kf, skf): + for data in zip((X, X2), (y, y2)): + # Test if the two splits are different cv + for (_, test_a), (_, test_b) in zip(cv.split(*data), + cv.split(*data)): + # cv.split(...) 
returns an array of tuples, each tuple + # consisting of an array with train indices and test indices + # Ensure that the splits for data are not same + # when random state is not set + with pytest.raises(AssertionError): + np.testing.assert_array_equal(test_a, test_b) def test_shuffle_stratifiedkfold(): @@ -549,96 +515,6 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 assert mean_score > 0.80 -def test_stratified_group_kfold_trivial(): - sgkf = StratifiedGroupKFold(n_splits=3) - # Trivial example - groups with the same distribution - y = np.array([1] * 6 + [0] * 12) - X = np.ones_like(y).reshape(-1, 1) - groups = np.asarray((1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)) - distr = np.bincount(y) / len(y) - test_sizes = [] - for train, test in sgkf.split(X, y, groups): - # check group constraint - assert np.intersect1d(groups[train], groups[test]).size == 0 - # check y distribution - assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) - assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) - test_sizes.append(len(test)) - assert np.ptp(test_sizes) <= 1 - - -def test_stratified_group_kfold_approximate(): - # Not perfect stratification (even though it is possible) because of - # iteration over groups - sgkf = StratifiedGroupKFold(n_splits=3) - y = np.array([1] * 6 + [0] * 12) - X = np.ones_like(y).reshape(-1, 1) - groups = np.array([1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]) - expected = np.asarray([[0.833, 0.166], [0.666, 0.333], [0.5, 0.5]]) - test_sizes = [] - for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): - # check group constraint - assert np.intersect1d(groups[train], groups[test]).size == 0 - split_dist = np.bincount(y[test]) / len(test) - assert_allclose(split_dist, expect_dist, atol=0.001) - test_sizes.append(len(test)) - assert np.ptp(test_sizes) <= 1 - - -@pytest.mark.parametrize('y, groups, expected', - [(np.array([0] * 6 + [1] * 6), - np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]), - np.asarray([[.5, .5], - [.5, .5], - [.5, .5]])), - (np.array([0] * 9 + [1] * 3), - np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6]), - np.asarray([[.75, .25], - [.75, .25], - [.75, .25]]))]) -def test_stratified_group_kfold_homogeneous_groups(y, groups, expected): - sgkf = StratifiedGroupKFold(n_splits=3) - X = np.ones_like(y).reshape(-1, 1) - for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): - # check group constraint - assert np.intersect1d(groups[train], groups[test]).size == 0 - split_dist = np.bincount(y[test]) / len(test) - assert_allclose(split_dist, expect_dist, atol=0.001) - - -@pytest.mark.parametrize('cls_distr', - [(0.4, 0.6), - (0.3, 0.7), - (0.2, 0.8), - (0.8, 0.2)]) -@pytest.mark.parametrize('n_groups', [5, 30, 70]) -def test_stratified_group_kfold_against_group_kfold(cls_distr, n_groups): - # Check that given sufficient amount of samples StratifiedGroupKFold - # produces better stratified folds than regular GroupKFold - n_splits = 5 - sgkf = StratifiedGroupKFold(n_splits=n_splits) - gkf = GroupKFold(n_splits=n_splits) - rng = np.random.RandomState(0) - n_points = 1000 - y = rng.choice(2, size=n_points, p=cls_distr) - X = np.ones_like(y).reshape(-1, 1) - g = rng.choice(n_groups, n_points) - sgkf_folds = sgkf.split(X, y, groups=g) - gkf_folds = gkf.split(X, y, groups=g) - sgkf_entr = 0 - gkf_entr = 0 - for (sgkf_train, sgkf_test), (_, gkf_test) in zip(sgkf_folds, gkf_folds): - # check group constraint - assert np.intersect1d(g[sgkf_train], g[sgkf_test]).size == 
0 - sgkf_distr = np.bincount(y[sgkf_test]) / len(sgkf_test) - gkf_distr = np.bincount(y[gkf_test]) / len(gkf_test) - sgkf_entr += stats.entropy(sgkf_distr, qk=cls_distr) - gkf_entr += stats.entropy(gkf_distr, qk=cls_distr) - sgkf_entr /= n_splits - gkf_entr /= n_splits - assert sgkf_entr <= gkf_entr - - def test_shuffle_split(): ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X) ss2 = ShuffleSplit(test_size=2, random_state=0).split(X) @@ -751,10 +627,10 @@ def test_stratified_shuffle_split_iter(): assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / + return_inverse=True)[1]) / float(len(y[train]))) p_test = (np.bincount(np.unique(y[test], - return_inverse=True)[1]) / + return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert len(train) + len(test) == y.size @@ -941,7 +817,7 @@ def test_leave_one_p_group_out(): assert repr(lpgo_1) == 'LeavePGroupsOut(n_groups=1)' assert repr(lpgo_2) == 'LeavePGroupsOut(n_groups=2)' assert (repr(LeavePGroupsOut(n_groups=3)) == - 'LeavePGroupsOut(n_groups=3)') + 'LeavePGroupsOut(n_groups=3)') for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1), (lpgo_2, 2))): @@ -983,10 +859,10 @@ def test_leave_one_p_group_out(): lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) msg = "The 'groups' parameter should not be None." - with pytest.raises(ValueError, match=msg): - logo.get_n_splits(None, None, None) - with pytest.raises(ValueError, match=msg): - lpgo_1.get_n_splits(None, None, None) + assert_raise_message(ValueError, msg, + logo.get_n_splits, None, None, None) + assert_raise_message(ValueError, msg, + lpgo_1.get_n_splits, None, None, None) def test_leave_group_out_changing_groups(): @@ -1007,8 +883,8 @@ def test_leave_group_out_changing_groups(): # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3 assert ( - 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, - groups=groups)) + 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, + groups=groups)) # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups) assert 3 == LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups) @@ -1016,37 +892,27 @@ def test_leave_group_out_changing_groups(): def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): X = y = groups = np.ones(0) - msg = re.escape("Found array with 0 sample(s)") - with pytest.raises(ValueError, match=msg): - next(LeaveOneGroupOut().split(X, y, groups)) - + assert_raise_message(ValueError, "Found array with 0 sample(s)", next, + LeaveOneGroupOut().split(X, y, groups)) X = y = groups = np.ones(1) - msg = re.escape( - f"The groups parameter contains fewer than 2 unique groups ({groups})." - f" LeaveOneGroupOut expects at least 2." - ) - with pytest.raises(ValueError, match=msg): - next(LeaveOneGroupOut().split(X, y, groups)) - + msg = ("The groups parameter contains fewer than 2 unique groups ({}). " + "LeaveOneGroupOut expects at least 2.").format(groups) + assert_raise_message(ValueError, msg, next, + LeaveOneGroupOut().split(X, y, groups)) X = y = groups = np.ones(1) - msg = re.escape( - f"The groups parameter contains fewer than (or equal to) n_groups " - f"(3) numbers of unique groups ({groups}). 
LeavePGroupsOut expects " - f"that at least n_groups + 1 (4) unique groups " - f"be present" - ) - with pytest.raises(ValueError, match=msg): - next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) - + msg = ("The groups parameter contains fewer than (or equal to) n_groups " + "(3) numbers of unique groups ({}). LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present").format(groups) + assert_raise_message(ValueError, msg, next, + LeavePGroupsOut(n_groups=3).split(X, y, groups)) X = y = groups = np.arange(3) - msg = re.escape( - f"The groups parameter contains fewer than (or equal to) n_groups " - f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " - f"that at least n_groups + 1 (4) unique groups " - f"be present" - ) - with pytest.raises(ValueError, match=msg): - next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + msg = ("The groups parameter contains fewer than (or equal to) n_groups " + "(3) numbers of unique groups ({}). LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present").format(groups) + assert_raise_message(ValueError, msg, next, + LeavePGroupsOut(n_groups=3).split(X, y, groups)) @ignore_warnings @@ -1435,8 +1301,7 @@ def test_cv_iterable_wrapper(): "successive calls to split should yield different results") -@pytest.mark.parametrize('kfold', [GroupKFold, StratifiedGroupKFold]) -def test_group_kfold(kfold): +def test_group_kfold(): rng = np.random.RandomState(0) # Parameters of the test @@ -1455,7 +1320,7 @@ def test_group_kfold(kfold): len(np.unique(groups)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) - lkf = kfold(n_splits=n_splits) + lkf = GroupKFold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i @@ -1463,7 +1328,7 @@ def test_group_kfold(kfold): assert len(folds) == len(groups) for i in np.unique(folds): assert (tolerance >= - abs(sum(folds == i) - ideal_n_groups_per_fold)) + abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold for group in np.unique(groups): @@ -1500,7 +1365,7 @@ def test_group_kfold(kfold): assert len(folds) == len(groups) for i in np.unique(folds): assert (tolerance >= - abs(sum(folds == i) - ideal_n_groups_per_fold)) + abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold with warnings.catch_warnings(): @@ -1524,8 +1389,8 @@ def test_group_kfold(kfold): groups = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(groups)) with pytest.raises( - ValueError, - match="Cannot have number of splits.*greater" + ValueError, + match="Cannot have number of splits.*greater" ): next(GroupKFold(n_splits=3).split(X, y, groups)) @@ -1535,8 +1400,8 @@ def test_time_series_cv(): # Should fail if there are more folds than samples with pytest.raises( - ValueError, - match="Cannot have number of folds.*greater" + ValueError, + match="Cannot have number of folds.*greater" ): next(TimeSeriesSplit(n_splits=7).split(X)) @@ -1695,7 +1560,7 @@ def test_nested_cv(): groups = rng.randint(0, 5, 15) cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(n_splits=3), - StratifiedKFold(), StratifiedGroupKFold(), + StratifiedKFold(), StratifiedShuffleSplit(n_splits=3, random_state=0)] for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): @@ -1726,7 +1591,7 @@ def test_shuffle_split_empty_trainset(CVSplitter): with pytest.raises( ValueError, match='With n_samples=1, test_size=0.99 and train_size=None, ' - 'the resulting 
train set will be empty'): + 'the resulting train set will be empty'): next(cv.split(X, y, groups=[1])) @@ -1735,14 +1600,14 @@ def test_train_test_split_empty_trainset(): with pytest.raises( ValueError, match='With n_samples=1, test_size=0.99 and train_size=None, ' - 'the resulting train set will be empty'): + 'the resulting train set will be empty'): train_test_split(X, test_size=.99) X = [[1], [1], [1]] # 3 samples, ask for more than 2 thirds with pytest.raises( ValueError, match='With n_samples=3, test_size=0.67 and train_size=None, ' - 'the resulting train set will be empty'): + 'the resulting train set will be empty'): train_test_split(X, test_size=.67) @@ -1766,8 +1631,7 @@ def test_leave_p_out_empty_trainset(): next(cv.split(X, y, groups=[1, 2])) -@pytest.mark.parametrize('Klass', - (KFold, StratifiedKFold, StratifiedGroupKFold)) +@pytest.mark.parametrize('Klass', (KFold, StratifiedKFold)) def test_random_state_shuffle_false(Klass): # passing a non-default random_state when shuffle=False makes no sense with pytest.raises(ValueError, @@ -1775,13 +1639,274 @@ def test_random_state_shuffle_false(Klass): Klass(3, shuffle=False, random_state=0) +def test_group_time_series_fail_groups_are_none(): + # The GroupTimeSeriesSplit with no group should raise an Error + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] + + # Should fail if the 'groups' is None + with pytest.raises( + ValueError, + match="The 'groups' parameter should not be None"): + next(GroupTimeSeriesSplit(n_splits=7).split(X)) + + +def test_group_time_series_ordering_and_group_preserved(): + # With this test we check that we are only evaluating + # unseen groups in the future + groups = np.array(['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', + 'd', 'd', 'd']) + n_samples = len(groups) + n_splits = 3 + + X = y = np.ones(n_samples) + # Fake array of time like + time_stamps = X * np.arange(n_samples) + + gtf = GroupTimeSeriesSplit(n_splits=n_splits) + + # We check two things here: + # 1. Elements of a group in the evaluation split should not be + # in the training split + # 2. 
Elements of the training split should be in the past + splits = gtf.split(X, y, groups) + + # Get all the other entries for the groups found in test + for (train, test) in splits: + # verify that they are not in the test set + assert len(np.intersect1d(groups[train], groups[test])) == 0 + # All the elements in the train set should be in past of the + # elements of the test set + for e in time_stamps[train]: + assert (e < time_stamps[test]).all() + + +def test_group_time_series_more_folds_than_group(): + # Should fail if there are more folds than groups + groups = np.array([1, 1, 1, 2, 2]) + X = y = np.ones(len(groups)) + with pytest.raises( + ValueError, + match="Cannot have number of folds=4 greater" + " than the number of groups=2"): + next(GroupTimeSeriesSplit(n_splits=3).split(X, y, groups)) + + +def _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size): + for (train, test), (check_train, check_test) in zip(splits, check_splits): + train_groups = _get_unique_groups(train, groups) + check_train_groups = _get_unique_groups(check_train, groups) + assert_array_equal(test, check_test) + assert len(check_train_groups) <= max_train_size + suffix_start = max(len(train_groups) - max_train_size, 0) + print(suffix_start) + assert_array_equal(check_train_groups, train_groups[suffix_start:]) + + +def _get_unique_groups(index_set, groups): + unique_groups = set() + for i in index_set: + unique_groups.add(groups[i]) + result = list(unique_groups) + result.sort() + return result + + +def test_group_time_series_max_train_size(): + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', '5', '5', '6', '7', '7']) + # X = np.zeros((6, 1)) + X = y = np.ones(len(groups)) + splits = GroupTimeSeriesSplit(n_splits=3).split(X, y, groups) + check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=3).split(X, y, groups) + _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=3) + + # Test for the case where the size of a fold is greater than max_train_size + check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=2).split(X, y, groups) + _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=2) + + # Test for the case where the size of each fold is less than max_train_size + check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=5).split(X, y, groups) + _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=5) + + +def test_group_time_series_non_overlap_group(): + groups = np.array(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', + 'c', 'c', 'c', 'c', 'd', 'd', 'd']) + gtss = GroupTimeSeriesSplit(n_splits=3) + splits = gtss.split(groups, groups=groups) + train, test = next(splits) + assert_array_equal(train, np.array([0, 1, 2, 3, 4, 5])) + assert_array_equal(test, np.array([6, 7, 8, 9, 10])) + assert_array_equal(groups[train], np.array(['a', 'a', 'a', + 'a', 'a', 'a'])) + assert_array_equal(groups[test], np.array(['b', 'b', 'b', 'b', 'b'])) + + train, test = next(splits) + assert_array_equal(train, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) + assert_array_equal(test, np.array([11, 12, 13, 14])) + assert_array_equal(groups[train], np.array(['a', 'a', 'a', 'a', + 'a', 'a', + 'b', 'b', 'b', 'b', 'b'])) + assert_array_equal(groups[test], np.array(['c', 'c', 'c', 'c'])) + + train, test = next(splits) + assert_array_equal(train, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14])) + assert_array_equal(test, np.array([15, 16, 17])) + 
assert_array_equal(groups[train], np.array(['a', 'a', 'a', + 'a', 'a', 'a', + 'b', 'b', 'b', 'b', 'b', + 'c', 'c', 'c', 'c'])) + assert_array_equal(groups[test], ['d', 'd', 'd']) + + +def test_group_time_series_non_continuous(): + groups = np.array(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', + 'c', 'c', 'c', 'c', 'a', 'd', 'd']) + X = y = np.ones(len(groups)) + with pytest.raises( + ValueError, + match="The groups should be continuous." + " Found a non-continuous group at" + " index=15"): + next(GroupTimeSeriesSplit(n_splits=3).split(X, y, groups)) + + +def test_group_time_series_cv(): + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', '5', '5', '6', '7', '7']) + X = y = np.ones(len(groups)) + + tscv = GroupTimeSeriesSplit(2) + + # Manually check that Time Series CV preserves the data + # ordering on toy datasets + # Remove the last group ['7', '7'] + splits = tscv.split(X[:-2], y[:-2], groups[:-2]) + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [6, 7, 8, 9, 10]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + assert_array_equal(test, [11, 12, 13]) + + splits = GroupTimeSeriesSplit(2).split(X, y, groups) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7]) + assert_array_equal(test, [8, 9, 10, 11, 12]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) + assert_array_equal(test, [13, 14, 15]) + + # Check get_n_splits returns the correct number of splits + splits = TimeSeriesSplit(2).split(X) + n_splits_actual = len(list(splits)) + assert n_splits_actual == tscv.get_n_splits() + assert n_splits_actual == 2 + + +def test_group_time_series_test_size(): + groups = np.array(['1','1','1','2','2','2','3','3','4','4','4','5','5','6','7','7','8','9','10','10']) + X = y = np.ones(len(groups)) + + # Test alone + splits = GroupTimeSeriesSplit(n_splits=3, test_size=3).split(X, y, groups) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) # group: [0] + assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) # group: [1, 2, 3] + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) # group: [0, 1, 2, 3] + assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) # group: [0, 1, 2, 3, 4, 5, 6] + assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + + # Test with max_train_size + splits = GroupTimeSeriesSplit(n_splits=2, test_size=2, + max_train_size=4).split(X, y, groups) + + train, test = next(splits) + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) # group: [2, 3, 4, 5] + assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + + train, test = next(splits) + assert_array_equal(train, [11, 12, 13, 14, 15, 16]) # group: [4, 5, 6, 7] + assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + + # Should fail with not enough data points for configuration + with pytest.raises(ValueError, match="Too many splits.*with test_size"): + splits = GroupTimeSeriesSplit(n_splits=5, test_size=2).split(X, y, groups) + next(splits) + + +def test_group_time_series_gap(): + groups = np.array(['1','1','1','2','2','2','3','3','4','4','4','5','5','6','7','7','8','9','10','10']) + X = y = np.ones(len(groups)) + + # Test alone + splits = GroupTimeSeriesSplit(n_splits=2, gap=2).split(X, y, groups) + + train, test 
= next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] + assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) # group: [0, 1, 2, 3, 4] + assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + + # Test with max_train_size + splits = GroupTimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X, y, groups) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] + assert_array_equal(test, [11, 12, 13]) # group: [4, 5] + + train, test = next(splits) + assert_array_equal(train, [6, 7, 8, 9, 10]) # group: [2, 3] + assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + + train, test = next(splits) + assert_array_equal(train, [11, 12, 13]) # group: [4, 5] + assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + + # Test with test_size + splits = GroupTimeSeriesSplit(n_splits=2, gap=2, + max_train_size=4, test_size=2).split(X, y, groups) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) # group: [0, 1, 2, 3] + assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + + train, test = next(splits) + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) # group: [2, 3, 4, 5] + assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + + # Test with additional test_size + splits = GroupTimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X, y, groups) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] + assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) # group: [0, 1, 2, 3, 4] + assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + + # Verify proper error is thrown + with pytest.raises(ValueError, match="Too many splits.*and gap"): + splits = GroupTimeSeriesSplit(n_splits=4, gap=2).split(X, y, groups) + next(splits) + + @pytest.mark.parametrize('cv, expected', [ (KFold(), True), (KFold(shuffle=True, random_state=123), True), (StratifiedKFold(), True), (StratifiedKFold(shuffle=True, random_state=123), True), - (StratifiedGroupKFold(shuffle=True, random_state=123), True), - (StratifiedGroupKFold(), True), (RepeatedKFold(random_state=123), True), (RepeatedStratifiedKFold(random_state=123), True), (ShuffleSplit(random_state=123), True), @@ -1793,6 +1918,7 @@ def test_random_state_shuffle_false(Klass): (LeaveOneGroupOut(), True), (LeavePGroupsOut(n_groups=2), True), (LeavePOut(p=2), True), + (KFold(shuffle=True, random_state=None), False), (KFold(shuffle=True, random_state=None), False), (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), @@ -1809,6 +1935,9 @@ def test_random_state_shuffle_false(Klass): (GroupShuffleSplit(random_state=np.random.RandomState(0)), False), (StratifiedShuffleSplit(random_state=None), False), (StratifiedShuffleSplit(random_state=np.random.RandomState(0)), False), + (GroupTimeSeriesSplit(), True), + (GroupTimeSeriesSplit(n_splits=3), True), + (GroupTimeSeriesSplit(n_splits=3, max_train_size=3), True), ]) def test_yields_constant_splits(cv, expected): assert _yields_constant_splits(cv) == expected From ca88852daad59b191940775340e2493ffad2cf17 Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sat, 24 Apr 2021 18:07:10 -0400 Subject: [PATCH 02/11] Fix Linting --- sklearn/model_selection/_split.py | 12 ++--- sklearn/model_selection/tests/test_split.py | 57 
+++++++++++++-------- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 2979a33b61785..939dcf02f04df 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2405,24 +2405,24 @@ def split(self, X, y=None, groups=None): if n_groups - gap - (group_test_size * n_splits) <= 0: raise ValueError( (f"Too many splits={n_splits} for number of groups" - f"={n_groups} with test_size={group_test_size} and gap={gap}.")) - + f"={n_groups} with test_size={group_test_size} and gap={gap}.")) + for group_test_start in range(n_groups - n_splits * group_test_size, n_groups, group_test_size): train_array = [] test_array = [] train_group_idxs = unique_groups[:group_test_start] train_end = train_group_idxs.size - # handle gap: remove gap amount of groups from the end of + # handle gap: remove gap amount of groups from the end of # train_group_idxs if gap: train_group_idxs = train_group_idxs[:train_end - gap] train_end -= gap - # handle max_train_size: remove max_train_size amount of group + # handle max_train_size: remove max_train_size amount of group # from the beginning of train_group_idxs if max_train_size and max_train_size < train_end: - train_group_idxs = train_group_idxs[train_end - - max_train_size:train_end] + train_group_idxs = train_group_idxs[ + train_end - max_train_size:train_end] for train_group_idx in train_group_idxs: train_array_tmp = group_dict[train_group_idx] train_array = np.sort(np.unique( diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 42744e6afb6ed..6b24005777c3d 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1691,7 +1691,8 @@ def test_group_time_series_more_folds_than_group(): next(GroupTimeSeriesSplit(n_splits=3).split(X, y, groups)) -def _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size): +def _check_group_time_series_max_train_size(splits, check_splits, groups, + max_train_size): for (train, test), (check_train, check_test) in zip(splits, check_splits): train_groups = _get_unique_groups(train, groups) check_train_groups = _get_unique_groups(check_train, groups) @@ -1712,7 +1713,8 @@ def _get_unique_groups(index_set, groups): def test_group_time_series_max_train_size(): - groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', '5', '5', '6', '7', '7']) + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', + '5', '5', '6', '7', '7']) # X = np.zeros((6, 1)) X = y = np.ones(len(groups)) splits = GroupTimeSeriesSplit(n_splits=3).split(X, y, groups) @@ -1772,7 +1774,8 @@ def test_group_time_series_non_continuous(): def test_group_time_series_cv(): - groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', '5', '5', '6', '7', '7']) + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', + '5', '5', '6', '7', '7']) X = y = np.ones(len(groups)) tscv = GroupTimeSeriesSplit(2) @@ -1807,44 +1810,50 @@ def test_group_time_series_cv(): def test_group_time_series_test_size(): - groups = np.array(['1','1','1','2','2','2','3','3','4','4','4','5','5','6','7','7','8','9','10','10']) + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', + '5', '5', '6', '7', '7', '8', '9', '10', '10']) X = y = np.ones(len(groups)) # Test alone splits = GroupTimeSeriesSplit(n_splits=3, test_size=3).split(X, y, groups) train, test = next(splits) 
- assert_array_equal(train, [0, 1, 2]) # group: [0] - assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) # group: [1, 2, 3] + assert_array_equal(train, [0, 1, 2]) # group: [0] + assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) # group: [1, 2, 3] train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) # group: [0, 1, 2, 3] - assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10]) # group: [0, 1, 2, 3] + assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) # group: [0, 1, 2, 3, 4, 5, 6] - assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15]) # group: [0, 1, 2, 3, 4, 5, 6] + assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] # Test with max_train_size splits = GroupTimeSeriesSplit(n_splits=2, test_size=2, max_train_size=4).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) # group: [2, 3, 4, 5] + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, + 13]) # group: [2, 3, 4, 5] assert_array_equal(test, [14, 15, 16]) # group: [6, 7] train, test = next(splits) - assert_array_equal(train, [11, 12, 13, 14, 15, 16]) # group: [4, 5, 6, 7] - assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + assert_array_equal(train, [11, 12, 13, 14, 15, 16]) # group: [4, 5, 6, 7] + assert_array_equal(test, [17, 18, 19]) # group: [8, 9] # Should fail with not enough data points for configuration with pytest.raises(ValueError, match="Too many splits.*with test_size"): - splits = GroupTimeSeriesSplit(n_splits=5, test_size=2).split(X, y, groups) + splits = GroupTimeSeriesSplit(n_splits=5, test_size=2).split(X, y, + groups) next(splits) def test_group_time_series_gap(): - groups = np.array(['1','1','1','2','2','2','3','3','4','4','4','5','5','6','7','7','8','9','10','10']) + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', + '5', '5', '6', '7', '7', '8', '9', '10', '10']) X = y = np.ones(len(groups)) # Test alone @@ -1855,11 +1864,13 @@ def test_group_time_series_gap(): assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) # group: [0, 1, 2, 3, 4] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12]) # group: [0, 1, 2, 3, 4] assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] # Test with max_train_size - splits = GroupTimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X, y, groups) + splits = GroupTimeSeriesSplit(n_splits=3, gap=2, + max_train_size=2).split(X, y, groups) train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] @@ -1878,22 +1889,26 @@ def test_group_time_series_gap(): max_train_size=4, test_size=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) # group: [0, 1, 2, 3] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10]) # group: [0, 1, 2, 3] assert_array_equal(test, [14, 15, 16]) # group: [6, 7] train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) # group: [2, 3, 4, 5] + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, + 13]) # group: [2, 3, 4, 5] assert_array_equal(test, [17, 18, 19]) # group: [8, 9] 
# Test with additional test_size - splits = GroupTimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X, y, groups) + splits = GroupTimeSeriesSplit(n_splits=2, gap=2, + test_size=3).split(X, y, groups) train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) # group: [0, 1, 2, 3, 4] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12]) # group: [0, 1, 2, 3, 4] assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] # Verify proper error is thrown From 85da7a34ea1feec0fb571eaee0f6317ace9e9bdb Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 00:44:39 -0400 Subject: [PATCH 03/11] Fix Linting --- sklearn/model_selection/_split.py | 9 +- sklearn/model_selection/tests/test_split.py | 115 ++++++++++++-------- 2 files changed, 72 insertions(+), 52 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 939dcf02f04df..8a110216e71e2 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2282,15 +2282,15 @@ class GroupTimeSeriesSplit(_BaseKFold): max_train_size : int, default=None Maximum number of groups for a single training set. - + test_size : int, default=None Used to limit the number of groups in the test set. Defaults to ``n_samples // (n_splits + 1)``, which is the maximum allowed value with ``gap=0``. gap : int, default=0 - Number of groups in samples to exclude from the end of each train set before - the test set. + Number of groups in samples to exclude from the end of each train set + before the test set. Examples -------- @@ -2405,7 +2405,8 @@ def split(self, X, y=None, groups=None): if n_groups - gap - (group_test_size * n_splits) <= 0: raise ValueError( (f"Too many splits={n_splits} for number of groups" - f"={n_groups} with test_size={group_test_size} and gap={gap}.")) + f"={n_groups} with test_size={group_test_size} \ + and gap={gap}.")) for group_test_start in range(n_groups - n_splits * group_test_size, n_groups, group_test_size): diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 6b24005777c3d..db29482091412 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1691,7 +1691,7 @@ def test_group_time_series_more_folds_than_group(): next(GroupTimeSeriesSplit(n_splits=3).split(X, y, groups)) -def _check_group_time_series_max_train_size(splits, check_splits, groups, +def _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size): for (train, test), (check_train, check_test) in zip(splits, check_splits): train_groups = _get_unique_groups(train, groups) @@ -1713,21 +1713,39 @@ def _get_unique_groups(index_set, groups): def test_group_time_series_max_train_size(): - groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', '5', '5', '6', '7', '7']) # X = np.zeros((6, 1)) X = y = np.ones(len(groups)) splits = GroupTimeSeriesSplit(n_splits=3).split(X, y, groups) - check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=3).split(X, y, groups) - _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=3) + check_splits = GroupTimeSeriesSplit( + n_splits=3, + max_train_size=3).split(X, y, groups) + 
_check_group_time_series_max_train_size( + splits, + check_splits, + groups, + max_train_size=3) # Test for the case where the size of a fold is greater than max_train_size - check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=2).split(X, y, groups) - _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=2) + check_splits = GroupTimeSeriesSplit( + n_splits=3, + max_train_size=2).split(X, y, groups) + _check_group_time_series_max_train_size( + splits, + check_splits, + groups, + max_train_size=2) # Test for the case where the size of each fold is less than max_train_size - check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=5).split(X, y, groups) - _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=5) + check_splits = GroupTimeSeriesSplit( + n_splits=3, + max_train_size=5).split(X, y, groups) + _check_group_time_series_max_train_size( + splits, + check_splits, + groups, + max_train_size=5) def test_group_time_series_non_overlap_group(): @@ -1818,31 +1836,31 @@ def test_group_time_series_test_size(): splits = GroupTimeSeriesSplit(n_splits=3, test_size=3).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2]) # group: [0] - assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) # group: [1, 2, 3] + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10]) # group: [0, 1, 2, 3] - assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + assert_array_equal(test, [11, 12, 13, 14, 15]) train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15]) # group: [0, 1, 2, 3, 4, 5, 6] - assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + 14, 15]) + assert_array_equal(test, [16, 17, 18, 19]) # Test with max_train_size - splits = GroupTimeSeriesSplit(n_splits=2, test_size=2, - max_train_size=4).split(X, y, groups) + splits = GroupTimeSeriesSplit( + n_splits=2, + test_size=2, + max_train_size=4).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, - 13]) # group: [2, 3, 4, 5] - assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) + assert_array_equal(test, [14, 15, 16]) train, test = next(splits) - assert_array_equal(train, [11, 12, 13, 14, 15, 16]) # group: [4, 5, 6, 7] - assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + assert_array_equal(train, [11, 12, 13, 14, 15, 16]) + assert_array_equal(test, [17, 18, 19]) # Should fail with not enough data points for configuration with pytest.raises(ValueError, match="Too many splits.*with test_size"): @@ -1860,56 +1878,57 @@ def test_group_time_series_gap(): splits = GroupTimeSeriesSplit(n_splits=2, gap=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] - assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [11, 12, 13, 14, 15]) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12]) # group: [0, 1, 2, 3, 4] - assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) + assert_array_equal(test, [16, 17, 18, 19]) # Test 
with max_train_size splits = GroupTimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] - assert_array_equal(test, [11, 12, 13]) # group: [4, 5] + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [11, 12, 13]) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10]) # group: [2, 3] - assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + assert_array_equal(train, [6, 7, 8, 9, 10]) + assert_array_equal(test, [14, 15, 16]) train, test = next(splits) - assert_array_equal(train, [11, 12, 13]) # group: [4, 5] - assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + assert_array_equal(train, [11, 12, 13]) + assert_array_equal(test, [17, 18, 19]) # Test with test_size - splits = GroupTimeSeriesSplit(n_splits=2, gap=2, - max_train_size=4, test_size=2).split(X, y, groups) + splits = GroupTimeSeriesSplit( + n_splits=2, + gap=2, + max_train_size=4, + test_size=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10]) # group: [0, 1, 2, 3] - assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + assert_array_equal(test, [14, 15, 16]) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, - 13]) # group: [2, 3, 4, 5] - assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) + assert_array_equal(test, [17, 18, 19]) # Test with additional test_size - splits = GroupTimeSeriesSplit(n_splits=2, gap=2, - test_size=3).split(X, y, groups) + splits = GroupTimeSeriesSplit( + n_splits=2, + gap=2, + test_size=3).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] - assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [11, 12, 13, 14, 15]) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12]) # group: [0, 1, 2, 3, 4] - assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) + assert_array_equal(test, [16, 17, 18, 19]) # Verify proper error is thrown with pytest.raises(ValueError, match="Too many splits.*and gap"): From e6d76036607f110b25b44e71bbde48463f059bf9 Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 01:01:36 -0400 Subject: [PATCH 04/11] Remove unused library --- sklearn/model_selection/tests/test_split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index db29482091412..c9634192503ac 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -9,7 +9,6 @@ from itertools import combinations_with_replacement from itertools import permutations -from sklearn.utils._testing import assert_allclose, assert_raises_regexp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message From 9ad966f4813dffd941729adaad394fd01353c89e Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 01:50:38 -0400 Subject: [PATCH 05/11] Fix import error --- sklearn/model_selection/tests/test_split.py | 1 + 1 file changed, 1 insertion(+) diff 
--git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index c9634192503ac..7b7591ab8df83 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -9,6 +9,7 @@ from itertools import combinations_with_replacement from itertools import permutations +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message From 23cada5868da8a012d4c5e7b36124f631cbcc09f Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 01:56:00 -0400 Subject: [PATCH 06/11] Revert "Fix Linting" This reverts commit 85da7a34ea1feec0fb571eaee0f6317ace9e9bdb. --- sklearn/model_selection/_split.py | 9 +- sklearn/model_selection/tests/test_split.py | 115 ++++++++------------ 2 files changed, 52 insertions(+), 72 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 8a110216e71e2..939dcf02f04df 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2282,15 +2282,15 @@ class GroupTimeSeriesSplit(_BaseKFold): max_train_size : int, default=None Maximum number of groups for a single training set. - + test_size : int, default=None Used to limit the number of groups in the test set. Defaults to ``n_samples // (n_splits + 1)``, which is the maximum allowed value with ``gap=0``. gap : int, default=0 - Number of groups in samples to exclude from the end of each train set - before the test set. + Number of groups in samples to exclude from the end of each train set before + the test set. Examples -------- @@ -2405,8 +2405,7 @@ def split(self, X, y=None, groups=None): if n_groups - gap - (group_test_size * n_splits) <= 0: raise ValueError( (f"Too many splits={n_splits} for number of groups" - f"={n_groups} with test_size={group_test_size} \ - and gap={gap}.")) + f"={n_groups} with test_size={group_test_size} and gap={gap}.")) for group_test_start in range(n_groups - n_splits * group_test_size, n_groups, group_test_size): diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 7b7591ab8df83..3e7952dc62289 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1691,7 +1691,7 @@ def test_group_time_series_more_folds_than_group(): next(GroupTimeSeriesSplit(n_splits=3).split(X, y, groups)) -def _check_group_time_series_max_train_size(splits, check_splits, groups, +def _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size): for (train, test), (check_train, check_test) in zip(splits, check_splits): train_groups = _get_unique_groups(train, groups) @@ -1713,39 +1713,21 @@ def _get_unique_groups(index_set, groups): def test_group_time_series_max_train_size(): - groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', '5', '5', '6', '7', '7']) # X = np.zeros((6, 1)) X = y = np.ones(len(groups)) splits = GroupTimeSeriesSplit(n_splits=3).split(X, y, groups) - check_splits = GroupTimeSeriesSplit( - n_splits=3, - max_train_size=3).split(X, y, groups) - _check_group_time_series_max_train_size( - splits, - check_splits, - groups, - max_train_size=3) + check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=3).split(X, y, groups) + 
_check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=3) # Test for the case where the size of a fold is greater than max_train_size - check_splits = GroupTimeSeriesSplit( - n_splits=3, - max_train_size=2).split(X, y, groups) - _check_group_time_series_max_train_size( - splits, - check_splits, - groups, - max_train_size=2) + check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=2).split(X, y, groups) + _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=2) # Test for the case where the size of each fold is less than max_train_size - check_splits = GroupTimeSeriesSplit( - n_splits=3, - max_train_size=5).split(X, y, groups) - _check_group_time_series_max_train_size( - splits, - check_splits, - groups, - max_train_size=5) + check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=5).split(X, y, groups) + _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=5) def test_group_time_series_non_overlap_group(): @@ -1836,31 +1818,31 @@ def test_group_time_series_test_size(): splits = GroupTimeSeriesSplit(n_splits=3, test_size=3).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2]) - assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) + assert_array_equal(train, [0, 1, 2]) # group: [0] + assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) # group: [1, 2, 3] train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - assert_array_equal(test, [11, 12, 13, 14, 15]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10]) # group: [0, 1, 2, 3] + assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15]) - assert_array_equal(test, [16, 17, 18, 19]) + 14, 15]) # group: [0, 1, 2, 3, 4, 5, 6] + assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] # Test with max_train_size - splits = GroupTimeSeriesSplit( - n_splits=2, - test_size=2, - max_train_size=4).split(X, y, groups) + splits = GroupTimeSeriesSplit(n_splits=2, test_size=2, + max_train_size=4).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) - assert_array_equal(test, [14, 15, 16]) + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, + 13]) # group: [2, 3, 4, 5] + assert_array_equal(test, [14, 15, 16]) # group: [6, 7] train, test = next(splits) - assert_array_equal(train, [11, 12, 13, 14, 15, 16]) - assert_array_equal(test, [17, 18, 19]) + assert_array_equal(train, [11, 12, 13, 14, 15, 16]) # group: [4, 5, 6, 7] + assert_array_equal(test, [17, 18, 19]) # group: [8, 9] # Should fail with not enough data points for configuration with pytest.raises(ValueError, match="Too many splits.*with test_size"): @@ -1878,57 +1860,56 @@ def test_group_time_series_gap(): splits = GroupTimeSeriesSplit(n_splits=2, gap=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) - assert_array_equal(test, [11, 12, 13, 14, 15]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] + assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) - assert_array_equal(test, [16, 17, 18, 19]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12]) # group: [0, 1, 2, 3, 4] + assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] # Test with 
max_train_size splits = GroupTimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) - assert_array_equal(test, [11, 12, 13]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] + assert_array_equal(test, [11, 12, 13]) # group: [4, 5] train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10]) - assert_array_equal(test, [14, 15, 16]) + assert_array_equal(train, [6, 7, 8, 9, 10]) # group: [2, 3] + assert_array_equal(test, [14, 15, 16]) # group: [6, 7] train, test = next(splits) - assert_array_equal(train, [11, 12, 13]) - assert_array_equal(test, [17, 18, 19]) + assert_array_equal(train, [11, 12, 13]) # group: [4, 5] + assert_array_equal(test, [17, 18, 19]) # group: [8, 9] # Test with test_size - splits = GroupTimeSeriesSplit( - n_splits=2, - gap=2, - max_train_size=4, - test_size=2).split(X, y, groups) + splits = GroupTimeSeriesSplit(n_splits=2, gap=2, + max_train_size=4, test_size=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - assert_array_equal(test, [14, 15, 16]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10]) # group: [0, 1, 2, 3] + assert_array_equal(test, [14, 15, 16]) # group: [6, 7] train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) - assert_array_equal(test, [17, 18, 19]) + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, + 13]) # group: [2, 3, 4, 5] + assert_array_equal(test, [17, 18, 19]) # group: [8, 9] # Test with additional test_size - splits = GroupTimeSeriesSplit( - n_splits=2, - gap=2, - test_size=3).split(X, y, groups) + splits = GroupTimeSeriesSplit(n_splits=2, gap=2, + test_size=3).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) - assert_array_equal(test, [11, 12, 13, 14, 15]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] + assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) - assert_array_equal(test, [16, 17, 18, 19]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12]) # group: [0, 1, 2, 3, 4] + assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] # Verify proper error is thrown with pytest.raises(ValueError, match="Too many splits.*and gap"): From 6bf77abbdea2dd6fa01d95dbd33ce7c1bd0dbda0 Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 02:10:40 -0400 Subject: [PATCH 07/11] Revert "Fix import error" This reverts commit 9ad966f4813dffd941729adaad394fd01353c89e. 
--- sklearn/model_selection/tests/test_split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 3e7952dc62289..88bdb49dbe4d9 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -9,7 +9,6 @@ from itertools import combinations_with_replacement from itertools import permutations -from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message From 0275ff160600c54e46efaf83c315af550cd3817b Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 02:23:25 -0400 Subject: [PATCH 08/11] Remove inline comment --- sklearn/model_selection/tests/test_split.py | 117 ++++++++++++-------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 88bdb49dbe4d9..c88b1becf33b7 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -9,6 +9,7 @@ from itertools import combinations_with_replacement from itertools import permutations +from sklearn.utils._testing import assert_allclose, assert_raises_regexp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message @@ -1690,7 +1691,7 @@ def test_group_time_series_more_folds_than_group(): next(GroupTimeSeriesSplit(n_splits=3).split(X, y, groups)) -def _check_group_time_series_max_train_size(splits, check_splits, groups, +def _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size): for (train, test), (check_train, check_test) in zip(splits, check_splits): train_groups = _get_unique_groups(train, groups) @@ -1712,21 +1713,39 @@ def _get_unique_groups(index_set, groups): def test_group_time_series_max_train_size(): - groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', + groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4', '4', '5', '5', '6', '7', '7']) # X = np.zeros((6, 1)) X = y = np.ones(len(groups)) splits = GroupTimeSeriesSplit(n_splits=3).split(X, y, groups) - check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=3).split(X, y, groups) - _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=3) + check_splits = GroupTimeSeriesSplit( + n_splits=3, + max_train_size=3).split(X, y, groups) + _check_group_time_series_max_train_size( + splits, + check_splits, + groups, + max_train_size=3) # Test for the case where the size of a fold is greater than max_train_size - check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=2).split(X, y, groups) - _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=2) + check_splits = GroupTimeSeriesSplit( + n_splits=3, + max_train_size=2).split(X, y, groups) + _check_group_time_series_max_train_size( + splits, + check_splits, + groups, + max_train_size=2) # Test for the case where the size of each fold is less than max_train_size - check_splits = GroupTimeSeriesSplit(n_splits=3, max_train_size=5).split(X, y, groups) - _check_group_time_series_max_train_size(splits, check_splits, groups, max_train_size=5) + check_splits = GroupTimeSeriesSplit( + n_splits=3, + max_train_size=5).split(X, y, groups) + 
_check_group_time_series_max_train_size( + splits, + check_splits, + groups, + max_train_size=5) def test_group_time_series_non_overlap_group(): @@ -1817,31 +1836,31 @@ def test_group_time_series_test_size(): splits = GroupTimeSeriesSplit(n_splits=3, test_size=3).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2]) # group: [0] - assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) # group: [1, 2, 3] + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [3, 4, 5, 6, 7, 8, 9, 10]) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10]) # group: [0, 1, 2, 3] - assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + assert_array_equal(test, [11, 12, 13, 14, 15]) train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15]) # group: [0, 1, 2, 3, 4, 5, 6] - assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + 14, 15]) + assert_array_equal(test, [16, 17, 18, 19]) # Test with max_train_size - splits = GroupTimeSeriesSplit(n_splits=2, test_size=2, - max_train_size=4).split(X, y, groups) + splits = GroupTimeSeriesSplit( + n_splits=2, + test_size=2, + max_train_size=4).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, - 13]) # group: [2, 3, 4, 5] - assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) + assert_array_equal(test, [14, 15, 16]) train, test = next(splits) - assert_array_equal(train, [11, 12, 13, 14, 15, 16]) # group: [4, 5, 6, 7] - assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + assert_array_equal(train, [11, 12, 13, 14, 15, 16]) + assert_array_equal(test, [17, 18, 19]) # Should fail with not enough data points for configuration with pytest.raises(ValueError, match="Too many splits.*with test_size"): @@ -1859,56 +1878,57 @@ def test_group_time_series_gap(): splits = GroupTimeSeriesSplit(n_splits=2, gap=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] - assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [11, 12, 13, 14, 15]) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12]) # group: [0, 1, 2, 3, 4] - assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) + assert_array_equal(test, [16, 17, 18, 19]) # Test with max_train_size splits = GroupTimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] - assert_array_equal(test, [11, 12, 13]) # group: [4, 5] + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [11, 12, 13]) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10]) # group: [2, 3] - assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + assert_array_equal(train, [6, 7, 8, 9, 10]) + assert_array_equal(test, [14, 15, 16]) train, test = next(splits) - assert_array_equal(train, [11, 12, 13]) # group: [4, 5] - assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + assert_array_equal(train, [11, 12, 13]) + assert_array_equal(test, [17, 18, 19]) # Test with test_size - splits = GroupTimeSeriesSplit(n_splits=2, gap=2, - 
max_train_size=4, test_size=2).split(X, y, groups) + splits = GroupTimeSeriesSplit( + n_splits=2, + gap=2, + max_train_size=4, + test_size=2).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10]) # group: [0, 1, 2, 3] - assert_array_equal(test, [14, 15, 16]) # group: [6, 7] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + assert_array_equal(test, [14, 15, 16]) train, test = next(splits) - assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, - 13]) # group: [2, 3, 4, 5] - assert_array_equal(test, [17, 18, 19]) # group: [8, 9] + assert_array_equal(train, [6, 7, 8, 9, 10, 11, 12, 13]) + assert_array_equal(test, [17, 18, 19]) # Test with additional test_size - splits = GroupTimeSeriesSplit(n_splits=2, gap=2, - test_size=3).split(X, y, groups) + splits = GroupTimeSeriesSplit( + n_splits=2, + gap=2, + test_size=3).split(X, y, groups) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5]) # group: [0, 1] - assert_array_equal(test, [11, 12, 13, 14, 15]) # group: [4, 5, 6] + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [11, 12, 13, 14, 15]) train, test = next(splits) - assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12]) # group: [0, 1, 2, 3, 4] - assert_array_equal(test, [16, 17, 18, 19]) # group: [7, 8, 9] + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) + assert_array_equal(test, [16, 17, 18, 19]) # Verify proper error is thrown with pytest.raises(ValueError, match="Too many splits.*and gap"): @@ -1932,7 +1952,6 @@ def test_group_time_series_gap(): (LeaveOneGroupOut(), True), (LeavePGroupsOut(n_groups=2), True), (LeavePOut(p=2), True), - (KFold(shuffle=True, random_state=None), False), (KFold(shuffle=True, random_state=None), False), (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), From acfc966ddf16fe2fd5eca6fb4cc6b935fcd93229 Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 02:26:46 -0400 Subject: [PATCH 09/11] Remove assert_raises_regexp --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index c88b1becf33b7..b43920d7e9ba0 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -9,7 +9,7 @@ from itertools import combinations_with_replacement from itertools import permutations -from sklearn.utils._testing import assert_allclose, assert_raises_regexp +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message From 841bad226d36de5a72e0bee6b2c4b8134f677fe7 Mon Sep 17 00:00:00 2001 From: Soso Song Date: Sun, 25 Apr 2021 02:43:12 -0400 Subject: [PATCH 10/11] Fix Linting --- sklearn/model_selection/_split.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 939dcf02f04df..8c20b6fd085c5 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2282,15 +2282,15 @@ class GroupTimeSeriesSplit(_BaseKFold): max_train_size : int, default=None Maximum number of groups for a single training set. - + test_size : int, default=None Used to limit the number of groups in the test set. 
Defaults to ``n_samples // (n_splits + 1)``,
         which is the maximum allowed value with ``gap=0``.
 
     gap : int, default=0
-        Number of groups in samples to exclude from the end of each train set
-        before the test set.
+        Number of groups to exclude from the end of each train set
+        before the test set.
 
     Examples
     --------
@@ -2405,7 +2405,8 @@ def split(self, X, y=None, groups=None):
         if n_groups - gap - (group_test_size * n_splits) <= 0:
             raise ValueError(
                 (f"Too many splits={n_splits} for number of groups"
-                 f"={n_groups} with test_size={group_test_size} and gap={gap}."))
+                 f"={n_groups} with test_size={group_test_size}"
+                 f" and gap={gap}."))
 
         for group_test_start in range(n_groups - n_splits * group_test_size,
                                       n_groups, group_test_size):
From a3546180a73ecf4f5faa9f4bb91c42aa6b967006 Mon Sep 17 00:00:00 2001
From: Soso Song
Date: Tue, 27 Apr 2021 19:54:05 -0400
Subject: [PATCH 11/11] Remove StratifiedGroupKFold

---
 sklearn/model_selection/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py
index f79db2a5acc17..2c28b33193e4a 100644
--- a/sklearn/model_selection/__init__.py
+++ b/sklearn/model_selection/__init__.py
@@ -14,10 +14,10 @@
 from ._split import ShuffleSplit
 from ._split import GroupShuffleSplit
 from ._split import StratifiedShuffleSplit
-from ._split import StratifiedGroupKFold
 from ._split import PredefinedSplit
 from ._split import train_test_split
 from ._split import check_cv
+from ._split import GroupTimeSeriesSplit
 
 from ._validation import cross_val_score
 from ._validation import cross_val_predict
@@ -58,7 +58,6 @@
           'RandomizedSearchCV',
           'ShuffleSplit',
           'StratifiedKFold',
-          'StratifiedGroupKFold',
           'StratifiedShuffleSplit',
           'check_cv',
           'cross_val_predict',
@@ -68,4 +67,5 @@
           'learning_curve',
           'permutation_test_score',
           'train_test_split',
-          'validation_curve']
+          'validation_curve',
+          'GroupTimeSeriesSplit']
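
For reference, a doctest-style sketch of the behaviour the tests above pin
down, assuming a scikit-learn build with this series applied (the expected
index sets are copied from ``test_group_time_series_gap``)::

    >>> import numpy as np
    >>> from sklearn.model_selection import GroupTimeSeriesSplit
    >>> groups = np.array(['1', '1', '1', '2', '2', '2', '3', '3', '4', '4',
    ...                    '4', '5', '5', '6', '7', '7', '8', '9', '10', '10'])
    >>> X = y = np.ones(len(groups))
    >>> # A gap of 2 groups is left between each train set and its test set.
    >>> gtss = GroupTimeSeriesSplit(n_splits=2, gap=2, test_size=3)
    >>> for train, test in gtss.split(X, y, groups=groups):
    ...     print("%s %s" % (train, test))
    [0 1 2 3 4 5] [11 12 13 14 15]
    [ 0  1  2  3  4  5  6  7  8  9 10 11 12] [16 17 18 19]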
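
A second, hypothetical sketch (the estimator and data below are placeholders,
not part of this diff): since the splitter follows the usual cv-iterator
protocol, it should drop into :func:`cross_val_score` with ``groups``
forwarded::

    >>> from sklearn.linear_model import Ridge
    >>> from sklearn.model_selection import cross_val_score
    >>> rng = np.random.RandomState(0)
    >>> X = rng.normal(size=(len(groups), 2))    # placeholder features
    >>> y = np.arange(len(groups), dtype=float)  # placeholder target
    >>> scores = cross_val_score(Ridge(), X, y, groups=groups,
    ...                          cv=GroupTimeSeriesSplit(n_splits=2, gap=2))
    >>> scores.shape
    (2,)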