diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index ed014cea6f2ff..61ea8c7ef4248 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -782,7 +782,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples:: >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit(n_splits=3) >>> print(tscv) - TimeSeriesSplit(max_train_size=None, n_splits=3) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None) >>> for train, test in tscv.split(X): ... print("%s %s" % (train, test)) [0 1 2] [3] diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index dd4ab30a7f2ff..2554d136c13ea 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -47,6 +47,15 @@ Changelog :mod:`sklearn.module` ..................... +:mod:`sklearn.model_selection` +.............................. + +- |Enhancement| :class:`model_selection.TimeSeriesSplit` has two new keyword + arguments `test_size` and `gap`. `test_size` allows the out-of-sample + time series length to be fixed for all folds. `gap` removes a fixed number of + samples between the train and test set on each fold. + :pr:`13204` by :user:`Kyle Kosic <kykosic>`. + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 9b2087e039f40..75a4b865fda62 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -766,6 +766,15 @@ class TimeSeriesSplit(_BaseKFold): max_train_size : int, default=None Maximum size for a single training set. + test_size : int, default=None + Used to limit the size of the test set. Defaults to + ``n_samples // (n_splits + 1)``, which is the maximum allowed value + with ``gap=0``. + + gap : int, default=0 + Number of samples to exclude from the end of each train set before + the test set. 
+ Examples -------- >>> import numpy as np @@ -774,7 +783,7 @@ class TimeSeriesSplit(_BaseKFold): >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit() >>> print(tscv) - TimeSeriesSplit(max_train_size=None, n_splits=5) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) >>> for train_index, test_index in tscv.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] @@ -784,18 +793,45 @@ class TimeSeriesSplit(_BaseKFold): TRAIN: [0 1 2] TEST: [3] TRAIN: [0 1 2 3] TEST: [4] TRAIN: [0 1 2 3 4] TEST: [5] + >>> # Fix test_size to 2 with 12 samples + >>> X = np.random.randn(12, 2) + >>> y = np.random.randint(0, 2, 12) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) + >>> for train_index, test_index in tscv.split(X): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [0 1 2 3 4 5] TEST: [6 7] + TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9] + TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11] + >>> # Add in a 2 period gap + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) + >>> for train_index, test_index in tscv.split(X): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [0 1 2 3] TEST: [6 7] + TRAIN: [0 1 2 3 4 5] TEST: [8 9] + TRAIN: [0 1 2 3 4 5 6 7] TEST: [10 11] Notes ----- The training set has size ``i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1)`` in the ``i``th split, - with a test set of size ``n_samples//(n_splits + 1)``, + with a test set of size ``n_samples//(n_splits + 1)`` by default, where ``n_samples`` is the number of samples. 
""" @_deprecate_positional_args - def __init__(self, n_splits=5, *, max_train_size=None): + def __init__(self, + n_splits=5, + *, + max_train_size=None, + test_size=None, + gap=0): super().__init__(n_splits, shuffle=False, random_state=None) self.max_train_size = max_train_size + self.test_size = test_size + self.gap = gap def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -824,21 +860,31 @@ def split(self, X, y=None, groups=None): n_samples = _num_samples(X) n_splits = self.n_splits n_folds = n_splits + 1 + gap = self.gap + test_size = self.test_size if self.test_size is not None \ + else n_samples // n_folds + + # Make sure we have enough samples for the given split parameters if n_folds > n_samples: raise ValueError( - ("Cannot have number of folds ={0} greater" - " than the number of samples: {1}.").format(n_folds, - n_samples)) + (f"Cannot have number of folds={n_folds} greater" + f" than the number of samples={n_samples}.")) + if n_samples - gap - (test_size * n_splits) <= 0: + raise ValueError( + (f"Too many splits={n_splits} for number of samples" + f"={n_samples} with test_size={test_size} and gap={gap}.")) + indices = np.arange(n_samples) - test_size = (n_samples // n_folds) - test_starts = range(test_size + n_samples % n_folds, + test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) + for test_start in test_starts: - if self.max_train_size and self.max_train_size < test_start: - yield (indices[test_start - self.max_train_size:test_start], + train_end = test_start - gap + if self.max_train_size and self.max_train_size < train_end: + yield (indices[train_end - self.max_train_size:train_end], indices[test_start:test_start + test_size]) else: - yield (indices[:test_start], + yield (indices[:train_end], indices[test_start:test_start + test_size]) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 3b984745420f1..b89571ba085dd 
100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1440,6 +1440,100 @@ def test_time_series_max_train_size(): _check_time_series_max_train_size(splits, check_splits, max_train_size=2) +def test_time_series_test_size(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=2, test_size=2, + max_train_size=4).split(X) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7]) + assert_array_equal(test, [8, 9]) + + # Should fail with not enough data points for configuration + with pytest.raises(ValueError, match="Too many splits.*with test_size"): + splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X) + next(splits) + + +def test_time_series_gap(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=2, gap=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5]) + 
assert_array_equal(test, [8, 9]) + + # Test with test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, + max_train_size=4, test_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with additional test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Verify proper error is thrown + with pytest.raises(ValueError, match="Too many splits.*and gap"): + splits = TimeSeriesSplit(n_splits=4, gap=2).split(X) + next(splits) + + def test_nested_cv(): # Test if nested cross validation works with different combinations of cv rng = np.random.RandomState(0)