From 72bd6bcd61cf6a3690be7a4a29b329c7c1204443 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 13:33:07 -0500 Subject: [PATCH 01/20] initial changes --- sklearn/model_selection/_split.py | 32 +++++++++++++++++---- sklearn/model_selection/tests/test_split.py | 8 ++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 24394f8691c7e..b9699353b10a0 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -742,6 +742,13 @@ class TimeSeriesSplit(_BaseKFold): max_train_size : int, optional Maximum size for a single training set. + test_size : int, optional + Number of samples in each test set. Defaults to ``n_samples / (n_splits + 1)``. + + gap_size : int, default=0 + Number of samples to exclude from the end of the train set before the test set. + + Examples -------- >>> import numpy as np @@ -768,12 +775,14 @@ class TimeSeriesSplit(_BaseKFold): with a test set of size ``n_samples//(n_splits + 1)``, where ``n_samples`` is the number of samples. """ - def __init__(self, n_splits='warn', max_train_size=None): + def __init__(self, n_splits='warn', max_train_size=None, test_size=None, gap_size=0): if n_splits == 'warn': warnings.warn(NSPLIT_WARNING, FutureWarning) n_splits = 3 super().__init__(n_splits, shuffle=False, random_state=None) self.max_train_size = max_train_size + self.test_size = test_size + self.gap_size = gap_size def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -802,21 +811,32 @@ def split(self, X, y=None, groups=None): n_samples = _num_samples(X) n_splits = self.n_splits n_folds = n_splits + 1 + gap_size = self.gap_size + test_size = self.test_size if self.test_size else (n_samples // n_folds) + + # Make sure we have enough samples for the given split parameters if n_folds > n_samples: raise ValueError( ("Cannot have number of folds ={0} greater" " than the number of samples: {1}.").format(n_folds, n_samples)) + if n_samples - gap_size - (test_size * n_splits) <= 0: + raise ValueError( + ("Too many splits ={0} for number of samples" + " ={1} with test_size ={2} and gap_size ={3}").format(n_splits, + n_samples, + test_size, + gap_size)) indices = np.arange(n_samples) - test_size = (n_samples // n_folds) - test_starts = range(test_size + n_samples % n_folds, + test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) + for test_start in test_starts: - if self.max_train_size and self.max_train_size < test_start: - yield (indices[test_start - self.max_train_size:test_start], + if self.max_train_size and self.max_train_size < (test_start - gap_size): + yield (indices[test_start - self.max_train_size - gap_size:test_start - gap_size], indices[test_start:test_start + test_size]) else: - yield (indices[:test_start], + yield (indices[:test_start - gap_size], indices[test_start:test_start + test_size]) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5981df285f54b..a3b7318b8eb1b 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1420,6 +1420,14 @@ def test_time_series_max_train_size(): _check_time_series_max_train_size(splits, check_splits, max_train_size=2) +def test_time_series_test_size(): + pass + + +def test_time_series_gap_size(): + pass + + @pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 def test_nested_cv(): # Test if nested cross validation works with different combinations of cv From 541079821298e7882adb738f7991632bd7f9852a Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 15:11:55 -0500 Subject: [PATCH 02/20] docs and unit tests --- sklearn/model_selection/_split.py | 26 ++++++- sklearn/model_selection/tests/test_split.py | 76 ++++++++++++++++++++- 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index b9699353b10a0..ed75f910e4023 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -746,7 +746,7 @@ class TimeSeriesSplit(_BaseKFold): Number of samples in each test set. Defaults to ``n_samples / (n_splits + 1)``. gap_size : int, default=0 - Number of samples to exclude from the end of the train set before the test set. + Number of samples to exclude from the end of each train set before the test set. Examples @@ -767,12 +767,32 @@ class TimeSeriesSplit(_BaseKFold): TRAIN: [0 1 2] TEST: [3] TRAIN: [0 1 2 3] TEST: [4] TRAIN: [0 1 2 3 4] TEST: [5] + >>> # Fix test_size to 2 with 12 samples + >>> X = np.random.randn(12, 2) + >>> y = np.random.randint(0, 2, 12) + >>> tscv = TimeSeriesSplit3(n_splits=3, test_size=2) + >>> for train_index, test_index in tscv.split(X): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [0 1 2 3 4 5] TEST: [6 7] + TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9] + TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11] + >>> # Add in a 2 period gap + >>> tscv = TimeSeriesSplit3(n_splits=3, test_size=2, gap_size=2) + >>> for train_index, test_index in tscv.split(X): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [0 1 2 3] TEST: [6 7] + TRAIN: [0 1 2 3 4 5] TEST: [8 9] + TRAIN: [0 1 2 3 4 5 6 7] TEST: [10 11] Notes ----- The training set has size ``i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1)`` in the ``i``th split, - with a test set of size ``n_samples//(n_splits + 1)``, + with a test set of size ``n_samples//(n_splits + 1)`` by default, where ``n_samples`` is the number of samples. """ def __init__(self, n_splits='warn', max_train_size=None, test_size=None, gap_size=0): @@ -823,7 +843,7 @@ def split(self, X, y=None, groups=None): if n_samples - gap_size - (test_size * n_splits) <= 0: raise ValueError( ("Too many splits ={0} for number of samples" - " ={1} with test_size ={2} and gap_size ={3}").format(n_splits, + " ={1} with test_size ={2} and gap_size ={3}.").format(n_splits, n_samples, test_size, gap_size)) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index a3b7318b8eb1b..cfad7fded5f6b 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1421,11 +1421,83 @@ def test_time_series_max_train_size(): def test_time_series_test_size(): - pass + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=2, test_size=2, max_train_size=4).split(X) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7]) + assert_array_equal(test, [8, 9]) + + # Should fail with not enough data points for configuration + assert_raises_regexp(ValueError, "Too many splits.*with test_size", + next, TimeSeriesSplit(n_splits=5, test_size=2).split(X)) def test_time_series_gap_size(): - pass + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=2, gap_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=3, gap_size=2, max_train_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with test_size + splits = TimeSeriesSplit(n_splits=2, gap_size=2, + max_train_size=4, test_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [8, 9]) + + # Verify proper error is thrown + assert_raises_regexp(ValueError, "Too many splits.*and gap_size", + next, TimeSeriesSplit(n_splits=4, gap_size=2).split(X)) @pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 From f04c37d915bec9634a33937d3c599b6bc2fc9deb Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 15:12:21 -0500 Subject: [PATCH 03/20] typo in docstring --- sklearn/model_selection/_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index ed75f910e4023..9c031f35f1daf 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -770,7 +770,7 @@ class TimeSeriesSplit(_BaseKFold): >>> # Fix test_size to 2 with 12 samples >>> X = np.random.randn(12, 2) >>> y = np.random.randint(0, 2, 12) - >>> tscv = TimeSeriesSplit3(n_splits=3, test_size=2) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) >>> for train_index, test_index in tscv.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] @@ -779,7 +779,7 @@ class TimeSeriesSplit(_BaseKFold): TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9] TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11] >>> # Add in a 2 period gap - >>> tscv = TimeSeriesSplit3(n_splits=3, test_size=2, gap_size=2) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap_size=2) >>> for train_index, test_index in tscv.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] From 3fed48f823e3d04a31e590f2ec000dc7b04f1fa4 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 15:22:13 -0500 Subject: [PATCH 04/20] flake8; clarify indexing logic --- sklearn/model_selection/_split.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 9c031f35f1daf..fe6e7de3ec30a 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -743,10 +743,12 @@ class TimeSeriesSplit(_BaseKFold): Maximum size for a single training set. test_size : int, optional - Number of samples in each test set. Defaults to ``n_samples / (n_splits + 1)``. + Number of samples in each test set. Defaults to + ``n_samples / (n_splits + 1)``. gap_size : int, default=0 - Number of samples to exclude from the end of each train set before the test set. + Number of samples to exclude from the end of each train set before + the test set. Examples @@ -795,7 +797,8 @@ class TimeSeriesSplit(_BaseKFold): with a test set of size ``n_samples//(n_splits + 1)`` by default, where ``n_samples`` is the number of samples. """ - def __init__(self, n_splits='warn', max_train_size=None, test_size=None, gap_size=0): + def __init__(self, n_splits='warn', max_train_size=None, test_size=None, + gap_size=0): if n_splits == 'warn': warnings.warn(NSPLIT_WARNING, FutureWarning) n_splits = 3 @@ -832,7 +835,7 @@ def split(self, X, y=None, groups=None): n_splits = self.n_splits n_folds = n_splits + 1 gap_size = self.gap_size - test_size = self.test_size if self.test_size else (n_samples // n_folds) + test_size = self.test_size if self.test_size else n_samples // n_folds # Make sure we have enough samples for the given split parameters if n_folds > n_samples: @@ -843,20 +846,20 @@ def split(self, X, y=None, groups=None): if n_samples - gap_size - (test_size * n_splits) <= 0: raise ValueError( ("Too many splits ={0} for number of samples" - " ={1} with test_size ={2} and gap_size ={3}.").format(n_splits, - n_samples, - test_size, - gap_size)) + " ={1} with test_size ={2} and gap_size ={3}." + "").format(n_splits, n_samples, test_size, gap_size)) + indices = np.arange(n_samples) test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) for test_start in test_starts: - if self.max_train_size and self.max_train_size < (test_start - gap_size): - yield (indices[test_start - self.max_train_size - gap_size:test_start - gap_size], + train_end = test_start - gap_size + if self.max_train_size and self.max_train_size < train_end: + yield (indices[train_end - self.max_train_size:train_end], indices[test_start:test_start + test_size]) else: - yield (indices[:test_start - gap_size], + yield (indices[:train_end], indices[test_start:test_start + test_size]) From 266751f399c7247ef8411ea7d27bf9daa0fcf56d Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 18:47:41 -0500 Subject: [PATCH 05/20] flake8 tests --- sklearn/model_selection/tests/test_split.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index cfad7fded5f6b..dbb84a56aa0ab 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1439,7 +1439,8 @@ def test_time_series_test_size(): assert_array_equal(test, [7, 8, 9]) # Test with max_train_size - splits = TimeSeriesSplit(n_splits=2, test_size=2, max_train_size=4).split(X) + splits = TimeSeriesSplit(n_splits=2, test_size=2, + max_train_size=4).split(X) train, test = next(splits) assert_array_equal(train, [2, 3, 4, 5]) @@ -1450,8 +1451,10 @@ def test_time_series_test_size(): assert_array_equal(test, [8, 9]) # Should fail with not enough data points for configuration - assert_raises_regexp(ValueError, "Too many splits.*with test_size", - next, TimeSeriesSplit(n_splits=5, test_size=2).split(X)) + assert_raises_regexp(ValueError, + "Too many splits.*with test_size", + next, + TimeSeriesSplit(n_splits=5, test_size=2).split(X)) def test_time_series_gap_size(): @@ -1496,8 +1499,10 @@ def test_time_series_gap_size(): assert_array_equal(test, [8, 9]) # Verify proper error is thrown - assert_raises_regexp(ValueError, "Too many splits.*and gap_size", - next, TimeSeriesSplit(n_splits=4, gap_size=2).split(X)) + assert_raises_regexp(ValueError, + "Too many splits.*and gap_size", + next, + TimeSeriesSplit(n_splits=4,gap_size=2).split(X)) @pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 From 1e06e43b20ce6fd94635ec38dd25d3de13369f88 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 18:55:37 -0500 Subject: [PATCH 06/20] another flake8 fix in test_split.py --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index dbb84a56aa0ab..dc26f8875563b 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1502,7 +1502,7 @@ def test_time_series_gap_size(): assert_raises_regexp(ValueError, "Too many splits.*and gap_size", next, - TimeSeriesSplit(n_splits=4,gap_size=2).split(X)) + TimeSeriesSplit(n_splits=4, gap_size=2).split(X)) @pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 From 166b9a9c637fc2327867d18a2948da28348e6369 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 19:45:51 -0500 Subject: [PATCH 07/20] fix failing doctest --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index fe6e7de3ec30a..756824a47caa3 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -759,7 +759,7 @@ class TimeSeriesSplit(_BaseKFold): >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit(n_splits=5) >>> print(tscv) # doctest: +NORMALIZE_WHITESPACE - TimeSeriesSplit(max_train_size=None, n_splits=5) + TimeSeriesSplit(gap_size=0, max_train_size=None, n_splits=5, test_size=None) >>> for train_index, test_index in tscv.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] From dbb51b4f6d8519809d9926a82967634fe8cfcb43 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 19:50:00 -0500 Subject: [PATCH 08/20] flake8 again :) --- sklearn/model_selection/_split.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 756824a47caa3..b4c55c78b298c 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -759,7 +759,8 @@ class TimeSeriesSplit(_BaseKFold): >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit(n_splits=5) >>> print(tscv) # doctest: +NORMALIZE_WHITESPACE - TimeSeriesSplit(gap_size=0, max_train_size=None, n_splits=5, test_size=None) + TimeSeriesSplit(gap_size=0, max_train_size=None, n_splits=5, + test_size=None) >>> for train_index, test_index in tscv.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] From a0c16f6473fba160be0cf1148bb73828c64f700e Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 20 Feb 2019 20:39:06 -0500 Subject: [PATCH 09/20] update cv rst documentation --- doc/modules/cross_validation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index 575efef9360c9..81274a326d09f 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -774,7 +774,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples:: >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit(n_splits=3) >>> print(tscv) # doctest: +NORMALIZE_WHITESPACE - TimeSeriesSplit(max_train_size=None, n_splits=3) + TimeSeriesSplit(gap_size=0, max_train_size=None, n_splits=3, test_size=None) >>> for train, test in tscv.split(X): ... print("%s %s" % (train, test)) [0 1 2] [3] From b4fa003028ea566f6172a4aea8c589e8a5d8db04 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 25 Dec 2019 13:21:12 -0600 Subject: [PATCH 10/20] rename gap_size -> gap --- doc/modules/cross_validation.rst | 2 +- sklearn/model_selection/_split.py | 16 ++++++++-------- sklearn/model_selection/tests/test_split.py | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index a20627a3128cf..c723421af312b 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -782,7 +782,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples:: >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit(n_splits=3) >>> print(tscv) # doctest: +NORMALIZE_WHITESPACE - TimeSeriesSplit(gap_size=0, max_train_size=None, n_splits=3, test_size=None) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None) >>> for train, test in tscv.split(X): ... print("%s %s" % (train, test)) [0 1 2] [3] diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 777eb36c7e7f1..b75d68c75b888 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -763,7 +763,7 @@ class TimeSeriesSplit(_BaseKFold): Number of samples in each test set. Defaults to ``n_samples / (n_splits + 1)``. - gap_size : int, default=0 + gap : int, default=0 Number of samples to exclude from the end of each train set before the test set. @@ -818,11 +818,11 @@ def __init__(self, n_splits=5, max_train_size=None, test_size=None, - gap_size=0): + gap=0): super().__init__(n_splits, shuffle=False, random_state=None) self.max_train_size = max_train_size self.test_size = test_size - self.gap_size = gap_size + self.gap = gap def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -851,7 +851,7 @@ def split(self, X, y=None, groups=None): n_samples = _num_samples(X) n_splits = self.n_splits n_folds = n_splits + 1 - gap_size = self.gap_size + gap = self.gap test_size = self.test_size if self.test_size else n_samples // n_folds # Make sure we have enough samples for the given split parameters @@ -860,18 +860,18 @@ def split(self, X, y=None, groups=None): ("Cannot have number of folds ={0} greater" " than the number of samples: {1}.").format(n_folds, n_samples)) - if n_samples - gap_size - (test_size * n_splits) <= 0: + if n_samples - gap - (test_size * n_splits) <= 0: raise ValueError( ("Too many splits ={0} for number of samples" - " ={1} with test_size ={2} and gap_size ={3}." - "").format(n_splits, n_samples, test_size, gap_size)) + " ={1} with test_size ={2} and gap ={3}." + "").format(n_splits, n_samples, test_size, gap)) indices = np.arange(n_samples) test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) for test_start in test_starts: - train_end = test_start - gap_size + train_end = test_start - gap if self.max_train_size and self.max_train_size < train_end: yield (indices[train_end - self.max_train_size:train_end], indices[test_start:test_start + test_size]) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 4228f7ada4d82..581ee82792b1c 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1544,11 +1544,11 @@ def test_time_series_test_size(): TimeSeriesSplit(n_splits=5, test_size=2).split(X)) -def test_time_series_gap_size(): +def test_time_series_gap(): X = np.zeros((10, 1)) # Test alone - splits = TimeSeriesSplit(n_splits=2, gap_size=2).split(X) + splits = TimeSeriesSplit(n_splits=2, gap=2).split(X) train, test = next(splits) assert_array_equal(train, [0, 1]) @@ -1559,7 +1559,7 @@ def test_time_series_gap_size(): assert_array_equal(test, [7, 8, 9]) # Test with max_train_size - splits = TimeSeriesSplit(n_splits=3, gap_size=2, max_train_size=2).split(X) + splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X) train, test = next(splits) assert_array_equal(train, [0, 1]) @@ -1574,7 +1574,7 @@ def test_time_series_gap_size(): assert_array_equal(test, [8, 9]) # Test with test_size - splits = TimeSeriesSplit(n_splits=2, gap_size=2, + splits = TimeSeriesSplit(n_splits=2, gap=2, max_train_size=4, test_size=2).split(X) train, test = next(splits) @@ -1587,9 +1587,9 @@ def test_time_series_gap_size(): # Verify proper error is thrown assert_raises_regexp(ValueError, - "Too many splits.*and gap_size", + "Too many splits.*and gap", next, - TimeSeriesSplit(n_splits=4, gap_size=2).split(X)) + TimeSeriesSplit(n_splits=4, gap=2).split(X)) def test_nested_cv(): From caa43987380575f44f880e2e57079e9842faf12b Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 25 Dec 2019 13:24:50 -0600 Subject: [PATCH 11/20] change check for is not None --- sklearn/model_selection/_split.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index b75d68c75b888..78b822c04f7ca 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -852,7 +852,8 @@ def split(self, X, y=None, groups=None): n_splits = self.n_splits n_folds = n_splits + 1 gap = self.gap - test_size = self.test_size if self.test_size else n_samples // n_folds + test_size = self.test_size if self.test_size is not None \ + else n_samples // n_folds # Make sure we have enough samples for the given split parameters if n_folds > n_samples: From a17d06a6fd404c09fd187f46dfe65293d0fc11aa Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 25 Dec 2019 13:41:15 -0600 Subject: [PATCH 12/20] update docs --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 78b822c04f7ca..bf946bf9675ca 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -776,7 +776,7 @@ class TimeSeriesSplit(_BaseKFold): >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit() >>> print(tscv) - TimeSeriesSplit(max_train_size=None, n_splits=5) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) >>> for train_index, test_index in tscv.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] From 9fdf59a21e3d084b7d9aa6c516c599b1ea4dd14e Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Wed, 25 Dec 2019 14:04:09 -0600 Subject: [PATCH 13/20] update doctests --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index bf946bf9675ca..b91f4b0703071 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -798,7 +798,7 @@ class TimeSeriesSplit(_BaseKFold): TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9] TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11] >>> # Add in a 2 period gap - >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap_size=2) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) >>> for train_index, test_index in tscv.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] From bb52c269b9c1868da1bad20930fdeb4672c664f9 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Mon, 6 Jan 2020 11:37:09 -0500 Subject: [PATCH 14/20] clean up docs and tests --- sklearn/model_selection/_split.py | 5 +++-- sklearn/model_selection/tests/test_split.py | 13 +++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index b91f4b0703071..f77cf308183cb 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -760,8 +760,9 @@ class TimeSeriesSplit(_BaseKFold): Maximum size for a single training set. test_size : int, optional - Number of samples in each test set. Defaults to - ``n_samples / (n_splits + 1)``. + Used to limit the size of the test set. Defaults to + ``n_samples / (n_splits + 1)``, which is the maximum allowed value + with ``gap=0``. gap : int, default=0 Number of samples to exclude from the end of each train set before diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 581ee82792b1c..23f7e67b2fee2 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1538,10 +1538,9 @@ def test_time_series_test_size(): assert_array_equal(test, [8, 9]) # Should fail with not enough data points for configuration - assert_raises_regexp(ValueError, - "Too many splits.*with test_size", - next, - TimeSeriesSplit(n_splits=5, test_size=2).split(X)) + with pytest.raises(ValueError, match="Too many splits.*with test_size"): + TimeSeriesSplit(n_splits=5, test_size=2).split(X) + print(X.shape) def test_time_series_gap(): @@ -1586,10 +1585,8 @@ def test_time_series_gap(): assert_array_equal(test, [8, 9]) # Verify proper error is thrown - assert_raises_regexp(ValueError, - "Too many splits.*and gap", - next, - TimeSeriesSplit(n_splits=4, gap=2).split(X)) + with pytest.raises(ValueError, "Too many splits.*and gap"): + TimeSeriesSplit(n_splits=4, gap=2).split(X) def test_nested_cv(): From a765033e9372591df722cf7124307220edd71ed1 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Mon, 6 Jan 2020 11:37:49 -0500 Subject: [PATCH 15/20] fix kwarg --- sklearn/model_selection/tests/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 23f7e67b2fee2..929cb7569ef71 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1585,7 +1585,7 @@ def test_time_series_gap(): assert_array_equal(test, [8, 9]) # Verify proper error is thrown - with pytest.raises(ValueError, "Too many splits.*and gap"): + with pytest.raises(ValueError, match="Too many splits.*and gap"): TimeSeriesSplit(n_splits=4, gap=2).split(X) From 7461e84b5eede55e4470085fec1e3239caf6f333 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Mon, 6 Jan 2020 11:57:48 -0500 Subject: [PATCH 16/20] update test --- sklearn/model_selection/tests/test_split.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 929cb7569ef71..d267db7f6c29f 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1539,8 +1539,8 @@ def test_time_series_test_size(): # Should fail with not enough data points for configuration with pytest.raises(ValueError, match="Too many splits.*with test_size"): - TimeSeriesSplit(n_splits=5, test_size=2).split(X) - print(X.shape) + splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X) + next(splits) def test_time_series_gap(): @@ -1586,7 +1586,8 @@ def test_time_series_gap(): # Verify proper error is thrown with pytest.raises(ValueError, match="Too many splits.*and gap"): - TimeSeriesSplit(n_splits=4, gap=2).split(X) + splits = TimeSeriesSplit(n_splits=4, gap=2).split(X) + next(splits) def test_nested_cv(): From dc58df91c52d670fe1f668bccc4e5703dcdc626b Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Mon, 4 May 2020 13:23:09 -0400 Subject: [PATCH 17/20] resolve merge conflict --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index e76292de7792d..538fbadd4b372 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -825,7 +825,7 @@ class TimeSeriesSplit(_BaseKFold): @_deprecate_positional_args def __init__(self, n_splits=5, - * + *, max_train_size=None, test_size=None, gap=0): From 02d17bc3adaca3f1782c283d6d548d73dcd366ce Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Sun, 10 May 2020 15:00:50 -0400 Subject: [PATCH 18/20] implement suggested changes --- doc/modules/cross_validation.rst | 2 +- doc/whats_new/v0.24.rst | 9 +++++++++ sklearn/model_selection/_split.py | 15 ++++++--------- sklearn/model_selection/tests/test_split.py | 11 +++++++++++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index feb4734f7a2b3..61ea8c7ef4248 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -781,7 +781,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples:: >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit(n_splits=3) - >>> print(tscv) # doctest: +NORMALIZE_WHITESPACE + >>> print(tscv) TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None) >>> for train, test in tscv.split(X): ... print("%s %s" % (train, test)) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index dd4ab30a7f2ff..2554d136c13ea 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -47,6 +47,15 @@ Changelog :mod:`sklearn.module` ..................... +:mod:`sklearn.model_selection` +.............................. + +- |Enhancement| :class:`model_selection.TimeSeriesSplit` has two new keyword + arguments `test_size` and `gap`. `test_size` allows the out-of-sample + time series length to be fixed for all folds. `gap` removes a fixed number of + samples between the train and test set on each fold. + :pr:`13204` by :user:`Kyle Kosic `. + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 538fbadd4b372..7657a5326735d 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -766,16 +766,15 @@ class TimeSeriesSplit(_BaseKFold): max_train_size : int, default=None Maximum size for a single training set. - test_size : int, optional + test_size : int, default=None Used to limit the size of the test set. Defaults to - ``n_samples / (n_splits + 1)``, which is the maximum allowed value + ``n_samples // (n_splits + 1)``, which is the maximum allowed value with ``gap=0``. gap : int, default=0 Number of samples to exclude from the end of each train set before the test set. - Examples -------- >>> import numpy as np @@ -868,14 +867,12 @@ def split(self, X, y=None, groups=None): # Make sure we have enough samples for the given split parameters if n_folds > n_samples: raise ValueError( - ("Cannot have number of folds ={0} greater" - " than the number of samples: {1}.").format(n_folds, - n_samples)) + (f"Cannot have number of folds ={n_folds} greater" + f" than the number of samples: {n_samples}.")) if n_samples - gap - (test_size * n_splits) <= 0: raise ValueError( - ("Too many splits ={0} for number of samples" - " ={1} with test_size ={2} and gap ={3}." - "").format(n_splits, n_samples, test_size, gap)) + (f"Too many splits ={n_splits} for number of samples" + f" ={n_samples} with test_size ={test_size} and gap ={gap}.")) indices = np.arange(n_samples) test_starts = range(n_samples - n_splits * test_size, diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index ea2a58007178f..b89571ba085dd 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1517,6 +1517,17 @@ def test_time_series_gap(): assert_array_equal(train, [2, 3, 4, 5]) assert_array_equal(test, [8, 9]) + # Test with additional test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + # Verify proper error is thrown with pytest.raises(ValueError, match="Too many splits.*and gap"): splits = TimeSeriesSplit(n_splits=4, gap=2).split(X) From 0f21d453d344405a2b2092eaa544fd13edf19fb0 Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Sun, 10 May 2020 21:41:27 -0400 Subject: [PATCH 19/20] Update sklearn/model_selection/_split.py Co-authored-by: Thomas J Fan --- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 7657a5326735d..940f4459a9ad9 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -867,7 +867,7 @@ def split(self, X, y=None, groups=None): # Make sure we have enough samples for the given split parameters if n_folds > n_samples: raise ValueError( - (f"Cannot have number of folds ={n_folds} greater" + (f"Cannot have number of folds={n_folds} greater" f" than the number of samples: {n_samples}.")) if n_samples - gap - (test_size * n_splits) <= 0: raise ValueError( From d6797e33d2af5835f04cb4741d3a0741dabd1b1e Mon Sep 17 00:00:00 2001 From: Kyle Kosic Date: Sun, 10 May 2020 21:42:02 -0400 Subject: [PATCH 20/20] Apply suggestions from code review Co-authored-by: Thomas J Fan --- sklearn/model_selection/_split.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 940f4459a9ad9..75a4b865fda62 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -868,11 +868,11 @@ def split(self, X, y=None, groups=None): if n_folds > n_samples: raise ValueError( (f"Cannot have number of folds={n_folds} greater" - f" than the number of samples: {n_samples}.")) + f" than the number of samples={n_samples}.")) if n_samples - gap - (test_size * n_splits) <= 0: raise ValueError( - (f"Too many splits ={n_splits} for number of samples" - f" ={n_samples} with test_size ={test_size} and gap ={gap}.")) + (f"Too many splits={n_splits} for number of samples" + f"={n_samples} with test_size={test_size} and gap={gap}.")) indices = np.arange(n_samples) test_starts = range(n_samples - n_splits * test_size,