[MRG] Feature: Additional `TimeSeriesSplit` Functionality by kykosic · Pull Request #13204 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] Feature: Additional TimeSeriesSplit Functionality #13204

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
May 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/modules/cross_validation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples::
>>> y = np.array([1, 2, 3, 4, 5, 6])
>>> tscv = TimeSeriesSplit(n_splits=3)
>>> print(tscv)
-   TimeSeriesSplit(max_train_size=None, n_splits=3)
+   TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None)
>>> for train, test in tscv.split(X):
... print("%s %s" % (train, test))
[0 1 2] [3]
Expand Down
9 changes: 9 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ Changelog
:mod:`sklearn.module`
.....................

:mod:`sklearn.model_selection`
..............................

- |Enhancement| :class:`model_selection.TimeSeriesSplit` has two new keyword
arguments `test_size` and `gap`. `test_size` allows the out-of-sample
time series length to be fixed for all folds. `gap` removes a fixed number of
samples between the train and test set on each fold.
:pr:`13204` by :user:`Kyle Kosic <kykosic>`.


Code and Documentation Contributors
-----------------------------------
Expand Down
68 changes: 57 additions & 11 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,15 @@ class TimeSeriesSplit(_BaseKFold):
max_train_size : int, default=None
Maximum size for a single training set.

test_size : int, default=None
Used to limit the size of the test set. Defaults to
``n_samples // (n_splits + 1)``, which is the maximum allowed value
with ``gap=0``.

gap : int, default=0
Number of samples to exclude from the end of each train set before
the test set.

Examples
--------
>>> import numpy as np
Expand All @@ -774,7 +783,7 @@ class TimeSeriesSplit(_BaseKFold):
>>> y = np.array([1, 2, 3, 4, 5, 6])
>>> tscv = TimeSeriesSplit()
>>> print(tscv)
-   TimeSeriesSplit(max_train_size=None, n_splits=5)
+   TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
>>> for train_index, test_index in tscv.split(X):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
Expand All @@ -784,18 +793,45 @@ class TimeSeriesSplit(_BaseKFold):
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]
>>> # Fix test_size to 2 with 12 samples
>>> X = np.random.randn(12, 2)
>>> y = np.random.randint(0, 2, 12)
>>> tscv = TimeSeriesSplit(n_splits=3, test_size=2)
>>> for train_index, test_index in tscv.split(X):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1 2 3 4 5] TEST: [6 7]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]
>>> # Add in a 2 period gap
>>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2)
>>> for train_index, test_index in tscv.split(X):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1 2 3] TEST: [6 7]
TRAIN: [0 1 2 3 4 5] TEST: [8 9]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [10 11]

Notes
-----
The training set has size ``i * n_samples // (n_splits + 1)
+ n_samples % (n_splits + 1)`` in the ``i``th split,
-   with a test set of size ``n_samples//(n_splits + 1)``,
+   with a test set of size ``n_samples//(n_splits + 1)`` by default,
where ``n_samples`` is the number of samples.
"""
@_deprecate_positional_args
def __init__(self, n_splits=5, *, max_train_size=None, test_size=None,
             gap=0):
    # Shuffling is deliberately disabled: this splitter exists to preserve
    # the temporal ordering of the samples, so `shuffle` and `random_state`
    # are fixed rather than exposed.
    super().__init__(n_splits, shuffle=False, random_state=None)
    # Upper bound on the size of each training fold (None = unbounded).
    self.max_train_size = max_train_size
    # Fixed test-fold length; None defaults to n_samples // (n_splits + 1)
    # at split time.
    self.test_size = test_size
    # Number of samples dropped between the end of each training fold and
    # the start of the corresponding test fold.
    self.gap = gap

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where ``n_samples`` is the number of samples
        and ``n_features`` is the number of features.

    y : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    gap = self.gap
    # Default test fold size is an (n_splits + 1)-way equal partition.
    test_size = self.test_size if self.test_size is not None \
        else n_samples // n_folds

    # Make sure we have enough samples for the given split parameters.
    if n_folds > n_samples:
        raise ValueError(
            (f"Cannot have number of folds={n_folds} greater"
             f" than the number of samples={n_samples}."))
    # The first training fold must be non-empty after removing the gap
    # and all n_splits test folds from the tail of the series.
    if n_samples - gap - (test_size * n_splits) <= 0:
        raise ValueError(
            (f"Too many splits={n_splits} for number of samples"
             f"={n_samples} with test_size={test_size} and gap={gap}."))

    indices = np.arange(n_samples)
    # Test folds tile the tail of the series back-to-back.
    test_starts = range(n_samples - n_splits * test_size,
                        n_samples, test_size)

    for test_start in test_starts:
        # Training data ends `gap` samples before the test fold begins.
        train_end = test_start - gap
        if self.max_train_size and self.max_train_size < train_end:
            # Keep only the most recent `max_train_size` samples.
            yield (indices[train_end - self.max_train_size:train_end],
                   indices[test_start:test_start + test_size])
        else:
            yield (indices[:train_end],
                   indices[test_start:test_start + test_size])


Expand Down
94 changes: 94 additions & 0 deletions sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -1440,6 +1440,100 @@ def test_time_series_max_train_size():
_check_time_series_max_train_size(splits, check_splits, max_train_size=2)


def test_time_series_test_size():
    """Check the ``test_size`` keyword of :class:`TimeSeriesSplit`."""
    X = np.zeros((10, 1))

    # test_size on its own: fixed-length test folds tiling the tail.
    folds = TimeSeriesSplit(n_splits=3, test_size=3).split(X)
    for expected_train, expected_test in [
        ([0], [1, 2, 3]),
        ([0, 1, 2, 3], [4, 5, 6]),
        ([0, 1, 2, 3, 4, 5, 6], [7, 8, 9]),
    ]:
        train, test = next(folds)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)

    # test_size combined with max_train_size: training folds are capped.
    folds = TimeSeriesSplit(n_splits=2, test_size=2,
                            max_train_size=4).split(X)
    for expected_train, expected_test in [
        ([2, 3, 4, 5], [6, 7]),
        ([4, 5, 6, 7], [8, 9]),
    ]:
        train, test = next(folds)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)

    # Should fail with not enough data points for configuration.
    with pytest.raises(ValueError, match="Too many splits.*with test_size"):
        folds = TimeSeriesSplit(n_splits=5, test_size=2).split(X)
        next(folds)


def test_time_series_gap():
    """Check the ``gap`` keyword of :class:`TimeSeriesSplit`."""
    # TODO(review): this test and test_time_series_test_size could be
    # combined via pytest.mark.parametrize, with the two error cases moved
    # to their own test (reviewer suggestion, deferred to a follow-up PR).
    X = np.zeros((10, 1))

    # gap alone: 2 samples are dropped between each train and test fold.
    splits = TimeSeriesSplit(n_splits=2, gap=2).split(X)

    train, test = next(splits)
    assert_array_equal(train, [0, 1])
    assert_array_equal(test, [4, 5, 6])

    train, test = next(splits)
    assert_array_equal(train, [0, 1, 2, 3, 4])
    assert_array_equal(test, [7, 8, 9])

    # gap with max_train_size: training folds are capped at 2 samples.
    splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X)

    train, test = next(splits)
    assert_array_equal(train, [0, 1])
    assert_array_equal(test, [4, 5])

    train, test = next(splits)
    assert_array_equal(train, [2, 3])
    assert_array_equal(test, [6, 7])

    train, test = next(splits)
    assert_array_equal(train, [4, 5])
    assert_array_equal(test, [8, 9])

    # gap with both max_train_size and test_size.
    splits = TimeSeriesSplit(n_splits=2, gap=2,
                             max_train_size=4, test_size=2).split(X)

    train, test = next(splits)
    assert_array_equal(train, [0, 1, 2, 3])
    assert_array_equal(test, [6, 7])

    train, test = next(splits)
    assert_array_equal(train, [2, 3, 4, 5])
    assert_array_equal(test, [8, 9])

    # gap with test_size only.
    splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X)

    train, test = next(splits)
    assert_array_equal(train, [0, 1])
    assert_array_equal(test, [4, 5, 6])

    train, test = next(splits)
    assert_array_equal(train, [0, 1, 2, 3, 4])
    assert_array_equal(test, [7, 8, 9])

    # Verify proper error is thrown when the gap leaves no training data.
    with pytest.raises(ValueError, match="Too many splits.*and gap"):
        splits = TimeSeriesSplit(n_splits=4, gap=2).split(X)
        next(splits)


def test_nested_cv():
# Test if nested cross validation works with different combinations of cv
rng = np.random.RandomState(0)
Expand Down
0