8000 [MRG] ENH Support split by group in TimeSeriesSplit by tczhao · Pull Request #19496 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] ENH Support split by group in TimeSeriesSplit #19496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 50 additions & 13 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,17 +761,19 @@ class TimeSeriesSplit(_BaseKFold):

max_train_size : int, default=None
Maximum size for a single training set,
or maximum number of data groups when ``groups`` is supplied.

test_size : int, default=None
Used to limit the size of the test set. Defaults to
``n_samples // (n_splits + 1)``, which is the maximum allowed value
with ``gap=0``.
with ``gap=0``,
or number of data groups when ``groups`` is supplied.

.. versionadded:: 0.24

gap : int, default=0
Number of samples to exclude from the end of each train set before
the test set.
Number of samples or groups to exclude from the end of each train set
before the test set.

.. versionadded:: 0.24

Expand All @@ -793,6 +795,15 @@ class TimeSeriesSplit(_BaseKFold):
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]
>>> # Split using group
>>> tscv = TimeSeriesSplit(n_splits=2)
>>> groups = np.array([1, 1, 2, 3, 4, 4])
>>> for train_index, test_index in tscv.split(X, groups=groups):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4 5]
>>> # Fix test_size to 2 with 12 samples
>>> X = np.random.randn(12, 2)
>>> y = np.random.randint(0, 2, 12)
Expand Down Expand Up @@ -845,8 +856,9 @@ def split(self, X, y=None, groups=None):
y : array-like of shape (n_samples,)
Always ignored, exists for compatibility.

groups : array-like of shape (n_samples,)
Always ignored, exists for compatibility.
groups : array-like of shape (n_samples,), default=None
Group labels for the samples used while splitting the dataset into
train/test set.

Yields
------
Expand All @@ -856,7 +868,18 @@ def split(self, X, y=None, groups=None):
test : ndarray
The testing set indices for that split.
"""
X, y, groups = indexable(X, y, groups)
samples, y, groups = indexable(X, y, groups)

if groups is None:
X = samples
cv_type = "samples"
else:
_, count_index, count = np.unique(groups, return_counts=True,
return_index=True)
X = np.argsort(count_index)
cum_count = np.concatenate(([0], np.cumsum(count[X])))
cv_type = "groups"

n_samples = _num_samples(X)
n_splits = self.n_splits
n_folds = n_splits + 1
Expand All @@ -868,24 +891,38 @@ def split(self, X, y=None, groups=None):
if n_folds > n_samples:
raise ValueError(
(f"Cannot have number of folds={n_folds} greater"
f" than the number of samples={n_samples}."))
f" than the number of {cv_type}={n_samples}."))
if n_samples - gap - (test_size * n_splits) <= 0:
raise ValueError(
(f"Too many splits={n_splits} for number of samples"
(f"Too many splits={n_splits} for number of {cv_type}"
f"={n_samples} with test_size={test_size} and gap={gap}."))

indices = np.arange(n_samples)
if groups is None:
indices = np.arange(n_samples)
else:
indices = np.arange(_num_samples(samples))
test_starts = range(n_samples - n_splits * test_size,
n_samples, test_size)

for test_start in test_starts:
train_end = test_start - gap
if self.max_train_size and self.max_train_size < train_end:
yield (indices[train_end - self.max_train_size:train_end],
indices[test_start:test_start + test_size])
if groups is None:
yield (indices[train_end - self.max_train_size:train_end],
indices[test_start:test_start + test_size])
else:
yield (indices[cum_count[train_end - self.max_train_size]:
cum_count[train_end]],
indices[cum_count[test_start]:
cum_count[test_start + test_size]])
else:
yield (indices[:train_end],
indices[test_start:test_start + test_size])
if groups is None:
yield (indices[:train_end],
indices[test_start:test_start + test_size])
else:
yield (indices[:cum_count[train_end]],
indices[cum_count[test_start]:
cum_count[test_start + test_size]])


class LeaveOneGroupOut(BaseCrossValidator):
Expand Down
121 changes: 120 additions & 1 deletion sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def test_2d_y():
ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
GroupShuffleSplit(), LeaveOneGroupOut(),
LeavePGroupsOut(n_groups=2), GroupKFold(n_splits=3),
TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
TimeSeriesSplit(2),
PredefinedSplit(test_fold=groups)]
for splitter in splitters:
list(splitter.split(X, y, groups))
list(splitter.split(X, y_2d, groups))
Expand Down Expand Up @@ -1381,6 +1382,7 @@ def test_group_kfold():

def test_time_series_cv():
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]
groups = np.array([1, 1, 2, 2, 3, 4, 5])

# Should fail if there are more folds than samples
assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
Expand Down Expand Up @@ -1410,12 +1412,37 @@ def test_time_series_cv():
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5, 6])

# ordering on toy datasets with group
splits = tscv.split(X[:-1], groups=groups[:-1])
train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3])
assert_array_equal(test, [4])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5])

splits = TimeSeriesSplit(2).split(X)

train, test = next(splits)
assert_array_equal(train, [0, 1, 2])
assert_array_equal(test, [3, 4])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5, 6])

# Check get_n_splits returns the correct number of splits
splits = TimeSeriesSplit(2).split(X)
n_splits_actual = len(list(splits))
assert n_splits_actual == tscv.get_n_splits()
assert n_splits_actual == 2

splits = TimeSeriesSplit(2).split(X, groups=groups)
n_splits_actual = len(list(splits))
assert n_splits_actual == tscv.get_n_splits()
assert n_splits_actual == 2


def _check_time_series_max_train_size(splits, check_splits, max_train_size):
for (train, test), (check_train, check_test) in zip(splits, check_splits):
Expand All @@ -1427,21 +1454,39 @@ def _check_time_series_max_train_size(splits, check_splits, max_train_size):

def test_time_series_max_train_size():
X = np.zeros((6, 1))
groups = np.array([3, 4, 5, 1, 2, 2])
splits = TimeSeriesSplit(n_splits=3).split(X)
group_splits = TimeSeriesSplit(n_splits=3).split(X, groups=groups)

check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X)
_check_time_series_max_train_size(splits, check_splits, max_train_size=3)

check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3) \
.split(X, groups=groups)
_check_time_series_max_train_size(group_splits,
check_splits, max_train_size=3)

# Test for the case where the size of a fold is greater than max_train_size
check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X)
_check_time_series_max_train_size(splits, check_splits, max_train_size=2)

check_splits = TimeSeriesSplit(n_splits=2, max_train_size=2) \
.split(X, groups=groups)
_check_time_series_max_train_size(group_splits,
check_splits, max_train_size=2)

# Test for the case where the size of each fold is less than max_train_size
check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
_check_time_series_max_train_size(splits, check_splits, max_train_size=2)

check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
_check_time_series_max_train_size(group_splits,
check_splits, max_train_size=2)


def test_time_series_test_size():
X = np.zeros((10, 1))
groups = np.array([6, 7, 1, 1, 1, 2, 2, 3, 4, 5])

# Test alone
splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X)
Expand All @@ -1458,6 +1503,21 @@ def test_time_series_test_size():
assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6])
assert_array_equal(test, [7, 8, 9])

# Test alone with groups
splits = TimeSeriesSplit(n_splits=3, test_size=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0])
assert_array_equal(test, [1, 2, 3, 4])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7])
assert_array_equal(test, [8, 9])

# Test with max_train_size
splits = TimeSeriesSplit(n_splits=2, test_size=2,
max_train_size=4).split(X)
Expand All @@ -1470,14 +1530,31 @@ def test_time_series_test_size():
assert_array_equal(train, [4, 5, 6, 7])
assert_array_equal(test, [8, 9])

# Test with max_train_size and groups
splits = TimeSeriesSplit(n_splits=2, test_size=2,
max_train_size=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [1, 2, 3, 4])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [5, 6, 7])
assert_array_equal(test, [8, 9])

# Should fail with not enough data points for configuration
with pytest.raises(ValueError, match="Too many splits.*with test_size"):
splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X)
next(splits)
with pytest.raises(ValueError, match="Too many splits.*with test_size"):
splits = TimeSeriesSplit(n_splits=5, test_size=2) \
.split(X, groups=groups)
next(splits)


def test_time_series_gap():
X = np.zeros((10, 1))
groups = np.array([6, 7, 1, 1, 1, 2, 2, 3, 4, 5])

# Test alone
splits = TimeSeriesSplit(n_splits=2, gap=2).split(X)
Expand All @@ -1490,6 +1567,17 @@ def test_time_series_gap():
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [7, 8, 9])

# Test alone with groups
splits = TimeSeriesSplit(n_splits=2, gap=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [8, 9])

# Test with max_train_size
splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X)

Expand All @@ -1505,6 +1593,22 @@ def test_time_series_gap():
assert_array_equal(train, [4, 5])
assert_array_equal(test, [8, 9])

# Test with max_train_size and groups
splits = TimeSeriesSplit(n_splits=3, gap=2,
max_train_size=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0, 1])
assert_array_equal(test, [7])

train, test = next(splits)
assert_array_equal(train, [1, 2, 3, 4])
assert_array_equal(test, [8])

train, test = next(splits)
assert_array_equal(train, [2, 3, 4, 5, 6])
assert_array_equal(test, [9])

# Test with test_size
splits = TimeSeriesSplit(n_splits=2, gap=2,
max_train_size=4, test_size=2).split(X)
Expand All @@ -1517,6 +1621,18 @@ def test_time_series_gap():
assert_array_equal(train, [2, 3, 4, 5])
assert_array_equal(test, [8, 9])

# Test with test_size and groups
splits = TimeSeriesSplit(n_splits=2, gap=2, max_train_size=4, test_size=2)\
.split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [8, 9])

# Test with additional test_size
splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X)

Expand All @@ -1532,6 +1648,9 @@ def test_time_series_gap():
with pytest.raises(ValueError, match="Too many splits.*and gap"):
splits = TimeSeriesSplit(n_splits=4, gap=2).split(X)
next(splits)
with pytest.raises(ValueError, match="Too many splits.*and gap"):
splits = TimeSeriesSplit(n_splits=5, gap=2).split(X, groups=groups)
next(splits)


def test_nested_cv():
Expand Down
0