8000 [MRG] ENH Support split by group in TimeSeriesSplit by tczhao · Pull Request #19496 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] ENH Support split by group in TimeSeriesSplit #19496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 50 additions & 13 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,17 +761,19 @@ class TimeSeriesSplit(_BaseKFold):

max_train_size : int, default=None
Maximum size for a single training set,
or maximum number of data groups when ``groups`` is supplied.

test_size : int, default=None
Used to limit the size of the test set. Defaults to
``n_samples // (n_splits + 1)``, which is the maximum allowed value
with ``gap=0``.
with ``gap=0``,
or number of data groups when ``groups`` is supplied.

.. versionadded:: 0.24

gap : int, default=0
Number of samples to exclude from the end of each train set before
the test set.
Number of samples or groups to exclude from the end of each train set
before the test set.

.. versionadded:: 0.24

Expand All @@ -793,6 +795,15 @@ class TimeSeriesSplit(_BaseKFold):
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]
>>> # Split using group
>>> tscv = TimeSeriesSplit(n_splits=2)
>>> groups = np.array([1, 1, 2, 3, 4, 4])
>>> for train_index, test_index in tscv.split(X, groups=groups):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4 5]
>>> # Fix test_size to 2 with 12 samples
>>> X = np.random.randn(12, 2)
>>> y = np.random.randint(0, 2, 12)
Expand Down Expand Up @@ -845,8 +856,9 @@ def split(self, X, y=None, groups=None):
y : array-like of shape (n_samples,)
Always ignored, exists for compatibility.

groups : array-like of shape (n_samples,)
Always ignored, exists for compatibility.
groups : array-like of shape (n_samples,), default=None
Group labels for the samples used while splitting the dataset into
train/test set.

Yields
------
Expand All @@ -856,7 +868,18 @@ def split(self, X, y=None, groups=None):
test : ndarray
The testing set indices for that split.
"""
X, y, groups = indexable(X, y, groups)
samples, y, groups = indexable(X, y, groups)

if groups is None:
X = samples
cv_type = "samples"
else:
_, count_index, count = np.unique(groups, return_counts=True,
return_index=True)
X = np.argsort(count_index)
cum_count = np.concatenate(([0], np.cumsum(count[X])))
cv_type = "groups"

n_samples = _num_samples(X)
n_splits = self.n_splits
n_folds = n_splits + 1
Expand All @@ -868,24 +891,38 @@ def split(self, X, y=None, groups=None):
if n_folds > n_samples:
raise ValueError(
(f"Cannot have number of folds={n_folds} greater"
f" than the number of samples={n_samples}."))
f" than the number of {cv_type}={n_samples}."))
if n_samples - gap - (test_size * n_splits) <= 0:
raise ValueError(
(f"Too many splits={n_splits} for number of samples"
(f"Too many splits={n_splits} for number of {cv_type}"
f"={n_samples} with test_size={test_size} and gap={gap}."))

indices = np.arange(n_samples)
if groups is None:
indices = np.arange(n_samples)
else:
indices = np.arange(_num_samples(samples))
test_starts = range(n_samples - n_splits * test_size,
n_samples, test_size)

for test_start in test_starts:
train_end = test_start - gap
if self.max_train_size and self.max_train_size < train_end:
yield (indices[train_end - self.max_train_size:train_end],
indices[test_start:test_start + test_size])
if groups is None:
yield (indices[train_end - self.max_train_size:train_end],
indices[test_start:test_start + test_size])
else:
yield (indices[cum_count[train_end - self.max_train_size]:
cum_count[train_end]],
indices[cum_count[test_start]:
cum_count[test_start + test_size]])
else:
yield (indices[:train_end],
indices[test_start:test_start + test_size])
if groups is None:
yield (indices[:train_end],
indices[test_start:test_start + test_size])
else:
yield (indices[:cum_count[train_end]],
indices[cum_count[test_start]:
cum_count[test_start + test_size]])


class LeaveOneGroupOut(BaseCrossValidator):
Expand Down
121 changes: 120 additions & 1 deletion sklearn/model_selection/tests/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def test_2d_y():
ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
GroupShuffleSplit(), LeaveOneGroupOut(),
LeavePGroupsOut(n_groups=2), GroupKFold(n_splits=3),
TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
TimeSeriesSplit(2),
PredefinedSplit(test_fold=groups)]
for splitter in splitters:
list(splitter.split(X, y, groups))
list(splitter.split(X, y_2d, groups))
Expand Down Expand Up @@ -1381,6 +1382,7 @@ def test_group_kfold():

def test_time_series_cv():
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]
groups = np.array([1, 1, 2, 2, 3, 4, 5])

# Should fail if there are more folds than samples
assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
Expand Down Expand Up @@ -1410,12 +1412,37 @@ def test_time_series_cv():
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5, 6])

# ordering on toy datasets with group
splits = tscv.split(X[:-1], groups=groups[:-1])
train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3])
assert_array_equal(test, [4])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5])

splits = TimeSeriesSplit(2).split(X)

train, test = next(splits)
assert_array_equal(train, [0, 1, 2])
assert_array_equal(test, [3, 4])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5, 6])

# Check get_n_splits returns the correct number of splits
splits = TimeSeriesSplit(2).split(X)
n_splits_actual = len(list(splits))
assert n_splits_actual == tscv.get_n_splits()
assert n_splits_actual == 2

splits = TimeSeriesSplit(2).split(X, groups=groups)
n_splits_actual = len(list(splits))
assert n_splits_actual == tscv.get_n_splits()
assert n_splits_actual == 2


def _check_time_series_max_train_size(splits, check_splits, max_train_size):
for (train, test), (check_train, check_test) in zip(splits, check_splits):
Expand All @@ -1427,21 +1454,39 @@ def _check_time_series_max_train_size(splits, check_splits, max_train_size):

def test_time_series_max_train_size():
X = np.zeros((6, 1))
groups = np.array([3, 4, 5, 1, 2, 2])
splits = TimeSeriesSplit(n_splits=3).split(X)
group_splits = TimeSeriesSplit(n_splits=3).split(X, groups=groups)

check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X)
_check_time_series_max_train_size(splits, check_splits, max_train_size=3)

check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3) \
.split(X, groups=groups)
_check_time_series_max_train_size(group_splits,
check_splits, max_train_size=3)

# Test for the case where the size of a fold is greater than max_train_size
check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X)
_check_time_series_max_train_size(splits, check_splits, max_train_size=2)

check_splits = TimeSeriesSplit(n_splits=2, max_train_size=2) \
.split(X, groups=groups)
_check_time_series_max_train_size(group_splits,
check_splits, max_train_size=2)

# Test for the case where the size of each fold is less than max_train_size
check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
_check_time_series_max_train_size(splits, check_splits, max_train_size=2)

check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
_check_time_series_max_train_size(group_splits,
check_splits, max_train_size=2)


def test_time_series_test_size():
X = np.zeros((10, 1))
groups = np.array([6, 7, 1, 1, 1, 2, 2, 3, 4, 5])

# Test alone
splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X)
Expand All @@ -1458,6 +1503,21 @@ def test_time_series_test_size():
assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6])
assert_array_equal(test, [7, 8, 9])

# Test alone with groups
splits = TimeSeriesSplit(n_splits=3, test_size=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0])
assert_array_equal(test, [1, 2, 3, 4])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7])
assert_array_equal(test, [8, 9])

# Test with max_train_size
splits = TimeSeriesSplit(n_splits=2, test_size=2,
max_train_size=4).split(X)
Expand All @@ -1470,14 +1530,31 @@ def test_time_series_test_size():
assert_array_equal(train, [4, 5, 6, 7])
assert_array_equal(test, [8, 9])

# Test with max_train_size and groups
splits = TimeSeriesSplit(n_splits=2, test_size=2,
max_train_size=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [1, 2, 3, 4])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [5, 6, 7])
assert_array_equal(test, [8, 9])

# Should fail with not enough data points for configuration
with pytest.raises(ValueError, match="Too many splits.*with test_size"):
splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X)
next(splits)
with pytest.raises(ValueError, match="Too many splits.*with test_size"):
splits = TimeSeriesSplit(n_splits=5, test_size=2) \
.split(X, groups=groups)
next(splits)


def test_time_series_gap():
X = np.zeros((10, 1))
groups = np.array([6, 7, 1, 1, 1, 2, 2, 3, 4, 5])

# Test alone
splits = TimeSeriesSplit(n_splits=2, gap=2).split(X)
Expand All @@ -1490,6 +1567,17 @@ def test_time_series_gap():
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [7, 8, 9])

# Test alone with groups
splits = TimeSeriesSplit(n_splits=2, gap=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [8, 9])

# Test with max_train_size
splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X)

Expand All @@ -1505,6 +1593,22 @@ def test_time_series_gap():
assert_array_equal(train, [4, 5])
assert_array_equal(test, [8, 9])

# Test with max_train_size and groups
splits = TimeSeriesSplit(n_splits=3, gap=2,
max_train_size=2).split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0, 1])
assert_array_equal(test, [7])

train, test = next(splits)
assert_array_equal(train, [1, 2, 3, 4])
assert_array_equal(test, [8])

train, test = next(splits)
assert_array_equal(train, [2, 3, 4, 5, 6])
assert_array_equal(test, [9])

# Test with test_size
splits = TimeSeriesSplit(n_splits=2, gap=2,
max_train_size=4, test_size=2).split(X)
Expand All @@ -1517,6 +1621,18 @@ def test_time_series_gap():
assert_array_equal(train, [2, 3, 4, 5])
assert_array_equal(test, [8, 9])

# Test with test_size and groups
splits = TimeSeriesSplit(n_splits=2, gap=2, max_train_size=4, test_size=2)\
.split(X, groups=groups)

train, test = next(splits)
assert_array_equal(train, [0])
assert_array_equal(test, [5, 6, 7])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [8, 9])

# Test with additional test_size
splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X)

Expand All @@ -1532,6 +1648,9 @@ def test_time_series_gap():
with pytest.raises(ValueError, match="Too many splits.*and gap"):
splits = TimeSeriesSplit(n_splits=4, gap=2).split(X)
next(splits)
with pytest.raises(ValueError, match="Too many splits.*and gap"):
splits = TimeSeriesSplit(n_splits=5, gap=2).split(X, groups=groups)
next(splits)


def test_nested_cv():
Expand Down
0