[WIP] Add TimeSeriesCV and HomogeneousTimeSeriesCV by yenchenlin · Pull Request #6351 · scikit-learn/scikit-learn · GitHub

Closed
2 changes: 2 additions & 0 deletions sklearn/model_selection/__init__.py
@@ -2,6 +2,7 @@
from ._split import KFold
from ._split import LabelKFold
from ._split import StratifiedKFold
from ._split import HomogeneousTimeSeriesCV
from ._split import LeaveOneLabelOut
from ._split import LeaveOneOut
from ._split import LeavePLabelOut
@@ -27,6 +28,7 @@

__all__ = ('BaseCrossValidator',
'GridSearchCV',
'HomogeneousTimeSeriesCV',
'KFold',
'LabelKFold',
'LabelShuffleSplit',
115 changes: 115 additions & 0 deletions sklearn/model_selection/_split.py
@@ -637,6 +637,121 @@ def split(self, X, y, labels=None):
"""
return super(StratifiedKFold, self).split(X, y, labels)

class HomogeneousTimeSeriesCV(_BaseKFold):
Contributor Author:

It is convenient to make HomogeneousTimeSeriesCV a subclass of _BaseKFold, since _BaseKFold.__init__ already validates the n_folds parameter.

However, to make HomogeneousTimeSeriesCV work, it has to override the split method defined in its superclass _BaseKFold and its super-superclass BaseCrossValidator, because of how those classes implement split.

I don't think overriding split in HomogeneousTimeSeriesCV is a good solution, since the other subclasses of _BaseKFold don't override split but instead override _iter_test_indices or _iter_test_masks. Can @rvraghav93 provide some suggestions on this?
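A minimal sketch of the problem being described (hypothetical class name, not the actual sklearn source): a `_BaseKFold`-style base class derives the train set as the complement of each test fold, whereas a time-series splitter needs the train set to contain only the indices that precede the test fold.

```python
import numpy as np


class SketchBaseKFold:
    """Simplified stand-in for _BaseKFold (for illustration only)."""

    def __init__(self, n_folds=3):
        self.n_folds = n_folds

    def _iter_test_indices(self, X):
        # Contiguous folds; the first n_samples % n_folds folds get one
        # extra sample, mirroring the KFold sizing scheme.
        n_samples = len(X)
        fold_sizes = np.full(self.n_folds, n_samples // self.n_folds,
                             dtype=int)
        fold_sizes[:n_samples % self.n_folds] += 1
        current = 0
        for size in fold_sizes:
            yield np.arange(current, current + size)
            current += size

    def split(self, X):
        # Base-class contract: train = everything NOT in the test fold.
        # A time-series CV instead needs train = everything BEFORE the
        # test fold, which is why overriding _iter_test_indices alone
        # is not enough here.
        indices = np.arange(len(X))
        for test_index in self._iter_test_indices(X):
            yield np.setdiff1d(indices, test_index), test_index
```

With 6 samples and 3 folds, the first split yields train=[2 3 4 5], test=[0 1]: samples from the future leak into the train set, which is exactly what a time-series splitter must avoid.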

"""Homogeneous Time Series cross-validator

Provides train/test indices to split time series data in train/test sets.

This cross-validation object is a variation of KFold.
In the kth iteration, it returns the first k folds as the train set and
the (k+1)th fold as the test set.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
n_folds : int, default=3
Number of folds. Must be at least 2.

Examples
--------
>>> import numpy as np
>>> from sklearn.model_selection import HomogeneousTimeSeriesCV
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
>>> y = np.array([1, 2, 3, 4])
>>> htscv = HomogeneousTimeSeriesCV(n_folds=4)
Member:

It seems a bit odd to me that n_folds actually yields n_folds - 1 splits.

Any specific reason to do this?

Contributor Author:

If we split the data into n_folds folds, there are actually only n_folds - 1 splits for TimeSeriesCV, since the first fold can never be a test fold.

Contributor Author:

It is also mentioned in the following blog:

Split data in train and test set given a Date (i.e. the test set is what happens after 2 April 2014, included).
Split the train set (i.e. what happens before 2 April 2014, not included) into, for example, 10 consecutive time folds.
Then, in order not to lose the time information, perform the following steps:
Train on fold 1 –> Test on fold 2
Train on fold 1+2 –> Test on fold 3
Train on fold 1+2+3 –> Test on fold 4
Train on fold 1+2+3+4 –> Test on fold 5
Train on fold 1+2+3+4+5 –> Test on fold 6
Train on fold 1+2+3+4+5+6 –> Test on fold 7
Train on fold 1+2+3+4+5+6+7 –> Test on fold 8
Train on fold 1+2+3+4+5+6+7+8 –> Test on fold 9
Train on fold 1+2+3+4+5+6+7+8+9 –> Test on fold 10
Compute the average of the accuracies of the 9 test folds (number of folds – 1)

http://francescopochetti.com/pythonic-cross-validation-time-series-pandas-scikit-learn/
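The expanding-window procedure described in that blog post can be sketched as follows (hypothetical helper name; fold boundaries are taken from np.linspace here rather than the remainder-based sizing this PR uses):

```python
import numpy as np


def expanding_window_splits(n_samples, n_folds):
    """Yield (train, test) index arrays: train on folds 1..k, test on fold k+1.

    Produces n_folds - 1 splits, since the first fold is never a test fold.
    """
    # n_folds + 1 evenly spaced boundaries over [0, n_samples]
    boundaries = np.linspace(0, n_samples, n_folds + 1, dtype=int)
    for k in range(1, n_folds):
        train = np.arange(0, boundaries[k])            # folds 1..k
        test = np.arange(boundaries[k], boundaries[k + 1])  # fold k+1
        yield train, test
```

For n_samples=10 and n_folds=5 this yields 4 splits, the last of which trains on folds 1+2+3+4 and tests on fold 5, matching the "number of folds – 1" count in the quote.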

Contributor Author:

Maybe I should explain more in the doc?

Member:

No, I understand that.

I always view n_folds as n_test_folds, which is 9 in your example and the same as n_splits. Not sure what the "right" thing is.

Maybe @jnothman @rvraghav93?

>>> htscv.get_n_splits(X)
3
>>> print(htscv) # doctest: +NORMALIZE_WHITESPACE
HomogeneousTimeSeriesCV(n_folds=4)
>>> for train_index, test_index in htscv.split(X):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]

Notes
-----
The first ``n_samples % n_folds`` folds have size
Member:

This note is confusing, to say the least.

You should mention that for the first n_samples % n_folds, the number of samples in each fold are "incremented" by n_samples // n_folds + 1.

Contributor Author:

Sorry, maybe I'm too dumb.

the number of samples in each fold are "incremented" by n_samples // n_folds + 1

Why is "incremented" used here? I think the number of samples in the first n_samples % n_folds folds is exactly n_samples // n_folds + 1, which is "incremented" by 1 compared to the other folds?
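The fold sizing under discussion can be checked numerically (a standalone sketch of the arithmetic, not the PR's code path):

```python
import numpy as np

n_samples, n_folds = 7, 3
fold_sizes = np.full(n_folds, n_samples // n_folds, dtype=int)  # [2, 2, 2]
fold_sizes[:n_samples % n_folds] += 1                           # [3, 2, 2]
# The first n_samples % n_folds == 1 fold holds n_samples // n_folds + 1 == 3
# samples; every remaining fold holds n_samples // n_folds == 2, so the first
# folds are larger by exactly 1, as the author says.
print(fold_sizes)  # [3 2 2]
```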

Member:

Oh, this is related to the previous comment. Sorry, I meant "in each split the number of samples is incremented by".

You can move this to the n_folds documentation to avoid such confusion.

``n_samples // n_folds + 1``, other folds have size
``n_samples // n_folds``, where ``n_samples`` is the number of samples.

The number of splitting iterations in this cross-validator, ``n_folds - 1``,
is not equal to that of other KFold-based cross-validators.

See also
--------
"""
def __init__(self, n_folds=3):
super(HomogeneousTimeSeriesCV, self).__init__(n_folds,
shuffle=False,
random_state=None)

def split(self, X, y=None, labels=None):
"""Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
Training data, where n_samples is the number of samples
and n_features is the number of features.

y : array-like, shape (n_samples,)
The target variable for supervised learning problems.

labels : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.

Returns
-------
train : ndarray
The training set indices for that split.

test : ndarray
The testing set indices for that split.
"""
X, y, labels = indexable(X, y, labels)
n_samples = _num_samples(X)
if self.n_folds > n_samples:
raise ValueError(
("Cannot have number of folds n_folds={0} greater"
" than the number of samples: {1}.").format(self.n_folds,
n_samples))
n_folds = self.n_folds
indices = np.arange(n_samples)
fold_sizes = (n_samples // n_folds) * np.ones(n_folds, dtype=int)
fold_sizes[:n_samples % n_folds] += 1
current = 0
for fold_size in fold_sizes:
start, stop = current, current + fold_size
if current != 0:
yield indices[:start], indices[start:stop]
current = stop

def get_n_splits(self, X=None, y=None, labels=None):
"""Returns the number of splitting iterations in the cross-validator

Parameters
----------
X : object
Always ignored, exists for compatibility.

y : object
Always ignored, exists for compatibility.

labels : object
Always ignored, exists for compatibility.

Returns
-------
n_splits : int
Returns the number of splitting iterations in the cross-validator.
"""
return self.n_folds - 1


class LeaveOneLabelOut(BaseCrossValidator):
"""Leave One Label Out cross-validator

34 changes: 34 additions & 0 deletions sklearn/model_selection/tests/test_split.py
@@ -30,6 +30,7 @@
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LabelKFold
from sklearn.model_selection import HomogeneousTimeSeriesCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeaveOneLabelOut
from sklearn.model_selection import LeavePOut
@@ -970,6 +971,39 @@ def test_label_kfold():
next, LabelKFold(n_folds=3).split(X, y, labels))


def test_homogeneous_time_series_cv():
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

# Should fail if there are more folds than samples
assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
next,
HomogeneousTimeSeriesCV(n_folds=9).split(X))

htscv = HomogeneousTimeSeriesCV(3)

# Manually check that Homogeneous Time Series CV preserves the data
# ordering on toy datasets
splits = htscv.split(X[:-1])
train, test = next(splits)
assert_array_equal(train, [0, 1])
assert_array_equal(test, [2, 3])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3])
assert_array_equal(test, [4, 5])

splits = HomogeneousTimeSeriesCV(3).split(X)
train, test = next(splits)
assert_array_equal(train, [0, 1, 2])
assert_array_equal(test, [3, 4])

train, test = next(splits)
assert_array_equal(train, [0, 1, 2, 3, 4])
assert_array_equal(test, [5, 6])

# Check get_n_splits returns the number of folds - 1
assert_equal(2, htscv.get_n_splits())

def test_nested_cv():
# Test if nested cross validation works with different combinations of cv
rng = np.random.RandomState(0)