diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index d2d542cb791b6..96baa81e8db27 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -137,7 +137,7 @@ validation iterator instead, for instance:: >>> from sklearn.model_selection import ShuffleSplit >>> n_samples = iris.data.shape[0] - >>> cv = ShuffleSplit(n_iter=3, test_size=0.3, random_state=0) + >>> cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0) >>> cross_val_score(clf, iris.data, iris.target, cv=cv) ... # doctest: +ELLIPSIS array([ 0.97..., 0.97..., 1. ]) @@ -224,7 +224,7 @@ Example of 2-fold cross-validation on a dataset with 4 samples:: >>> from sklearn.model_selection import KFold >>> X = ["a", "b", "c", "d"] - >>> kf = KFold(n_folds=2) + >>> kf = KFold(n_splits=2) >>> for train, test in kf.split(X): ... print("%s %s" % (train, test)) [2 3] [0 1] @@ -253,7 +253,7 @@ two slightly unbalanced classes:: >>> X = np.ones(10) >>> y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1] - >>> skf = StratifiedKFold(n_folds=3) + >>> skf = StratifiedKFold(n_splits=3) >>> for train, test in skf.split(X, y): ... print("%s %s" % (train, test)) [2 3 6 7 8 9] [0 1 4 5] @@ -278,7 +278,7 @@ Imagine you have three subjects, each with an associated number from 1 to 3:: >>> y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"] >>> labels = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] - >>> lkf = LabelKFold(n_folds=3) + >>> lkf = LabelKFold(n_splits=3) >>> for train, test in lkf.split(X, y, labels): ... print("%s %s" % (train, test)) [0 1 2 3 4 5] [6 7 8 9] @@ -454,7 +454,7 @@ Here is a usage example:: >>> from sklearn.model_selection import ShuffleSplit >>> X = np.arange(5) - >>> ss = ShuffleSplit(n_iter=3, test_size=0.25, + >>> ss = ShuffleSplit(n_splits=3, test_size=0.25, ... random_state=0) >>> for train_index, test_index in ss.split(X): ... print("%s %s" % (train_index, test_index)) @@ -485,7 +485,7 @@ Here is a usage example:: >>> X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001] >>> y = ["a", "b", "b", "b", "c", "c", "c", "a"] >>> labels = [1, 1, 2, 2, 3, 3, 4, 4] - >>> lss = LabelShuffleSplit(n_iter=4, test_size=0.5, random_state=0) + >>> lss = LabelShuffleSplit(n_splits=4, test_size=0.5, random_state=0) >>> for train, test in lss.split(X, y, labels): ... print("%s %s" % (train, test)) ... diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst index e61749b00c191..475e1c5e5b385 100644 --- a/doc/tutorial/statistical_inference/model_selection.rst +++ b/doc/tutorial/statistical_inference/model_selection.rst @@ -61,7 +61,7 @@ This example shows an example usage of the ``split`` method. >>> from sklearn.model_selection import KFold, cross_val_score >>> X = ["a", "a", "b", "c", "c", "c"] - >>> k_fold = KFold(n_folds=3) + >>> k_fold = KFold(n_splits=3) >>> for train_indices, test_indices in k_fold.split(X): ... print('Train: %s | test: %s' % (train_indices, test_indices)) Train: [2 3 4 5] | test: [0 1] @@ -70,7 +70,7 @@ This example shows an example usage of the ``split`` method. The cross-validation can then be performed easily:: - >>> kfold = KFold(n_folds=3) + >>> kfold = KFold(n_splits=3) >>> [svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test]) ... for train, test in k_fold.split(X_digits)] [0.93489148580968284, 0.95659432387312182, 0.93989983305509184] @@ -106,11 +106,11 @@ scoring method. 
* - - :class:`KFold` **(n_folds, shuffle, random_state)** + - :class:`KFold` **(n_splits, shuffle, random_state)** - :class:`StratifiedKFold` **(n_iter, test_size, train_size, random_state)** - - :class:`LabelKFold` **(n_folds, shuffle, random_state)** + - :class:`LabelKFold` **(n_splits, shuffle, random_state)** * diff --git a/doc/whats_new.rst b/doc/whats_new.rst index eef97a178e7bb..5b804c218596c 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -62,6 +62,17 @@ Model Selection Enhancements and API Changes the corresponding parameter is not applicable. Additionally a list of all the parameter dicts are stored at ``results_['params']``. + - **Parameters ``n_folds`` and ``n_iter`` renamed to ``n_splits``** + + Some parameter names have changed: + The ``n_folds`` parameter in :class:`model_selection.KFold`, + :class:`model_selection.LabelKFold`, and + :class:`model_selection.StratifiedKFold` is now renamed to ``n_splits``. + The ``n_iter`` parameter in :class:`model_selection.ShuffleSplit`, + :class:`model_selection.LabelShuffleSplit`, + and :class:`model_selection.StratifiedShuffleSplit` is now renamed + to ``n_splits``. + New features ............ @@ -353,6 +364,12 @@ API changes summary (`#6697 `_) by `Raghav R V`_. + - The parameters ``n_iter`` or ``n_folds`` in old CV splitters are replaced + by the new parameter ``n_splits`` since it can provide a consistent + and unambiguous interface to represent the number of train-test splits. + (`#7187 `_) + by `YenChen Lin`_. + .. currentmodule:: sklearn diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py index 39e623f261cca..dfae1ad9b8a98 100644 --- a/examples/ensemble/plot_gradient_boosting_oob.py +++ b/examples/ensemble/plot_gradient_boosting_oob.py @@ -74,14 +74,14 @@ def heldout_score(clf, X_test, y_test): return score -def cv_estimate(n_folds=3): - cv = KFold(n_folds=n_folds) +def cv_estimate(n_splits=3): + cv = KFold(n_splits=n_splits) cv_clf = ensemble.GradientBoostingClassifier(**params) val_scores = np.zeros((n_estimators,), dtype=np.float64) for train, test in cv.split(X_train, y_train): cv_clf.fit(X_train[train], y_train[train]) val_scores += heldout_score(cv_clf, X_train[test], y_train[test]) - val_scores /= n_folds + val_scores /= n_splits return val_scores diff --git a/examples/mixture/plot_gmm_covariances.py b/examples/mixture/plot_gmm_covariances.py index dbd5be50f93e1..2b4cd88642a98 100644 --- a/examples/mixture/plot_gmm_covariances.py +++ b/examples/mixture/plot_gmm_covariances.py @@ -69,7 +69,7 @@ def make_ellipses(gmm, ax): # Break up the dataset into non-overlapping training (75%) and testing # (25%) sets. -skf = StratifiedKFold(n_folds=4) +skf = StratifiedKFold(n_splits=4) # Only take the first fold. train_index, test_index = next(iter(skf.split(iris.data, iris.target))) diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py index 505ceb827338a..cb8cd87a78030 100644 --- a/examples/model_selection/plot_learning_curve.py +++ b/examples/model_selection/plot_learning_curve.py @@ -101,14 +101,14 @@ def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, title = "Learning Curves (Naive Bayes)" # Cross validation with 100 iterations to get smoother mean test and train # score curves, each time with 20% data randomly selected as a validation set. 
-cv = ShuffleSplit(n_iter=100, test_size=0.2, random_state=0) +cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0) estimator = GaussianNB() plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4) title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" # SVC is more expensive so we do a lower number of CV iterations: -cv = ShuffleSplit(n_iter=10, test_size=0.2, random_state=0) +cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) estimator = SVC(gamma=0.001) plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4) diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index b208fc3d69079..6678dcb1af8b3 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -58,7 +58,7 @@ # Classification and ROC analysis # Run classifier with cross-validation and plot ROC curves -cv = StratifiedKFold(n_folds=6) +cv = StratifiedKFold(n_splits=6) classifier = svm.SVC(kernel='linear', probability=True, random_state=random_state) diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index abbac81b18a0b..b71d6b22dc7c4 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -59,7 +59,7 @@ We should also note that small differences in scores results from the random splits of the cross-validation procedure. Those spurious variations can be -smoothed out by increasing the number of CV iterations ``n_iter`` at the +smoothed out by increasing the number of CV iterations ``n_splits`` at the expense of compute time. Increasing the value number of ``C_range`` and ``gamma_range`` steps will increase the resolution of the hyper-parameter heat map. @@ -128,7 +128,7 @@ def __call__(self, value, clip=None): C_range = np.logspace(-2, 10, 13) gamma_range = np.logspace(-9, 3, 13) param_grid = dict(gamma=gamma_range, C=C_range) -cv = StratifiedShuffleSplit(n_iter=5, test_size=0.2, random_state=42) +cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv) grid.fit(X, y) diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py index 996a0190e943a..09934c2f5d859 100644 --- a/examples/svm/plot_svm_scale_c.py +++ b/examples/svm/plot_svm_scale_c.py @@ -128,8 +128,8 @@ # To get nice curve, we need a large number of iterations to # reduce the variance grid = GridSearchCV(clf, refit=False, param_grid=param_grid, - cv=ShuffleSplit(train_size=train_size, n_iter=250, - random_state=1)) + cv=ShuffleSplit(train_size=train_size, + n_splits=250, random_state=1)) grid.fit(X, y) scores = grid.results_['test_mean_score'] diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 165d746d163ad..f378acfee1ed6 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -122,7 +122,7 @@ class LeaveOneOut(BaseCrossValidator): sample is used once as a test set (singleton) while the remaining samples form the training set. - Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_folds=n)`` and + Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and ``LeavePOut(p=1)`` where ``n`` is the number of samples. Due to the high number of test sets (which is the same as the @@ -197,7 +197,7 @@ class LeavePOut(BaseCrossValidator): samples form the training set in each iteration. 
Note: ``LeavePOut(p)`` is NOT equivalent to - ``KFold(n_folds=n_samples // p)`` which creates non-overlapping test sets. + ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. Due to the high number of iterations which grows combinatorically with the number of samples this cross-validation method can be very costly. For @@ -264,24 +264,24 @@ class _BaseKFold(with_metaclass(ABCMeta, BaseCrossValidator)): """Base class for KFold, LabelKFold, and StratifiedKFold""" @abstractmethod - def __init__(self, n_folds, shuffle, random_state): - if not isinstance(n_folds, numbers.Integral): + def __init__(self, n_splits, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): raise ValueError('The number of folds must be of Integral type. ' '%s of type %s was passed.' - % (n_folds, type(n_folds))) - n_folds = int(n_folds) + % (n_splits, type(n_splits))) + n_splits = int(n_splits) - if n_folds <= 1: + if n_splits <= 1: raise ValueError( "k-fold cross-validation requires at least one" - " train/test split by setting n_folds=2 or more," - " got n_folds={0}.".format(n_folds)) + " train/test split by setting n_splits=2 or more," + " got n_splits={0}.".format(n_splits)) if not isinstance(shuffle, bool): raise TypeError("shuffle must be True or False;" " got {0}".format(shuffle)) - self.n_folds = n_folds + self.n_splits = n_splits self.shuffle = shuffle self.random_state = random_state @@ -311,10 +311,10 @@ def split(self, X, y=None, labels=None): """ X, y, labels = indexable(X, y, labels) n_samples = _num_samples(X) - if self.n_folds > n_samples: + if self.n_splits > n_samples: raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of samples: {1}.").format(self.n_folds, + ("Cannot have number of splits n_splits={0} greater" + " than the number of samples: {1}.").format(self.n_splits, n_samples)) for train, test in super(_BaseKFold, self).split(X, y, labels): @@ -339,7 +339,7 @@ def get_n_splits(self, X=None, y=None, labels=None): n_splits : int Returns the number of splitting iterations in the cross-validator. """ - return self.n_folds + return self.n_splits class KFold(_BaseKFold): @@ -355,7 +355,7 @@ class KFold(_BaseKFold): Parameters ---------- - n_folds : int, default=3 + n_splits : int, default=3 Number of folds. Must be at least 2. shuffle : boolean, optional @@ -370,11 +370,11 @@ class KFold(_BaseKFold): >>> from sklearn.model_selection import KFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([1, 2, 3, 4]) - >>> kf = KFold(n_folds=2) + >>> kf = KFold(n_splits=2) >>> kf.get_n_splits(X) 2 >>> print(kf) # doctest: +NORMALIZE_WHITESPACE - KFold(n_folds=2, random_state=None, shuffle=False) + KFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in kf.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] @@ -384,9 +384,9 @@ class KFold(_BaseKFold): Notes ----- - The first ``n_samples % n_folds`` folds have size - ``n_samples // n_folds + 1``, other folds have size - ``n_samples // n_folds``, where ``n_samples`` is the number of samples. + The first ``n_samples % n_splits`` folds have size + ``n_samples // n_splits + 1``, other folds have size + ``n_samples // n_splits``, where ``n_samples`` is the number of samples. See also -------- @@ -398,9 +398,9 @@ class KFold(_BaseKFold): LabelKFold: K-fold iterator variant with non-overlapping labels. 
""" - def __init__(self, n_folds=3, shuffle=False, + def __init__(self, n_splits=3, shuffle=False, random_state=None): - super(KFold, self).__init__(n_folds, shuffle, random_state) + super(KFold, self).__init__(n_splits, shuffle, random_state) def _iter_test_indices(self, X, y=None, labels=None): n_samples = _num_samples(X) @@ -408,9 +408,9 @@ def _iter_test_indices(self, X, y=None, labels=None): if self.shuffle: check_random_state(self.random_state).shuffle(indices) - n_folds = self.n_folds - fold_sizes = (n_samples // n_folds) * np.ones(n_folds, dtype=np.int) - fold_sizes[:n_samples % n_folds] += 1 + n_splits = self.n_splits + fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int) + fold_sizes[:n_samples % n_splits] += 1 current = 0 for fold_size in fold_sizes: start, stop = current, current + fold_size @@ -429,7 +429,7 @@ class LabelKFold(_BaseKFold): Parameters ---------- - n_folds : int, default=3 + n_splits : int, default=3 Number of folds. Must be at least 2. Examples @@ -438,11 +438,11 @@ class LabelKFold(_BaseKFold): >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 3, 4]) >>> labels = np.array([0, 0, 2, 2]) - >>> label_kfold = LabelKFold(n_folds=2) + >>> label_kfold = LabelKFold(n_splits=2) >>> label_kfold.get_n_splits(X, y, labels) 2 >>> print(label_kfold) - LabelKFold(n_folds=2) + LabelKFold(n_splits=2) >>> for train_index, test_index in label_kfold.split(X, y, labels): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] @@ -464,8 +464,8 @@ class LabelKFold(_BaseKFold): For splitting the data according to explicit domain-specific stratification of the dataset. """ - def __init__(self, n_folds=3): - super(LabelKFold, self).__init__(n_folds, shuffle=False, + def __init__(self, n_splits=3): + super(LabelKFold, self).__init__(n_splits, shuffle=False, random_state=None) def _iter_test_indices(self, X, y, labels): @@ -475,10 +475,10 @@ def _iter_test_indices(self, X, y, labels): unique_labels, labels = np.unique(labels, return_inverse=True) n_labels = len(unique_labels) - if self.n_folds > n_labels: - raise ValueError("Cannot have number of folds n_folds=%d greater" + if self.n_splits > n_labels: + raise ValueError("Cannot have number of splits n_splits=%d greater" " than the number of labels: %d." - % (self.n_folds, n_labels)) + % (self.n_splits, n_labels)) # Weight labels by their number of occurrences n_samples_per_label = np.bincount(labels) @@ -488,7 +488,7 @@ def _iter_test_indices(self, X, y, labels): n_samples_per_label = n_samples_per_label[indices] # Total weight of each fold - n_samples_per_fold = np.zeros(self.n_folds) + n_samples_per_fold = np.zeros(self.n_splits) # Mapping from label index to fold index label_to_fold = np.zeros(len(unique_labels)) @@ -501,7 +501,7 @@ def _iter_test_indices(self, X, y, labels): indices = label_to_fold[labels] - for f in range(self.n_folds): + for f in range(self.n_splits): yield np.where(indices == f)[0] @@ -518,7 +518,7 @@ class StratifiedKFold(_BaseKFold): Parameters ---------- - n_folds : int, default=3 + n_splits : int, default=3 Number of folds. Must be at least 2. 
shuffle : boolean, optional @@ -534,11 +534,11 @@ class StratifiedKFold(_BaseKFold): >>> from sklearn.model_selection import StratifiedKFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) - >>> skf = StratifiedKFold(n_folds=2) + >>> skf = StratifiedKFold(n_splits=2) >>> skf.get_n_splits(X, y) 2 >>> print(skf) # doctest: +NORMALIZE_WHITESPACE - StratifiedKFold(n_folds=2, random_state=None, shuffle=False) + StratifiedKFold(n_splits=2, random_state=None, shuffle=False) >>> for train_index, test_index in skf.split(X, y): ... print("TRAIN:", train_index, "TEST:", test_index) ... X_train, X_test = X[train_index], X[test_index] @@ -548,13 +548,13 @@ class StratifiedKFold(_BaseKFold): Notes ----- - All the folds have size ``trunc(n_samples / n_folds)``, the last one has + All the folds have size ``trunc(n_samples / n_splits)``, the last one has the complementary. """ - def __init__(self, n_folds=3, shuffle=False, random_state=None): - super(StratifiedKFold, self).__init__(n_folds, shuffle, random_state) + def __init__(self, n_splits=3, shuffle=False, random_state=None): + super(StratifiedKFold, self).__init__(n_splits, shuffle, random_state) def _make_test_folds(self, X, y=None, labels=None): if self.shuffle: @@ -566,26 +566,26 @@ def _make_test_folds(self, X, y=None, labels=None): unique_y, y_inversed = np.unique(y, return_inverse=True) y_counts = bincount(y_inversed) min_labels = np.min(y_counts) - if np.all(self.n_folds > y_counts): + if np.all(self.n_splits > y_counts): raise ValueError("All the n_labels for individual classes" - " are less than %d folds." - % (self.n_folds)) - if self.n_folds > min_labels: + " are less than n_splits=%d." + % (self.n_splits)) + if self.n_splits > min_labels: warnings.warn(("The least populated class in y has only %d" " members, which is too few. The minimum" " number of labels for any class cannot" - " be less than n_folds=%d." - % (min_labels, self.n_folds)), Warning) + " be less than n_splits=%d." + % (min_labels, self.n_splits)), Warning) # pre-assign each sample to a test fold index using individual KFold # splitting strategies for each class so as to respect the balance of # classes # NOTE: Passing the data corresponding to ith class say X[y==class_i] # will break when the data is not 100% stratifiable for all classes. 
- # So we pass np.zeroes(max(c, n_folds)) as data to the KFold + # So we pass np.zeroes(max(c, n_splits)) as data to the KFold per_cls_cvs = [ - KFold(self.n_folds, shuffle=self.shuffle, - random_state=rng).split(np.zeros(max(count, self.n_folds))) + KFold(self.n_splits, shuffle=self.shuffle, + random_state=rng).split(np.zeros(max(count, self.n_splits))) for count in y_counts] test_folds = np.zeros(n_samples, dtype=np.int) @@ -593,7 +593,7 @@ def _make_test_folds(self, X, y=None, labels=None): for cls, (_, test_split) in zip(unique_y, per_cls_splits): cls_test_folds = test_folds[y == cls] # the test split can be too big because we used - # KFold(...).split(X[:max(c, n_folds)]) when data is not 100% + # KFold(...).split(X[:max(c, n_splits)]) when data is not 100% # stratifiable for all the classes # (we use a warning instead of raising an exception) # If this is the case, let's trim it: @@ -605,7 +605,7 @@ def _make_test_folds(self, X, y=None, labels=None): def _iter_test_masks(self, X, y=None, labels=None): test_folds = self._make_test_folds(X, y) - for i in range(self.n_folds): + for i in range(self.n_splits): yield test_folds == i def split(self, X, y, labels=None): @@ -634,6 +634,7 @@ def split(self, X, y, labels=None): """ return super(StratifiedKFold, self).split(X, y, labels) + class LeaveOneLabelOut(BaseCrossValidator): """Leave One Label Out cross-validator @@ -803,10 +804,10 @@ def get_n_splits(self, X, y, labels): class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - def __init__(self, n_iter=10, test_size=0.1, train_size=None, + def __init__(self, n_splits=10, test_size=0.1, train_size=None, random_state=None): _validate_shuffle_split_init(test_size, train_size) - self.n_iter = n_iter + self.n_splits = n_splits self.test_size = test_size self.train_size = train_size self.random_state = random_state @@ -862,7 +863,7 @@ def get_n_splits(self, X=None, y=None, labels=None): n_splits : int Returns the number of splitting iterations in the cross-validator. """ - return self.n_iter + return self.n_splits def __repr__(self): return _build_repr(self) @@ -881,7 +882,7 @@ class ShuffleSplit(BaseShuffleSplit): Parameters ---------- - n_iter : int (default 10) + n_splits : int (default 10) Number of re-shuffling & splitting iterations. test_size : float, int, or None, default 0.1 @@ -904,18 +905,18 @@ class ShuffleSplit(BaseShuffleSplit): >>> from sklearn.model_selection import ShuffleSplit >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 1, 2]) - >>> rs = ShuffleSplit(n_iter=3, test_size=.25, random_state=0) + >>> rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0) >>> rs.get_n_splits(X) 3 >>> print(rs) - ShuffleSplit(n_iter=3, random_state=0, test_size=0.25, train_size=None) + ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None) >>> for train_index, test_index in rs.split(X): ... print("TRAIN:", train_index, "TEST:", test_index) ... # doctest: +ELLIPSIS TRAIN: [3 1 0] TEST: [2] TRAIN: [2 1 3] TEST: [0] TRAIN: [0 2 1] TEST: [3] - >>> rs = ShuffleSplit(n_iter=3, train_size=0.5, test_size=.25, + >>> rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25, ... random_state=0) >>> for train_index, test_index in rs.split(X): ... 
print("TRAIN:", train_index, "TEST:", test_index) @@ -930,7 +931,7 @@ def _iter_indices(self, X, y=None, labels=None): n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, self.train_size) rng = check_random_state(self.random_state) - for i in range(self.n_iter): + for i in range(self.n_splits): # random partition permutation = rng.permutation(n_samples) ind_test = permutation[:n_test] @@ -955,7 +956,7 @@ class LabelShuffleSplit(ShuffleSplit): For example, a less computationally intensive alternative to ``LeavePLabelOut(p=10)`` would be - ``LabelShuffleSplit(test_size=10, n_iter=100)``. + ``LabelShuffleSplit(test_size=10, n_splits=100)``. Note: The parameters ``test_size`` and ``train_size`` refer to labels, and not to samples, as in ShuffleSplit. @@ -963,7 +964,7 @@ class LabelShuffleSplit(ShuffleSplit): Parameters ---------- - n_iter : int (default 5) + n_splits : int (default 5) Number of re-shuffling & splitting iterations. test_size : float (default 0.2), int, or None @@ -982,10 +983,10 @@ class LabelShuffleSplit(ShuffleSplit): Pseudo-random number generator state used for random sampling. ''' - def __init__(self, n_iter=5, test_size=0.2, train_size=None, + def __init__(self, n_splits=5, test_size=0.2, train_size=None, random_state=None): super(LabelShuffleSplit, self).__init__( - n_iter=n_iter, + n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state) @@ -1022,7 +1023,7 @@ class StratifiedShuffleSplit(BaseShuffleSplit): Parameters ---------- - n_iter : int (default 10) + n_splits : int (default 10) Number of re-shuffling & splitting iterations. test_size : float (default 0.1), int, or None @@ -1045,11 +1046,11 @@ class StratifiedShuffleSplit(BaseShuffleSplit): >>> from sklearn.model_selection import StratifiedShuffleSplit >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) - >>> sss = StratifiedShuffleSplit(n_iter=3, test_size=0.5, random_state=0) + >>> sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0) >>> sss.get_n_splits(X, y) 3 >>> print(sss) # doctest: +ELLIPSIS - StratifiedShuffleSplit(n_iter=3, random_state=0, ...) + StratifiedShuffleSplit(n_splits=3, random_state=0, ...) >>> for train_index, test_index in sss.split(X, y): ... print("TRAIN:", train_index, "TEST:", test_index) ... 
X_train, X_test = X[train_index], X[test_index] @@ -1059,10 +1060,10 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [0 2] TEST: [3 1] """ - def __init__(self, n_iter=10, test_size=0.1, train_size=None, + def __init__(self, n_splits=10, test_size=0.1, train_size=None, random_state=None): super(StratifiedShuffleSplit, self).__init__( - n_iter, test_size, train_size, random_state) + n_splits, test_size, train_size, random_state) def _iter_indices(self, X, y, labels=None): n_samples = _num_samples(X) @@ -1093,7 +1094,7 @@ def _iter_indices(self, X, y, labels=None): t_i = np.minimum(class_counts - n_i, np.round(n_test * p_i).astype(int)) - for _ in range(self.n_iter): + for _ in range(self.n_splits): train = [] test = [] diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c3365bd3a7e60..75e0d5f71cb40 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -454,7 +454,7 @@ def test_X_as_list(): y = np.array([0] * 5 + [1] * 5) clf = CheckingClassifier(check_X=lambda x: isinstance(x, list)) - cv = KFold(n_folds=3) + cv = KFold(n_splits=3) grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) grid_search.fit(X.tolist(), y).score(X, y) assert_true(hasattr(grid_search, "results_")) @@ -466,7 +466,7 @@ def test_y_as_list(): y = np.array([0] * 5 + [1] * 5) clf = CheckingClassifier(check_y=lambda x: isinstance(x, list)) - cv = KFold(n_folds=3) + cv = KFold(n_splits=3) grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) grid_search.fit(X, y.tolist()).score(X, y) assert_true(hasattr(grid_search, "results_")) @@ -597,14 +597,14 @@ def test_grid_search_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) - n_folds = 3 + n_splits = 3 n_grid_points = 6 params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]), dict(kernel=['poly', ], degree=[1, 2])] - grid_search = GridSearchCV(SVC(), cv=n_folds, iid=False, + grid_search = GridSearchCV(SVC(), cv=n_splits, iid=False, param_grid=params) grid_search.fit(X, y) - grid_search_iid = GridSearchCV(SVC(), cv=n_folds, iid=True, + grid_search_iid = GridSearchCV(SVC(), cv=n_splits, iid=True, param_grid=params) grid_search_iid.fit(X, y) @@ -645,14 +645,15 @@ def test_random_search_results(): # scipy.stats dists now supports `seed` but we still support scipy 0.12 # which doesn't support the seed. Hence the assertions in the test for # random_search alone should not depend on randomization. 
- n_folds = 3 + n_splits = 3 n_search_iter = 30 params = dict(C=expon(scale=10), gamma=expon(scale=0.1)) - random_search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_folds, + random_search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, + cv=n_splits, iid=False, param_distributions=params) random_search.fit(X, y) random_search_iid = RandomizedSearchCV(SVC(), n_iter=n_search_iter, - cv=n_folds, iid=True, + cv=n_splits, iid=True, param_distributions=params) random_search_iid.fit(X, y) @@ -779,22 +780,22 @@ def test_search_results_none_param(): def test_grid_search_correct_score_results(): # test that correct scores are used - n_folds = 3 + n_splits = 3 clf = LinearSVC(random_state=0) X, y = make_blobs(random_state=0, centers=2) Cs = [.1, 1, 10] for score in ['f1', 'roc_auc']: - grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_folds) + grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits) results = grid_search.fit(X, y).results_ # Test scorer names result_keys = list(results.keys()) expected_keys = (("test_mean_score", "test_rank_score") + tuple("test_split%d_score" % cv_i - for cv_i in range(n_folds))) + for cv_i in range(n_splits))) assert_true(all(in1d(expected_keys, result_keys))) - cv = StratifiedKFold(n_folds=n_folds) + cv = StratifiedKFold(n_splits=n_splits) n_splits = grid_search.n_splits_ for candidate_i, C in enumerate(Cs): clf.set_params(C=C) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 69749f8e4c0aa..89b227efcee94 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -132,9 +132,9 @@ def get_params(self, deep=False): def test_cross_validator_with_default_params(): n_samples = 4 n_unique_labels = 4 - n_folds = 2 + n_splits = 2 p = 2 - n_iter = 10 # (the default value) + n_shuffle_splits = 10 # (the default value) X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) X_1d = np.array([1, 2, 3, 4]) @@ -142,8 +142,8 @@ def test_cross_validator_with_default_params(): labels = np.array([1, 2, 3, 4]) loo = LeaveOneOut() lpo = LeavePOut(p) - kf = KFold(n_folds) - skf = StratifiedKFold(n_folds) + kf = KFold(n_splits) + skf = StratifiedKFold(n_splits) lolo = LeaveOneLabelOut() lopo = LeavePLabelOut(p) ss = ShuffleSplit(random_state=0) @@ -151,23 +151,24 @@ def test_cross_validator_with_default_params(): loo_repr = "LeaveOneOut()" lpo_repr = "LeavePOut(p=2)" - kf_repr = "KFold(n_folds=2, random_state=None, shuffle=False)" - skf_repr = "StratifiedKFold(n_folds=2, random_state=None, shuffle=False)" + kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" + skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" lolo_repr = "LeaveOneLabelOut()" lopo_repr = "LeavePLabelOut(n_labels=2)" - ss_repr = ("ShuffleSplit(n_iter=10, random_state=0, test_size=0.1, " + ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, " "train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" - n_splits = [n_samples, comb(n_samples, p), n_folds, n_folds, - n_unique_labels, comb(n_unique_labels, p), n_iter, 2] + n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits, + n_unique_labels, comb(n_unique_labels, p), + n_shuffle_splits, 2] for i, (cv, cv_repr) in enumerate(zip( [loo, lpo, kf, skf, lolo, lopo, ss, ps], [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr, ss_repr, ps_repr])): # Test if get_n_splits works correctly - assert_equal(n_splits[i], cv.get_n_splits(X, y, labels)) 
+ assert_equal(n_splits_expected[i], cv.get_n_splits(X, y, labels)) # Test if the cross-validator works as expected even if # the data is 1d @@ -194,13 +195,13 @@ def check_valid_split(train, test, n_samples=None): assert_equal(train.union(test), set(range(n_samples))) -def check_cv_coverage(cv, X, y, labels, expected_n_iter=None): +def check_cv_coverage(cv, X, y, labels, expected_n_splits=None): n_samples = _num_samples(X) # Check that a all the samples appear at least once in a test fold - if expected_n_iter is not None: - assert_equal(cv.get_n_splits(X, y, labels), expected_n_iter) + if expected_n_splits is not None: + assert_equal(cv.get_n_splits(X, y, labels), expected_n_splits) else: - expected_n_iter = cv.get_n_splits(X, y, labels) + expected_n_splits = cv.get_n_splits(X, y, labels) collected_test_samples = set() iterations = 0 @@ -210,7 +211,7 @@ def check_cv_coverage(cv, X, y, labels, expected_n_iter=None): collected_test_samples.update(test) # Check that the accumulated test samples cover the whole dataset - assert_equal(iterations, expected_n_iter) + assert_equal(iterations, expected_n_splits) if n_samples is not None: assert_equal(collected_test_samples, set(range(n_samples))) @@ -234,10 +235,10 @@ def test_kfold_valueerrors(): # side of the split at each split with warnings.catch_warnings(): warnings.simplefilter("ignore") - check_cv_coverage(skf_3, X2, y, labels=None, expected_n_iter=3) + check_cv_coverage(skf_3, X2, y, labels=None, expected_n_splits=3) # Check that errors are raised if all n_labels for individual - # classes are less than n_folds. + # classes are less than n_splits. y = np.array([3, 3, -1, -1, 2]) assert_raises(ValueError, next, skf_3.split(X2, y)) @@ -252,27 +253,27 @@ def test_kfold_valueerrors(): assert_raise_message(ValueError, error_string, StratifiedKFold, 1) - # When n_folds is not integer: + # When n_splits is not integer: assert_raises(ValueError, KFold, 1.5) assert_raises(ValueError, KFold, 2.0) assert_raises(ValueError, StratifiedKFold, 1.5) assert_raises(ValueError, StratifiedKFold, 2.0) # When shuffle is not a bool: - assert_raises(TypeError, KFold, n_folds=4, shuffle=None) + assert_raises(TypeError, KFold, n_splits=4, shuffle=None) def test_kfold_indices(): # Check all indices are returned in the test folds X1 = np.ones(18) kf = KFold(3) - check_cv_coverage(kf, X1, y=None, labels=None, expected_n_iter=3) + check_cv_coverage(kf, X1, y=None, labels=None, expected_n_splits=3) # Check all indices are returned in the test folds even when equal-sized # folds are not possible X2 = np.ones(17) kf = KFold(3) - check_cv_coverage(kf, X2, y=None, labels=None, expected_n_iter=3) + check_cv_coverage(kf, X2, y=None, labels=None, expected_n_splits=3) # Check if get_n_splits returns the number of folds assert_equal(5, KFold(5).get_n_splits(X2)) @@ -441,7 +442,7 @@ def test_shuffle_stratifiedkfold(): for (_, test0), (_, test1) in zip(kf0.split(X_40, y), kf1.split(X_40, y)): assert_not_equal(set(test0), set(test1)) - check_cv_coverage(kf0, X_40, y, labels=None, expected_n_iter=5) + check_cv_coverage(kf0, X_40, y, labels=None, expected_n_splits=5) def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 @@ -456,9 +457,9 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 X, y = digits.data[:600], digits.target[:600] model = SVC(C=10, gamma=0.005) - n_folds = 3 + n_splits = 3 - cv = KFold(n_folds=n_folds, shuffle=False) + cv = KFold(n_splits=n_splits, shuffle=False) mean_score = cross_val_score(model, X, y, cv=cv).mean() 
assert_greater(0.92, mean_score) assert_greater(mean_score, 0.80) @@ -467,11 +468,11 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 # overfitting of the model with regards to the writing style of the authors # by yielding a seriously overestimated score: - cv = KFold(n_folds, shuffle=True, random_state=0) + cv = KFold(n_splits, shuffle=True, random_state=0) mean_score = cross_val_score(model, X, y, cv=cv).mean() assert_greater(mean_score, 0.92) - cv = KFold(n_folds, shuffle=True, random_state=1) + cv = KFold(n_splits, shuffle=True, random_state=1) mean_score = cross_val_score(model, X, y, cv=cv).mean() assert_greater(mean_score, 0.92) @@ -482,7 +483,7 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 # the estimated mean score is close to the score measured with # non-shuffled KFold - cv = StratifiedKFold(n_folds) + cv = StratifiedKFold(n_splits) mean_score = cross_val_score(model, X, y, cv=cv).mean() assert_greater(0.93, mean_score) assert_greater(mean_score, 0.80) @@ -562,7 +563,7 @@ def test_stratified_shuffle_split_even(): # Test the StratifiedShuffleSplit, indices are drawn with a # equal chance n_folds = 5 - n_iter = 1000 + n_splits = 1000 def assert_counts_are_ok(idx_counts, p): # Here we test that the distribution of the counts @@ -577,19 +578,19 @@ def assert_counts_are_ok(idx_counts, p): for n_samples in (6, 22): labels = np.array((n_samples // 2) * [0, 1]) - splits = StratifiedShuffleSplit(n_iter=n_iter, + splits = StratifiedShuffleSplit(n_splits=n_splits, test_size=1. / n_folds, random_state=0) train_counts = [0] * n_samples test_counts = [0] * n_samples - n_splits = 0 + n_splits_actual = 0 for train, test in splits.split(X=np.ones(n_samples), y=labels): - n_splits += 1 + n_splits_actual += 1 for counter, ids in [(train_counts, train), (test_counts, test)]: for id in ids: counter[id] += 1 - assert_equal(n_splits, n_iter) + assert_equal(n_splits_actual, n_splits) n_train, n_test = _validate_shuffle_split(n_samples, test_size=1./n_folds, @@ -616,10 +617,10 @@ def test_stratified_shuffle_split_overlap_train_test_bug(): y = [0, 1, 2, 3] * 3 + [4, 5] * 5 X = np.ones_like(y) - splits = StratifiedShuffleSplit(n_iter=1, - test_size=0.5, random_state=0) + sss = StratifiedShuffleSplit(n_splits=1, + test_size=0.5, random_state=0) - train, test = next(iter(splits.split(X=X, y=y))) + train, test = next(iter(sss.split(X=X, y=y))) assert_array_equal(np.intersect1d(train, test), []) @@ -653,15 +654,15 @@ def test_label_shuffle_split(): for l in labels: X = y = np.ones(len(l)) - n_iter = 6 + n_splits = 6 test_size = 1./3 - slo = LabelShuffleSplit(n_iter, test_size=test_size, random_state=0) + slo = LabelShuffleSplit(n_splits, test_size=test_size, random_state=0) # Make sure the repr works repr(slo) # Test that the length is correct - assert_equal(slo.get_n_splits(X, y, labels=l), n_iter) + assert_equal(slo.get_n_splits(X, y, labels=l), n_splits) l_unique = np.unique(l) @@ -906,7 +907,7 @@ def test_label_kfold(): # Parameters of the test n_labels = 15 n_samples = 1000 - n_folds = 5 + n_splits = 5 X = y = np.ones(n_samples) @@ -914,12 +915,12 @@ def test_label_kfold(): tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) - ideal_n_labels_per_fold = n_samples // n_folds + ideal_n_labels_per_fold = n_samples // n_splits len(np.unique(labels)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) - lkf = LabelKFold(n_folds=n_folds) + lkf = 
LabelKFold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, labels)): folds[test] = i @@ -949,9 +950,9 @@ def test_label_kfold(): n_labels = len(np.unique(labels)) n_samples = len(labels) - n_folds = 5 + n_splits = 5 tolerance = 0.05 * n_samples # 5 percent error allowed - ideal_n_labels_per_fold = n_samples // n_folds + ideal_n_labels_per_fold = n_samples // n_splits X = y = np.ones(n_samples) @@ -980,8 +981,8 @@ def test_label_kfold(): # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(labels)) - assert_raises_regexp(ValueError, "Cannot have number of folds.*greater", - next, LabelKFold(n_folds=3).split(X, y, labels)) + assert_raises_regexp(ValueError, "Cannot have number of splits.*greater", + next, LabelKFold(n_splits=3).split(X, y, labels)) def test_nested_cv(): @@ -992,7 +993,7 @@ def test_nested_cv(): labels = rng.randint(0, 5, 15) cvs = [LeaveOneLabelOut(), LeaveOneOut(), LabelKFold(), StratifiedKFold(), - StratifiedShuffleSplit(n_iter=3, random_state=0)] + StratifiedShuffleSplit(n_splits=3, random_state=0)] for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]}, diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 2e694fd45e59a..62a86000562f6 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -136,7 +136,8 @@ def _is_training_data(self, X): X = np.ones((10, 2)) X_sparse = coo_matrix(X) y = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) -# The number of samples per class needs to be > n_folds, for StratifiedKFold(3) +# The number of samples per class needs to be > n_splits, +# for StratifiedKFold(n_splits=3) y2 = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 3]) @@ -701,7 +702,7 @@ def test_learning_curve_with_boolean_indices(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingEstimator(20) - cv = KFold(n_folds=3) + cv = KFold(n_splits=3) train_sizes, train_scores, test_scores = learning_curve( estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10)) assert_array_equal(train_sizes, np.linspace(2, 20, 10))
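
For reference, a minimal usage sketch of the renamed parameter (illustrative only, not part of the patch): after this change the fold-based splitters that previously took ``n_folds`` and the shuffle-based splitters that previously took ``n_iter`` both accept ``n_splits``, and ``get_n_splits`` reports that value. All names used below (``KFold``, ``ShuffleSplit``, ``n_splits``, ``test_size``, ``random_state``, ``split``, ``get_n_splits``) appear in the patch itself.

    import numpy as np
    from sklearn.model_selection import KFold, ShuffleSplit

    X = np.arange(10).reshape(5, 2)

    # Previously KFold(n_folds=2); the fold-based splitters now take n_splits.
    kf = KFold(n_splits=2)
    for train, test in kf.split(X):
        print("KFold train/test:", train, test)

    # Previously ShuffleSplit(n_iter=3); the shuffle-based splitters also take n_splits.
    ss = ShuffleSplit(n_splits=3, test_size=0.4, random_state=0)
    for train, test in ss.split(X):
        print("ShuffleSplit train/test:", train, test)

    # get_n_splits returns the configured number of splits for both families.
    print(kf.get_n_splits(X), ss.get_n_splits(X))

A single shared ``n_splits`` name is what allows ``get_n_splits`` to be defined uniformly in ``_BaseKFold`` and ``BaseShuffleSplit`` (both simply return ``self.n_splits``), rather than dispatching on ``n_folds`` versus ``n_iter``.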