From eb4d1796bfd6b7cccd28d134cb66d391e50861d8 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 13:20:58 +0100
Subject: [PATCH 01/14] Remove deprecated (0.18) cross_validation.py in favor of model_selection

---
 sklearn/__init__.py                    |   18 +-
 sklearn/cross_validation.py            | 2075 ------------------------
 sklearn/tests/test_cross_validation.py | 1252 --------------
 3 files changed, 9 insertions(+), 3336 deletions(-)
 delete mode 100644 sklearn/cross_validation.py
 delete mode 100644 sklearn/tests/test_cross_validation.py

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index c45728106ad53..27879e16be363 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -135,15 +135,15 @@ def config_context(**new_config):
 __check_build  # avoid flakes unused variable error
 
 __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
-           'cross_validation', 'datasets', 'decomposition', 'dummy',
-           'ensemble', 'exceptions', 'externals', 'feature_extraction',
-           'feature_selection', 'gaussian_process', 'grid_search',
-           'isotonic', 'kernel_approximation', 'kernel_ridge',
-           'learning_curve', 'linear_model', 'manifold', 'metrics',
-           'mixture', 'model_selection', 'multiclass', 'multioutput',
-           'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
-           'preprocessing', 'random_projection', 'semi_supervised',
-           'svm', 'tree', 'discriminant_analysis',
+           'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions',
+           'externals', 'feature_extraction', 'feature_selection',
+           'gaussian_process', 'grid_search', 'isotonic',
+           'kernel_approximation', 'kernel_ridge', 'learning_curve',
+           'linear_model', 'manifold', 'metrics', 'mixture',
+           'model_selection', 'multiclass', 'multioutput', 'naive_bayes',
+           'neighbors', 'neural_network', 'pipeline', 'preprocessing',
+           'random_projection', 'semi_supervised', 'svm', 'tree',
+           'discriminant_analysis',
            # Non-modules:
            'clone']
 
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
deleted file mode 100644
index 7646459da3936..0000000000000
--- a/sklearn/cross_validation.py
+++ /dev/null
@@ -1,2075 +0,0 @@
-"""
-The :mod:`sklearn.cross_validation` module includes utilities for cross-
-validation and performance evaluation.
-"""
-
-# Author: Alexandre Gramfort ,
-#         Gael Varoquaux ,
-#         Olivier Grisel
-# License: BSD 3 clause
-
-from __future__ import print_function
-from __future__ import division
-
-import warnings
-from itertools import chain, combinations
-from math import ceil, floor, factorial
-import numbers
-import time
-from abc import ABCMeta, abstractmethod
-
-import numpy as np
-import scipy.sparse as sp
-
-from .base import is_classifier, clone
-from .utils import indexable, check_random_state, safe_indexing
-from .utils.validation import (_is_arraylike, _num_samples,
-                               column_or_1d)
-from .utils.multiclass import type_of_target
-from .externals.joblib import Parallel, delayed, logger
-from .externals.six import with_metaclass
-from .externals.six.moves import zip
-from .metrics.scorer import check_scoring
-from .gaussian_process.kernels import Kernel as GPKernel
-from .exceptions import FitFailedWarning
-
-
-warnings.warn("This module was deprecated in version 0.18 in favor of the "
-              "model_selection module into which all the refactored classes "
-              "and functions are moved. Also note that the interface of the "
-              "new CV iterators are different from that of this module. "
-              "This module will be removed in 0.20.", DeprecationWarning)
-
-
-__all__ = ['KFold',
-           'LabelKFold',
-           'LeaveOneLabelOut',
-           'LeaveOneOut',
-           'LeavePLabelOut',
-           'LeavePOut',
-           'ShuffleSplit',
-           'StratifiedKFold',
-           'StratifiedShuffleSplit',
-           'PredefinedSplit',
-           'LabelShuffleSplit',
-           'check_cv',
-           'cross_val_score',
-           'cross_val_predict',
-           'permutation_test_score',
-           'train_test_split']
-
-
-class _PartitionIterator(with_metaclass(ABCMeta)):
-    """Base class for CV iterators where train_mask = ~test_mask
-
-    Implementations must define `_iter_test_masks` or `_iter_test_indices`.
-
-    Parameters
-    ----------
-    n : int
-        Total number of elements in dataset.
-    """
-
-    def __init__(self, n):
-        if abs(n - int(n)) >= np.finfo('f').eps:
-            raise ValueError("n must be an integer")
-        self.n = int(n)
-
-    def __iter__(self):
-        ind = np.arange(self.n)
-        for test_index in self._iter_test_masks():
-            train_index = np.logical_not(test_index)
-            train_index = ind[train_index]
-            test_index = ind[test_index]
-            yield train_index, test_index
-
-    # Since subclasses must implement either _iter_test_masks or
-    # _iter_test_indices, neither can be abstract.
-    def _iter_test_masks(self):
-        """Generates boolean masks corresponding to test sets.
-
-        By default, delegates to _iter_test_indices()
-        """
-        for test_index in self._iter_test_indices():
-            test_mask = self._empty_mask()
-            test_mask[test_index] = True
-            yield test_mask
-
-    def _iter_test_indices(self):
-        """Generates integer indices corresponding to test sets."""
-        raise NotImplementedError
-
-    def _empty_mask(self):
-        return np.zeros(self.n, dtype=np.bool)
-
-
-class LeaveOneOut(_PartitionIterator):
-    """Leave-One-Out cross validation iterator.
-
-    .. deprecated:: 0.18
-        This module will be removed in 0.20.
-        Use :class:`sklearn.model_selection.LeaveOneOut` instead.
-
-    Provides train/test indices to split data in train test sets. Each
-    sample is used once as a test set (singleton) while the remaining
-    samples form the training set.
-
-    Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and
-    ``LeavePOut(n, p=1)``.
-
-    Due to the high number of test sets (which is the same as the
-    number of samples) this cross validation method can be very costly.
-    For large datasets one should favor KFold, StratifiedKFold or
-    ShuffleSplit.
-
-    Read more in the :ref:`User Guide `.
-
-    Parameters
-    ----------
-    n : int
-        Total number of elements in dataset.
-
-    Examples
-    --------
-    >>> from sklearn import cross_validation
-    >>> X = np.array([[1, 2], [3, 4]])
-    >>> y = np.array([1, 2])
-    >>> loo = cross_validation.LeaveOneOut(2)
-    >>> len(loo)
-    2
-    >>> print(loo)
-    sklearn.cross_validation.LeaveOneOut(n=2)
-    >>> for train_index, test_index in loo:
-    ...    print("TRAIN:", train_index, "TEST:", test_index)
-    ...    X_train, X_test = X[train_index], X[test_index]
-    ...    y_train, y_test = y[train_index], y[test_index]
-    ...    print(X_train, X_test, y_train, y_test)
-    TRAIN: [1] TEST: [0]
-    [[3 4]] [[1 2]] [2] [1]
-    TRAIN: [0] TEST: [1]
-    [[1 2]] [[3 4]] [1] [2]
-
-    See also
-    --------
-    LeaveOneLabelOut for splitting the data according to explicit,
-    domain-specific stratification of the dataset.
-    """
-
-    def _iter_test_indices(self):
-        return range(self.n)
-
-    def __repr__(self):
-        return '%s.%s(n=%i)' % (
-            self.__class__.__module__,
-            self.__class__.__name__,
-            self.n,
-        )
-
-    def __len__(self):
-        return self.n
-
-
-class LeavePOut(_PartitionIterator):
-    """Leave-P-Out cross validation iterator
-
-    .. deprecated:: 0.18
-        This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeavePOut` instead. - - Provides train/test indices to split data in train test sets. This results - in testing on all distinct samples of size p, while the remaining n - p - samples form the training set in each iteration. - - Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)`` - which creates non-overlapping test sets. - - Due to the high number of iterations which grows combinatorically with the - number of samples this cross validation method can be very costly. For - large datasets one should favor KFold, StratifiedKFold or ShuffleSplit. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n : int - Total number of elements in dataset. - - p : int - Size of the test sets. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 3, 4]) - >>> lpo = cross_validation.LeavePOut(4, 2) - >>> len(lpo) - 6 - >>> print(lpo) - sklearn.cross_validation.LeavePOut(n=4, p=2) - >>> for train_index, test_index in lpo: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [1 2] TEST: [0 3] - TRAIN: [0 3] TEST: [1 2] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 1] TEST: [2 3] - """ - - def __init__(self, n, p): - super(LeavePOut, self).__init__(n) - self.p = p - - def _iter_test_indices(self): - for comb in combinations(range(self.n), self.p): - yield np.array(comb) - - def __repr__(self): - return '%s.%s(n=%i, p=%i)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.p, - ) - - def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) - / factorial(self.p)) - - -class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): - """Base class to validate KFold approaches""" - - @abstractmethod - def __init__(self, n, n_folds, shuffle, random_state): - super(_BaseKFold, self).__init__(n) - - if abs(n_folds - int(n_folds)) >= np.finfo('f').eps: - raise ValueError("n_folds must be an integer") - self.n_folds = n_folds = int(n_folds) - - if n_folds <= 1: - raise ValueError( - "k-fold cross validation requires at least one" - " train / test split by setting n_folds=2 or more," - " got n_folds={0}.".format(n_folds)) - if n_folds > self.n: - raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of samples: {1}.").format(n_folds, n)) - - if not isinstance(shuffle, bool): - raise TypeError("shuffle must be True or False;" - " got {0}".format(shuffle)) - self.shuffle = shuffle - self.random_state = random_state - - -class KFold(_BaseKFold): - """K-Folds cross validation iterator. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.KFold` instead. - - Provides train/test indices to split data in train test sets. Split - dataset into k consecutive folds (without shuffling by default). - - Each fold is then used as a validation set once while the k - 1 remaining - fold(s) form the training set. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n : int - Total number of elements. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle the data before splitting into batches. 
- - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``shuffle`` == True. - - Examples - -------- - >>> from sklearn.cross_validation import KFold - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([1, 2, 3, 4]) - >>> kf = KFold(4, n_folds=2) - >>> len(kf) - 2 - >>> print(kf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.KFold(n=4, n_folds=2, shuffle=False, - random_state=None) - >>> for train_index, test_index in kf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [0 1] TEST: [2 3] - - Notes - ----- - The first n % n_folds folds have size n // n_folds + 1, other folds have - size n // n_folds. - - See also - -------- - StratifiedKFold take label information into account to avoid building - folds with imbalanced class distributions (for binary or multiclass - classification tasks). - - LabelKFold: K-fold iterator variant with non-overlapping labels. - """ - - def __init__(self, n, n_folds=3, shuffle=False, - random_state=None): - super(KFold, self).__init__(n, n_folds, shuffle, random_state) - self.idxs = np.arange(n) - if shuffle: - rng = check_random_state(self.random_state) - rng.shuffle(self.idxs) - - def _iter_test_indices(self): - n = self.n - n_folds = self.n_folds - fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int) - fold_sizes[:n % n_folds] += 1 - current = 0 - for fold_size in fold_sizes: - start, stop = current, current + fold_size - yield self.idxs[start:stop] - current = stop - - def __repr__(self): - return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class LabelKFold(_BaseKFold): - """K-fold iterator variant with non-overlapping labels. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.GroupKFold` instead. - - The same label will not appear in two different folds (the number of - distinct labels has to be at least equal to the number of folds). - - The folds are approximately balanced in the sense that the number of - distinct labels is approximately the same in each fold. - - .. versionadded:: 0.17 - - Parameters - ---------- - labels : array-like with shape (n_samples, ) - Contains a label for each sample. - The folds are built so that the same label does not appear in two - different folds. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - Examples - -------- - >>> from sklearn.cross_validation import LabelKFold - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 3, 4]) - >>> labels = np.array([0, 0, 2, 2]) - >>> label_kfold = LabelKFold(labels, n_folds=2) - >>> len(label_kfold) - 2 - >>> print(label_kfold) - sklearn.cross_validation.LabelKFold(n_labels=4, n_folds=2) - >>> for train_index, test_index in label_kfold: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - ... 
- TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] - TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [3 4] [1 2] - - See also - -------- - LeaveOneLabelOut for splitting the data according to explicit, - domain-specific stratification of the dataset. - """ - def __init__(self, labels, n_folds=3): - super(LabelKFold, self).__init__(len(labels), n_folds, - shuffle=False, random_state=None) - - unique_labels, labels = np.unique(labels, return_inverse=True) - n_labels = len(unique_labels) - - if n_folds > n_labels: - raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of labels: {1}.").format(n_folds, - n_labels)) - - # Weight labels by their number of occurrences - n_samples_per_label = np.bincount(labels) - - # Distribute the most frequent labels first - indices = np.argsort(n_samples_per_label)[::-1] - n_samples_per_label = n_samples_per_label[indices] - - # Total weight of each fold - n_samples_per_fold = np.zeros(n_folds) - - # Mapping from label index to fold index - label_to_fold = np.zeros(len(unique_labels)) - - # Distribute samples by adding the largest weight to the lightest fold - for label_index, weight in enumerate(n_samples_per_label): - lightest_fold = np.argmin(n_samples_per_fold) - n_samples_per_fold[lightest_fold] += weight - label_to_fold[indices[label_index]] = lightest_fold - - self.idxs = label_to_fold[labels] - - def _iter_test_indices(self): - for f in range(self.n_folds): - yield np.where(self.idxs == f)[0] - - def __repr__(self): - return '{0}.{1}(n_labels={2}, n_folds={3})'.format( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.n_folds, - ) - - def __len__(self): - return self.n_folds - - -class StratifiedKFold(_BaseKFold): - """Stratified K-Folds cross validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.StratifiedKFold` instead. - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a variation of KFold that - returns stratified folds. The folds are made by preserving - the percentage of samples for each class. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y : array-like, [n_samples] - Samples to split in K folds. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle each stratification of the data before splitting - into batches. - - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``shuffle`` == True. - - Examples - -------- - >>> from sklearn.cross_validation import StratifiedKFold - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> skf = StratifiedKFold(y, n_folds=2) - >>> len(skf) - 2 - >>> print(skf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.StratifiedKFold(labels=[0 0 1 1], n_folds=2, - shuffle=False, random_state=None) - >>> for train_index, test_index in skf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [0 2] TEST: [1 3] - - Notes - ----- - All the folds have size trunc(n_samples / n_folds), the last one has the - complementary. - - See also - -------- - LabelKFold: K-fold iterator variant with non-overlapping labels. - """ - - def __init__(self, y, n_folds=3, shuffle=False, - random_state=None): - super(StratifiedKFold, self).__init__( - len(y), n_folds, shuffle, random_state) - y = np.asarray(y) - n_samples = y.shape[0] - unique_labels, y_inversed = np.unique(y, return_inverse=True) - label_counts = np.bincount(y_inversed) - min_labels = np.min(label_counts) - if np.all(self.n_folds > label_counts): - raise ValueError("All the n_labels for individual classes" - " are less than %d folds." - % (self.n_folds)) - if self.n_folds > min_labels: - warnings.warn(("The least populated class in y has only %d" - " members, which is too few. The minimum" - " number of labels for any class cannot" - " be less than n_folds=%d." - % (min_labels, self.n_folds)), Warning) - - # don't want to use the same seed in each label's shuffle - if self.shuffle: - rng = check_random_state(self.random_state) - else: - rng = self.random_state - - # pre-assign each sample to a test fold index using individual KFold - # splitting strategies for each label so as to respect the - # balance of labels - per_label_cvs = [ - KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle, - random_state=rng) for c in label_counts] - test_folds = np.zeros(n_samples, dtype=np.int) - for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)): - for label, (_, test_split) in zip(unique_labels, per_label_splits): - label_test_folds = test_folds[y == label] - # the test split can be too big because we used - # KFold(max(c, self.n_folds), self.n_folds) instead of - # KFold(c, self.n_folds) to make it possible to not crash even - # if the data is not 100% stratifiable for all the labels - # (we use a warning instead of raising an exception) - # If this is the case, let's trim it: - test_split = test_split[test_split < len(label_test_folds)] - label_test_folds[test_split] = test_fold_idx - test_folds[y == label] = label_test_folds - - self.test_folds = test_folds - self.y = y - - def _iter_test_masks(self): - for i in range(self.n_folds): - yield self.test_folds == i - - def __repr__(self): - return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.y, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class LeaveOneLabelOut(_PartitionIterator): - """Leave-One-Label_Out cross-validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.LeaveOneGroupOut` instead. - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. 
- - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 1, 2]) - >>> labels = np.array([1, 1, 2, 2]) - >>> lol = cross_validation.LeaveOneLabelOut(labels) - >>> len(lol) - 2 - >>> print(lol) - sklearn.cross_validation.LeaveOneLabelOut(labels=[1 1 2 2]) - >>> for train_index, test_index in lol: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [1 2] [1 2] - TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [1 2] - - See also - -------- - LabelKFold: K-fold iterator variant with non-overlapping labels. - """ - - def __init__(self, labels): - super(LeaveOneLabelOut, self).__init__(len(labels)) - # We make a copy of labels to avoid side-effects during iteration - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - - def _iter_test_masks(self): - for i in self.unique_labels: - yield self.labels == i - - def __repr__(self): - return '%s.%s(labels=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - ) - - def __len__(self): - return self.n_unique_labels - - -class LeavePLabelOut(_PartitionIterator): - """Leave-P-Label_Out cross-validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.LeavePGroupsOut` instead. - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - The difference between LeavePLabelOut and LeaveOneLabelOut is that - the former builds the test sets with all the samples assigned to - ``p`` different values of the labels while the latter uses samples - all assigned the same labels. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. - - p : int - Number of samples to leave out in the test split. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6]]) - >>> y = np.array([1, 2, 1]) - >>> labels = np.array([1, 2, 3]) - >>> lpl = cross_validation.LeavePLabelOut(labels, p=2) - >>> len(lpl) - 3 - >>> print(lpl) - sklearn.cross_validation.LeavePLabelOut(labels=[1 2 3], p=2) - >>> for train_index, test_index in lpl: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - TRAIN: [2] TEST: [0 1] - [[5 6]] [[1 2] - [3 4]] [1] [1 2] - TRAIN: [1] TEST: [0 2] - [[3 4]] [[1 2] - [5 6]] [2] [1 1] - TRAIN: [0] TEST: [1 2] - [[1 2]] [[3 4] - [5 6]] [1] [2 1] - - See also - -------- - LabelKFold: K-fold iterator variant with non-overlapping labels. 
- """ - - def __init__(self, labels, p): - # We make a copy of labels to avoid side-effects during iteration - super(LeavePLabelOut, self).__init__(len(labels)) - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - self.p = p - - def _iter_test_masks(self): - comb = combinations(range(self.n_unique_labels), self.p) - for idx in comb: - test_index = self._empty_mask() - idx = np.array(idx) - for l in self.unique_labels[idx]: - test_index[self.labels == l] = True - yield test_index - - def __repr__(self): - return '%s.%s(labels=%s, p=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - self.p, - ) - - def __len__(self): - return int(factorial(self.n_unique_labels) / - factorial(self.n_unique_labels - self.p) / - factorial(self.p)) - - -class BaseShuffleSplit(with_metaclass(ABCMeta)): - """Base class for ShuffleSplit and StratifiedShuffleSplit""" - - def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, - random_state=None): - self.n = n - self.n_iter = n_iter - self.test_size = test_size - self.train_size = train_size - self.random_state = random_state - self.n_train, self.n_test = _validate_shuffle_split(n, test_size, - train_size) - - def __iter__(self): - for train, test in self._iter_indices(): - yield train, test - return - - @abstractmethod - def _iter_indices(self): - """Generate (train, test) indices""" - - -class ShuffleSplit(BaseShuffleSplit): - """Random permutation cross-validation iterator. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.ShuffleSplit` instead. - - Yields indices to split data into training and test sets. - - Note: contrary to other cross-validation strategies, random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n : int - Total number of elements in the dataset. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Examples - -------- - >>> from sklearn import cross_validation - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... test_size=.25, random_state=0) - >>> len(rs) - 3 - >>> print(rs) - ... # doctest: +ELLIPSIS - ShuffleSplit(4, n_iter=3, test_size=0.25, ...) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... 
- TRAIN: [3 1 0] TEST: [2] - TRAIN: [2 1 3] TEST: [0] - TRAIN: [0 2 1] TEST: [3] - - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... train_size=0.5, test_size=.25, random_state=0) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... - TRAIN: [3 1] TEST: [2] - TRAIN: [2 1] TEST: [0] - TRAIN: [0 2] TEST: [3] - - """ - - def _iter_indices(self): - rng = check_random_state(self.random_state) - for i in range(self.n_iter): - # random partition - permutation = rng.permutation(self.n) - ind_test = permutation[:self.n_test] - ind_train = permutation[self.n_test:self.n_test + self.n_train] - yield ind_train, ind_test - - def __repr__(self): - return ('%s(%d, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.n, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - -def _validate_shuffle_split(n, test_size, train_size): - if test_size is None and train_size is None: - raise ValueError( - 'test_size and train_size can not both be None') - - if test_size is not None: - if np.asarray(test_size).dtype.kind == 'f': - if test_size >= 1.: - raise ValueError( - 'test_size=%f should be smaller ' - 'than 1.0 or be an integer' % test_size) - elif np.asarray(test_size).dtype.kind == 'i': - if test_size >= n: - raise ValueError( - 'test_size=%d should be smaller ' - 'than the number of samples %d' % (test_size, n)) - else: - raise ValueError("Invalid value for test_size: %r" % test_size) - - if train_size is not None: - if np.asarray(train_size).dtype.kind == 'f': - if train_size >= 1.: - raise ValueError("train_size=%f should be smaller " - "than 1.0 or be an integer" % train_size) - elif np.asarray(test_size).dtype.kind == 'f' and \ - train_size + test_size > 1.: - raise ValueError('The sum of test_size and train_size = %f, ' - 'should be smaller than 1.0. Reduce ' - 'test_size and/or train_size.' % - (train_size + test_size)) - elif np.asarray(train_size).dtype.kind == 'i': - if train_size >= n: - raise ValueError("train_size=%d should be smaller " - "than the number of samples %d" % - (train_size, n)) - else: - raise ValueError("Invalid value for train_size: %r" % train_size) - - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) - - if train_size is None: - n_train = n - n_test - else: - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n) - else: - n_train = float(train_size) - - if test_size is None: - n_test = n - n_train - - if n_train + n_test > n: - raise ValueError('The sum of train_size and test_size = %d, ' - 'should be smaller than the number of ' - 'samples %d. Reduce test_size and/or ' - 'train_size.' % (n_train + n_test, n)) - - return int(n_train), int(n_test) - - -def _approximate_mode(class_counts, n_draws, rng): - """Computes approximate mode of multivariate hypergeometric. - - This is an approximation to the mode of the multivariate - hypergeometric given by class_counts and n_draws. - It shouldn't be off by more than one. - - It is the mostly likely outcome of drawing n_draws many - samples from the population given by class_counts. - - Parameters - ---------- - class_counts : ndarray of int - Population per class. - n_draws : int - Number of draws (samples to draw) from the overall population. - rng : random state - Used to break ties. 
- - Returns - ------- - sampled_classes : ndarray of int - Number of samples drawn from each class. - np.sum(sampled_classes) == n_draws - """ - # this computes a bad approximation to the mode of the - # multivariate hypergeometric given by class_counts and n_draws - continuous = n_draws * class_counts / class_counts.sum() - # floored means we don't overshoot n_samples, but probably undershoot - floored = np.floor(continuous) - # we add samples according to how much "left over" probability - # they had, until we arrive at n_samples - need_to_add = int(n_draws - floored.sum()) - if need_to_add > 0: - remainder = continuous - floored - values = np.sort(np.unique(remainder))[::-1] - # add according to remainder, but break ties - # randomly to avoid biases - for value in values: - inds, = np.where(remainder == value) - # if we need_to_add less than what's in inds - # we draw randomly from them. - # if we need to add more, we add them all and - # go to the next value - add_now = min(len(inds), need_to_add) - inds = rng.choice(inds, size=add_now, replace=False) - floored[inds] += 1 - need_to_add -= add_now - if need_to_add == 0: - break - return floored.astype(np.int) - - -class StratifiedShuffleSplit(BaseShuffleSplit): - """Stratified ShuffleSplit cross validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.StratifiedShuffleSplit` instead. - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a merge of StratifiedKFold and - ShuffleSplit, which returns stratified randomized folds. The folds - are made by preserving the percentage of samples for each class. - - Note: like the ShuffleSplit strategy, stratified random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y : array, [n_samples] - Labels of samples. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Examples - -------- - >>> from sklearn.cross_validation import StratifiedShuffleSplit - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0) - >>> len(sss) - 3 - >>> print(sss) # doctest: +ELLIPSIS - StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...) - >>> for train_index, test_index in sss: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 2] TEST: [3 0] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 2] TEST: [3 1] - """ - - def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, - random_state=None): - - super(StratifiedShuffleSplit, self).__init__( - len(y), n_iter, test_size, train_size, random_state) - - self.y = np.array(y) - self.classes, self.y_indices = np.unique(y, return_inverse=True) - n_cls = self.classes.shape[0] - - if np.min(np.bincount(self.y_indices)) < 2: - raise ValueError("The least populated class in y has only 1" - " member, which is too few. The minimum" - " number of labels for any class cannot" - " be less than 2.") - - if self.n_train < n_cls: - raise ValueError('The train_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_train, n_cls)) - if self.n_test < n_cls: - raise ValueError('The test_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_test, n_cls)) - - def _iter_indices(self): - rng = check_random_state(self.random_state) - cls_count = np.bincount(self.y_indices) - - for n in range(self.n_iter): - # if there are ties in the class-counts, we want - # to make sure to break them anew in each iteration - n_i = _approximate_mode(cls_count, self.n_train, rng) - class_counts_remaining = cls_count - n_i - t_i = _approximate_mode(class_counts_remaining, self.n_test, rng) - - train = [] - test = [] - - for i, _ in enumerate(self.classes): - permutation = rng.permutation(cls_count[i]) - perm_indices_class_i = np.where( - (i == self.y_indices))[0][permutation] - - train.extend(perm_indices_class_i[:n_i[i]]) - test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]) - train = rng.permutation(train) - test = rng.permutation(test) - - yield train, test - - def __repr__(self): - return ('%s(labels=%s, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.y, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - -class PredefinedSplit(_PartitionIterator): - """Predefined split cross validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.PredefinedSplit` instead. - - Splits the data into training/test set folds according to a predefined - scheme. Each sample can be assigned to at most one test set fold, as - specified by the user through the ``test_fold`` parameter. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - test_fold : "array-like, shape (n_samples,) - test_fold[i] gives the test set fold of sample i. A value of -1 - indicates that the corresponding sample is not part of any test set - folds, but will instead always be put into the training fold. - - Examples - -------- - >>> from sklearn.cross_validation import PredefinedSplit - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> ps = PredefinedSplit(test_fold=[0, 1, -1, 1]) - >>> len(ps) - 2 - >>> print(ps) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - sklearn.cross_validation.PredefinedSplit(test_fold=[ 0 1 -1 1]) - >>> for train_index, test_index in ps: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 2 3] TEST: [0] - TRAIN: [0 2] TEST: [1 3] - """ - - def __init__(self, test_fold): - super(PredefinedSplit, self).__init__(len(test_fold)) - self.test_fold = np.array(test_fold, dtype=np.int) - self.test_fold = column_or_1d(self.test_fold) - self.unique_folds = np.unique(self.test_fold) - self.unique_folds = self.unique_folds[self.unique_folds != -1] - - def _iter_test_indices(self): - for f in self.unique_folds: - yield np.where(self.test_fold == f)[0] - - def __repr__(self): - return '%s.%s(test_fold=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.test_fold) - - def __len__(self): - return len(self.unique_folds) - - -class LabelShuffleSplit(ShuffleSplit): - """Shuffle-Labels-Out cross-validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.GroupShuffleSplit` instead. - - Provides randomized train/test indices to split data according to a - third-party provided label. This label information can be used to encode - arbitrary domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - The difference between LeavePLabelOut and LabelShuffleSplit is that - the former generates splits using all subsets of size ``p`` unique labels, - whereas LabelShuffleSplit generates a user-determined number of random - test splits, each with a user-determined fraction of unique labels. - - For example, a less computationally intensive alternative to - ``LeavePLabelOut(labels, p=10)`` would be - ``LabelShuffleSplit(labels, test_size=10, n_iter=100)``. - - Note: The parameters ``test_size`` and ``train_size`` refer to labels, and - not to samples, as in ShuffleSplit. - - .. versionadded:: 0.17 - - Parameters - ---------- - labels : array, [n_samples] - Labels of samples - - n_iter : int (default 5) - Number of re-shuffling and splitting iterations. - - test_size : float (default 0.2), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the labels to include in the test split. If - int, represents the absolute number of test labels. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the labels to include in the train split. If - int, represents the absolute number of train labels. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
- - """ - def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None, - random_state=None): - - classes, label_indices = np.unique(labels, return_inverse=True) - - super(LabelShuffleSplit, self).__init__( - len(classes), - n_iter=n_iter, - test_size=test_size, - train_size=train_size, - random_state=random_state) - - self.labels = labels - self.classes = classes - self.label_indices = label_indices - - def __repr__(self): - return ('%s(labels=%s, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.labels, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - def _iter_indices(self): - for label_train, label_test in super(LabelShuffleSplit, - self)._iter_indices(): - # these are the indices of classes in the partition - # invert them into data indices - - train = np.flatnonzero(np.in1d(self.label_indices, label_train)) - test = np.flatnonzero(np.in1d(self.label_indices, label_test)) - - yield train, test - - -############################################################################## -def _index_param_value(X, v, indices): - """Private helper function for parameter value indexing.""" - if not _is_arraylike(v) or _num_samples(v) != _num_samples(X): - # pass through: skip indexing - return v - if sp.issparse(v): - v = v.tocsr() - return safe_indexing(v, indices) - - -def cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1, - verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): - """Generate cross-validated estimates for each input data point - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.cross_val_predict` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' and 'predict' - The object to use to fit the data. - - X : array-like - The data to fit. Can be, for example a list, or an array at least 2d. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - verbose : integer, optional - The verbosity level. - - fit_params : dict, optional - Parameters to pass to the fit method of the estimator. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - Returns - ------- - preds : ndarray - This is the result of calling 'predict' - - Examples - -------- - >>> from sklearn import datasets, linear_model - >>> from sklearn.cross_validation import cross_val_predict - >>> diabetes = datasets.load_diabetes() - >>> X = diabetes.data[:150] - >>> y = diabetes.target[:150] - >>> lasso = linear_model.Lasso() - >>> y_pred = cross_val_predict(lasso, X, y) - """ - X, y = indexable(X, y) - - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - parallel = Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch) - preds_blocks = parallel(delayed(_fit_and_predict)(clone(estimator), X, y, - train, test, verbose, - fit_params) - for train, test in cv) - - preds = [p for p, _ in preds_blocks] - locs = np.concatenate([loc for _, loc in preds_blocks]) - if not _check_is_partition(locs, _num_samples(X)): - raise ValueError('cross_val_predict only works for partitions') - inv_locs = np.empty(len(locs), dtype=int) - inv_locs[locs] = np.arange(len(locs)) - - # Check for sparse predictions - if sp.issparse(preds[0]): - preds = sp.vstack(preds, format=preds[0].format) - else: - preds = np.concatenate(preds) - return preds[inv_locs] - - -def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params): - """Fit estimator and predict values for a given dataset split. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' and 'predict' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - train : array-like, shape (n_train_samples,) - Indices of training samples. - - test : array-like, shape (n_test_samples,) - Indices of test samples. - - verbose : integer - The verbosity level. - - fit_params : dict or None - Parameters that will be passed to ``estimator.fit``. 
- - Returns - ------- - preds : sequence - Result of calling 'estimator.predict' - - test : array-like - This is the value of the test parameter - """ - # Adjust length of sample weights - fit_params = fit_params if fit_params is not None else {} - fit_params = dict([(k, _index_param_value(X, v, train)) - for k, v in fit_params.items()]) - - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, _ = _safe_split(estimator, X, y, test, train) - - if y_train is None: - estimator.fit(X_train, **fit_params) - else: - estimator.fit(X_train, y_train, **fit_params) - preds = estimator.predict(X_test) - return preds, test - - -def _check_is_partition(locs, n): - """Check whether locs is a reordering of the array np.arange(n) - - Parameters - ---------- - locs : ndarray - integer array to test - n : int - number of expected elements - - Returns - ------- - is_partition : bool - True iff sorted(locs) is range(n) - """ - if len(locs) != n: - return False - hit = np.zeros(n, bool) - hit[locs] = True - if not np.all(hit): - return False - return True - - -def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, - verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): - """Evaluate a score by cross-validation - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.cross_val_score` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like - The data to fit. Can be, for example a list, or an array at least 2d. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - verbose : integer, optional - The verbosity level. - - fit_params : dict, optional - Parameters to pass to the fit method of the estimator. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - Returns - ------- - scores : array of float, shape=(len(list(cv)),) - Array of scores of the estimator for each run of the cross validation. - - Examples - -------- - >>> from sklearn import datasets, linear_model - >>> from sklearn.cross_validation import cross_val_score - >>> diabetes = datasets.load_diabetes() - >>> X = diabetes.data[:150] - >>> y = diabetes.target[:150] - >>> lasso = linear_model.Lasso() - >>> print(cross_val_score(lasso, X, y)) # doctest: +ELLIPSIS - [ 0.33150734 0.08022311 0.03531764] - - See Also - --------- - :func:`sklearn.metrics.make_scorer`: - Make a scorer from a performance metric or loss function. - - """ - X, y = indexable(X, y) - - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - parallel = Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch) - scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, - train, test, verbose, None, - fit_params) - for train, test in cv) - return np.array(scores)[:, 0] - - -def _fit_and_score(estimator, X, y, scorer, train, test, verbose, - parameters, fit_params, return_train_score=False, - return_parameters=False, error_score='raise'): - """Fit estimator and compute scores for a given dataset split. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scorer : callable - A scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - train : array-like, shape (n_train_samples,) - Indices of training samples. - - test : array-like, shape (n_test_samples,) - Indices of test samples. - - verbose : integer - The verbosity level. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - parameters : dict or None - Parameters to be set on the estimator. - - fit_params : dict or None - Parameters that will be passed to ``estimator.fit``. - - return_train_score : boolean, optional, default: False - Compute and return score on training set. - - return_parameters : boolean, optional, default: False - Return parameters that has been used for the estimator. - - Returns - ------- - train_score : float, optional - Score on training set, returned only if `return_train_score` is `True`. - - test_score : float - Score on test set. - - n_test_samples : int - Number of test samples. - - scoring_time : float - Time spent for fitting and scoring in seconds. - - parameters : dict or None, optional - The parameters that have been evaluated. 
- """ - if verbose > 1: - if parameters is None: - msg = '' - else: - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) - - # Adjust length of sample weights - fit_params = fit_params if fit_params is not None else {} - fit_params = dict([(k, _index_param_value(X, v, train)) - for k, v in fit_params.items()]) - - if parameters is not None: - estimator.set_params(**parameters) - - start_time = time.time() - - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) - - try: - if y_train is None: - estimator.fit(X_train, **fit_params) - else: - estimator.fit(X_train, y_train, **fit_params) - - except Exception as e: - if error_score == 'raise': - raise - elif isinstance(error_score, numbers.Number): - test_score = error_score - if return_train_score: - train_score = error_score - warnings.warn("Classifier fit failed. The score on this train-test" - " partition for these parameters will be set to %f. " - "Details: \n%r" % (error_score, e), FitFailedWarning) - else: - raise ValueError("error_score must be the string 'raise' or a" - " numeric value. (Hint: if using 'raise', please" - " make sure that it has been spelled correctly.)" - ) - - else: - test_score = _score(estimator, X_test, y_test, scorer) - if return_train_score: - train_score = _score(estimator, X_train, y_train, scorer) - - scoring_time = time.time() - start_time - - if verbose > 2: - msg += ", score=%f" % test_score - if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) - print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - - ret = [train_score] if return_train_score else [] - ret.extend([test_score, _num_samples(X_test), scoring_time]) - if return_parameters: - ret.append(parameters) - return ret - - -def _safe_split(estimator, X, y, indices, train_indices=None): - """Create subset of dataset and properly handle kernels.""" - if hasattr(estimator, 'kernel') and callable(estimator.kernel) \ - and not isinstance(estimator.kernel, GPKernel): - # cannot compute the kernel values with custom function - raise ValueError("Cannot use a custom kernel function. " - "Precompute the kernel matrix instead.") - - if not hasattr(X, "shape"): - if getattr(estimator, "_pairwise", False): - raise ValueError("Precomputed kernels or affinity matrices have " - "to be passed as arrays or sparse matrices.") - X_subset = [X[idx] for idx in indices] - else: - if getattr(estimator, "_pairwise", False): - # X is a precomputed square kernel matrix - if X.shape[0] != X.shape[1]: - raise ValueError("X should be a square kernel matrix") - if train_indices is None: - X_subset = X[np.ix_(indices, indices)] - else: - X_subset = X[np.ix_(indices, train_indices)] - else: - X_subset = safe_indexing(X, indices) - - if y is not None: - y_subset = safe_indexing(y, indices) - else: - y_subset = None - - return X_subset, y_subset - - -def _score(estimator, X_test, y_test, scorer): - """Compute the score of an estimator on a given test set.""" - if y_test is None: - score = scorer(estimator, X_test) - else: - score = scorer(estimator, X_test, y_test) - if hasattr(score, 'item'): - try: - # e.g. unwrap memmapped scalars - score = score.item() - except ValueError: - # non-scalar? - pass - if not isinstance(score, numbers.Number): - raise ValueError("scoring must return a number, got %s (%s) instead." 
- % (str(score), type(score))) - return score - - -def _permutation_test_score(estimator, X, y, cv, scorer): - """Auxiliary function for permutation_test_score""" - avg_score = [] - for train, test in cv: - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) - estimator.fit(X_train, y_train) - avg_score.append(scorer(estimator, X_test, y_test)) - return np.mean(avg_score) - - -def _shuffle(y, labels, random_state): - """Return a shuffled copy of y eventually shuffle among same labels.""" - if labels is None: - ind = random_state.permutation(len(y)) - else: - ind = np.arange(len(labels)) - for label in np.unique(labels): - this_mask = (labels == label) - ind[this_mask] = random_state.permutation(ind[this_mask]) - return safe_indexing(y, ind) - - -def check_cv(cv, X=None, y=None, classifier=False): - """Input checker utility for building a CV in a user friendly way. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.check_cv` instead. - - Parameters - ---------- - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if classifier is True and ``y`` is binary or - multiclass, :class:`StratifiedKFold` is used. In all other cases, - :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - X : array-like - The data the cross-val object will be applied on. - - y : array-like - The target variable for a supervised learning problem. - - classifier : boolean optional - Whether the task is a classification task, in which case - stratified KFold will be used. - - Returns - ------- - checked_cv : a cross-validation generator instance. - The return value is guaranteed to be a cv generator instance, whatever - the input type. - """ - is_sparse = sp.issparse(X) - if cv is None: - cv = 3 - if isinstance(cv, numbers.Integral): - if classifier: - if type_of_target(y) in ['binary', 'multiclass']: - cv = StratifiedKFold(y, cv) - else: - cv = KFold(_num_samples(y), cv) - else: - if not is_sparse: - n_samples = len(X) - else: - n_samples = X.shape[0] - cv = KFold(n_samples, cv) - return cv - - -def permutation_test_score(estimator, X, y, cv=None, - n_permutations=100, n_jobs=1, labels=None, - random_state=0, verbose=0, scoring=None): - """Evaluate the significance of a cross-validated score with permutations - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.permutation_test_score` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. 
- Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - n_permutations : integer, optional - Number of times to permute ``y``. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - labels : array-like of shape [n_samples] (optional) - Labels constrain the permutation among groups of samples with - a same label. - - random_state : int, RandomState instance or None, optional (default=0) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - verbose : integer, optional - The verbosity level. - - Returns - ------- - score : float - The true score without permuting targets. - - permutation_scores : array, shape (n_permutations,) - The scores obtained for each permutations. - - pvalue : float - The p-value, which approximates the probability that the score would - be obtained by chance. This is calculated as: - - `(C + 1) / (n_permutations + 1)` - - Where C is the number of permutations whose score >= the true score. - - The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. - - Notes - ----- - This function implements Test 1 in: - - Ojala and Garriga. Permutation Tests for Studying Classifier - Performance. The Journal of Machine Learning Research (2010) - vol. 11 - - """ - X, y = indexable(X, y) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - random_state = check_random_state(random_state) - - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - score = _permutation_test_score(clone(estimator), X, y, cv, scorer) - permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_permutation_test_score)( - clone(estimator), X, _shuffle(y, labels, random_state), cv, - scorer) - for _ in range(n_permutations)) - permutation_scores = np.array(permutation_scores) - pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) - return score, permutation_scores, pvalue - - -permutation_test_score.__test__ = False # to avoid a pb with nosetests - - -def train_test_split(*arrays, **options): - """Split arrays or matrices into random train and test subsets - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.train_test_split` instead. - - Quick utility that wraps input validation and - ``next(iter(ShuffleSplit(n_samples)))`` and application to input - data into a single call for splitting (and optionally subsampling) - data in a oneliner. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - *arrays : sequence of indexables with same length / shape[0] - Allowed inputs are lists, numpy arrays, scipy-sparse - matrices or pandas dataframes. 
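
(Before the parameters continue: a quick numeric illustration of the p-value
rule implemented in ``permutation_test_score`` above,
``(C + 1) / (n_permutations + 1)``, using made-up scores:)

    import numpy as np

    permutation_scores = np.array([0.31, 0.35, 0.28, 0.40, 0.33])
    score = 0.93                                  # unpermuted score
    C = np.sum(permutation_scores >= score)       # 0 permutations reach it
    pvalue = (C + 1.0) / (len(permutation_scores) + 1.0)
    # pvalue == 1/6 ~ 0.167, the smallest value reachable with 5 permutations
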
- - test_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - If train size is also None, test size is set to 0.25. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - stratify : array-like or None (default is None) - If not None, data is split in a stratified fashion, using this as - the labels array. - - .. versionadded:: 0.17 - *stratify* splitting - - Returns - ------- - splitting : list, length = 2 * len(arrays), - List containing train-test split of inputs. - - .. versionadded:: 0.16 - If the input is sparse, the output will be a - ``scipy.sparse.csr_matrix``. Else, output type is the same as the - input type. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.cross_validation import train_test_split - >>> X, y = np.arange(10).reshape((5, 2)), range(5) - >>> X - array([[0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9]]) - >>> list(y) - [0, 1, 2, 3, 4] - - >>> X_train, X_test, y_train, y_test = train_test_split( - ... X, y, test_size=0.33, random_state=42) - ... 
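
(Migration note, illustrative: under scikit-learn >= 0.18 only the import
path changes; the call shown in this example is otherwise identical:)

    import numpy as np
    from sklearn.model_selection import train_test_split

    X, y = np.arange(10).reshape((5, 2)), list(range(5))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
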
- >>> X_train - array([[4, 5], - [0, 1], - [6, 7]]) - >>> y_train - [2, 0, 3] - >>> X_test - array([[2, 3], - [8, 9]]) - >>> y_test - [1, 4] - - """ - n_arrays = len(arrays) - if n_arrays == 0: - raise ValueError("At least one array required as input") - - test_size = options.pop('test_size', None) - train_size = options.pop('train_size', None) - random_state = options.pop('random_state', None) - stratify = options.pop('stratify', None) - - if options: - raise TypeError("Invalid parameters passed: %s" % str(options)) - - if test_size is None and train_size is None: - test_size = 0.25 - arrays = indexable(*arrays) - if stratify is not None: - cv = StratifiedShuffleSplit(stratify, test_size=test_size, - train_size=train_size, - random_state=random_state) - else: - n_samples = _num_samples(arrays[0]) - cv = ShuffleSplit(n_samples, test_size=test_size, - train_size=train_size, - random_state=random_state) - - train, test = next(iter(cv)) - return list(chain.from_iterable((safe_indexing(a, train), - safe_indexing(a, test)) for a in arrays)) - - -train_test_split.__test__ = False # to avoid a pb with nosetests diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py deleted file mode 100644 index 4d756bdaa0cf8..0000000000000 --- a/sklearn/tests/test_cross_validation.py +++ /dev/null @@ -1,1252 +0,0 @@ -"""Test the cross_validation module""" -from __future__ import division -import warnings - -import numpy as np -from scipy.sparse import coo_matrix -from scipy.sparse import csr_matrix -from scipy import stats - -from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.testing import assert_true -from sklearn.utils.testing import assert_false -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_greater -from sklearn.utils.testing import assert_greater_equal -from sklearn.utils.testing import assert_less -from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import assert_raise_message -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.mocking import CheckingClassifier, MockDataFrame - -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn import cross_validation as cval - -from sklearn.datasets import make_regression -from sklearn.datasets import load_boston -from sklearn.datasets import load_digits -from sklearn.datasets import load_iris -from sklearn.datasets import make_multilabel_classification -from sklearn.metrics import explained_variance_score -from sklearn.metrics import make_scorer -from sklearn.metrics import precision_score -from sklearn.externals import six -from sklearn.externals.six.moves import zip - -from sklearn.linear_model import Ridge -from sklearn.multiclass import OneVsRestClassifier -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC -from sklearn.cluster import KMeans - -from sklearn.preprocessing import Imputer -from sklearn.pipeline import Pipeline - - -class MockClassifier(object): - """Dummy classifier to test the cross-validation""" - - def __init__(self, a=0, allow_nd=False): - self.a = a - self.allow_nd = allow_nd - - def fit(self, X, Y=None, sample_weight=None, class_prior=None, - sparse_sample_weight=None, 
sparse_param=None, dummy_int=None, - dummy_str=None, dummy_obj=None, callback=None): - """The dummy arguments are to test that this fit function can - accept non-array arguments through cross-validation, such as: - - int - - str (this is actually array-like) - - object - - function - """ - self.dummy_int = dummy_int - self.dummy_str = dummy_str - self.dummy_obj = dummy_obj - if callback is not None: - callback(self) - - if self.allow_nd: - X = X.reshape(len(X), -1) - if X.ndim >= 3 and not self.allow_nd: - raise ValueError('X cannot be d') - if sample_weight is not None: - assert_true(sample_weight.shape[0] == X.shape[0], - 'MockClassifier extra fit_param sample_weight.shape[0]' - ' is {0}, should be {1}'.format(sample_weight.shape[0], - X.shape[0])) - if class_prior is not None: - assert_true(class_prior.shape[0] == len(np.unique(y)), - 'MockClassifier extra fit_param class_prior.shape[0]' - ' is {0}, should be {1}'.format(class_prior.shape[0], - len(np.unique(y)))) - if sparse_sample_weight is not None: - fmt = ('MockClassifier extra fit_param sparse_sample_weight' - '.shape[0] is {0}, should be {1}') - assert_true(sparse_sample_weight.shape[0] == X.shape[0], - fmt.format(sparse_sample_weight.shape[0], X.shape[0])) - if sparse_param is not None: - fmt = ('MockClassifier extra fit_param sparse_param.shape ' - 'is ({0}, {1}), should be ({2}, {3})') - assert_true(sparse_param.shape == P_sparse.shape, - fmt.format(sparse_param.shape[0], - sparse_param.shape[1], - P_sparse.shape[0], P_sparse.shape[1])) - return self - - def predict(self, T): - if self.allow_nd: - T = T.reshape(len(T), -1) - return T[:, 0] - - def score(self, X=None, Y=None): - return 1. / (1 + np.abs(self.a)) - - def get_params(self, deep=False): - return {'a': self.a, 'allow_nd': self.allow_nd} - -X = np.ones((10, 2)) -X_sparse = coo_matrix(X) -W_sparse = coo_matrix((np.array([1]), (np.array([1]), np.array([0]))), - shape=(10, 1)) -P_sparse = coo_matrix(np.eye(5)) - -# avoid StratifiedKFold's Warning about least populated class in y -y = np.arange(10) % 3 - -############################################################################## -# Tests - - -def check_valid_split(train, test, n_samples=None): - # Use python sets to get more informative assertion failure messages - train, test = set(train), set(test) - - # Train and test split should not overlap - assert_equal(train.intersection(test), set()) - - if n_samples is not None: - # Check that the union of train an test split cover all the indices - assert_equal(train.union(test), set(range(n_samples))) - - -def check_cv_coverage(cv, expected_n_iter=None, n_samples=None): - # Check that a all the samples appear at least once in a test fold - if expected_n_iter is not None: - assert_equal(len(cv), expected_n_iter) - else: - expected_n_iter = len(cv) - - collected_test_samples = set() - iterations = 0 - for train, test in cv: - check_valid_split(train, test, n_samples=n_samples) - iterations += 1 - collected_test_samples.update(test) - - # Check that the accumulated test samples cover the whole dataset - assert_equal(iterations, expected_n_iter) - if n_samples is not None: - assert_equal(collected_test_samples, set(range(n_samples))) - - -def test_kfold_valueerrors(): - # Check that errors are raised if there is not enough samples - assert_raises(ValueError, cval.KFold, 3, 4) - - # Check that a warning is raised if the least populated class has too few - # members. 
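
    # Aside (new API, not part of the original test): model_selection's
    # StratifiedKFold keeps this behaviour -- it warns when the smallest
    # class has fewer members than n_splits, and raises only when every
    # class is too small. Sketch, assuming scikit-learn >= 0.18:
    import numpy as np
    from sklearn.model_selection import StratifiedKFold
    y_small = np.array([3, 3, -1, -1, 3])        # class -1 has only 2 members
    new_skf = StratifiedKFold(n_splits=3)
    list(new_skf.split(np.zeros((5, 2)), y_small))  # UserWarning, 3 splits
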
- y = [3, 3, -1, -1, 3] - - cv = assert_warns_message(Warning, "The least populated class", - cval.StratifiedKFold, y, 3) - - # Check that despite the warning the folds are still computed even - # though all the classes are not necessarily represented at on each - # side of the split at each split - check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y)) - - # Check that errors are raised if all n_labels for individual - # classes are less than n_folds. - y = [3, 3, -1, -1, 2] - - assert_raises(ValueError, cval.StratifiedKFold, y, 3) - - # Error when number of folds is <= 1 - assert_raises(ValueError, cval.KFold, 2, 0) - assert_raises(ValueError, cval.KFold, 2, 1) - error_string = ("k-fold cross validation requires at least one" - " train / test split") - assert_raise_message(ValueError, error_string, - cval.StratifiedKFold, y, 0) - assert_raise_message(ValueError, error_string, - cval.StratifiedKFold, y, 1) - - # When n is not integer: - assert_raises(ValueError, cval.KFold, 2.5, 2) - - # When n_folds is not integer: - assert_raises(ValueError, cval.KFold, 5, 1.5) - assert_raises(ValueError, cval.StratifiedKFold, y, 1.5) - - -def test_kfold_indices(): - # Check all indices are returned in the test folds - kf = cval.KFold(300, 3) - check_cv_coverage(kf, expected_n_iter=3, n_samples=300) - - # Check all indices are returned in the test folds even when equal-sized - # folds are not possible - kf = cval.KFold(17, 3) - check_cv_coverage(kf, expected_n_iter=3, n_samples=17) - - -def test_kfold_no_shuffle(): - # Manually check that KFold preserves the data ordering on toy datasets - splits = iter(cval.KFold(4, 2)) - train, test = next(splits) - assert_array_equal(test, [0, 1]) - assert_array_equal(train, [2, 3]) - - train, test = next(splits) - assert_array_equal(test, [2, 3]) - assert_array_equal(train, [0, 1]) - - splits = iter(cval.KFold(5, 2)) - train, test = next(splits) - assert_array_equal(test, [0, 1, 2]) - assert_array_equal(train, [3, 4]) - - train, test = next(splits) - assert_array_equal(test, [3, 4]) - assert_array_equal(train, [0, 1, 2]) - - -def test_stratified_kfold_no_shuffle(): - # Manually check that StratifiedKFold preserves the data ordering as much - # as possible on toy datasets in order to avoid hiding sample dependencies - # when possible - splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2)) - train, test = next(splits) - assert_array_equal(test, [0, 2]) - assert_array_equal(train, [1, 3]) - - train, test = next(splits) - assert_array_equal(test, [1, 3]) - assert_array_equal(train, [0, 2]) - - splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2)) - train, test = next(splits) - assert_array_equal(test, [0, 1, 3, 4]) - assert_array_equal(train, [2, 5, 6]) - - train, test = next(splits) - assert_array_equal(test, [2, 5, 6]) - assert_array_equal(train, [0, 1, 3, 4]) - - -def test_stratified_kfold_ratios(): - # Check that stratified kfold preserves label ratios in individual splits - # Repeat with shuffling turned off and on - n_samples = 1000 - labels = np.array([4] * int(0.10 * n_samples) + - [0] * int(0.89 * n_samples) + - [1] * int(0.01 * n_samples)) - for shuffle in [False, True]: - for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle): - assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10, - 2) - assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89, - 2) - assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01, - 2) - assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2) - 
assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2) - assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2) - - -def test_kfold_balance(): - # Check that KFold returns folds with balanced sizes - for kf in [cval.KFold(i, 5) for i in range(11, 17)]: - sizes = [] - for _, test in kf: - sizes.append(len(test)) - - assert_true((np.max(sizes) - np.min(sizes)) <= 1) - assert_equal(np.sum(sizes), kf.n) - - -def test_stratifiedkfold_balance(): - # Check that KFold returns folds with balanced sizes (only when - # stratification is possible) - # Repeat with shuffling turned off and on - labels = [0] * 3 + [1] * 14 - for shuffle in [False, True]: - for skf in [cval.StratifiedKFold(labels[:i], 3, shuffle=shuffle) - for i in range(11, 17)]: - sizes = [] - for _, test in skf: - sizes.append(len(test)) - - assert_true((np.max(sizes) - np.min(sizes)) <= 1) - assert_equal(np.sum(sizes), skf.n) - - -def test_shuffle_kfold(): - # Check the indices are shuffled properly, and that all indices are - # returned in the different test folds - kf = cval.KFold(300, 3, shuffle=True, random_state=0) - ind = np.arange(300) - - all_folds = None - for train, test in kf: - assert_true(np.any(np.arange(100) != ind[test])) - assert_true(np.any(np.arange(100, 200) != ind[test])) - assert_true(np.any(np.arange(200, 300) != ind[test])) - - if all_folds is None: - all_folds = ind[test].copy() - else: - all_folds = np.concatenate((all_folds, ind[test])) - - all_folds.sort() - assert_array_equal(all_folds, ind) - - -def test_shuffle_stratifiedkfold(): - # Check that shuffling is happening when requested, and for proper - # sample coverage - labels = [0] * 20 + [1] * 20 - kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0)) - kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1)) - for (_, test0), (_, test1) in zip(kf0, kf1): - assert_true(set(test0) != set(test1)) - check_cv_coverage(kf0, expected_n_iter=5, n_samples=40) - - -def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 - # The digits samples are dependent: they are apparently grouped by authors - # although we don't have any information on the groups segment locations - # for this data. We can highlight this fact be computing k-fold cross- - # validation with and without shuffling: we observe that the shuffling case - # wrongly makes the IID assumption and is therefore too optimistic: it - # estimates a much higher accuracy (around 0.96) than the non - # shuffling variant (around 0.86). 
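
    # Aside: the same experiment expressed with the model_selection API
    # (a sketch; the magnitudes in the comments are the rough bounds the
    # original test asserts, not exact values).
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.svm import SVC
    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    plain = cross_val_score(model, X, y, cv=KFold(5)).mean()  # ~0.85-0.88
    shuffled = cross_val_score(
        model, X, y, cv=KFold(5, shuffle=True, random_state=0)).mean()  # >0.95
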
- - digits = load_digits() - X, y = digits.data[:800], digits.target[:800] - model = SVC(C=10, gamma=0.005) - n = len(y) - - cv = cval.KFold(n, 5, shuffle=False) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(0.88, mean_score) - assert_greater(mean_score, 0.85) - - # Shuffling the data artificially breaks the dependency and hides the - # overfitting of the model with regards to the writing style of the authors - # by yielding a seriously overestimated score: - - cv = cval.KFold(n, 5, shuffle=True, random_state=0) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(mean_score, 0.95) - - cv = cval.KFold(n, 5, shuffle=True, random_state=1) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(mean_score, 0.95) - - # Similarly, StratifiedKFold should try to shuffle the data as little - # as possible (while respecting the balanced class constraints) - # and thus be able to detect the dependency by not overestimating - # the CV score either. As the digits dataset is approximately balanced - # the estimated mean score is close to the score measured with - # non-shuffled KFold - - cv = cval.StratifiedKFold(y, 5) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(0.88, mean_score) - assert_greater(mean_score, 0.85) - - -def test_label_kfold(): - rng = np.random.RandomState(0) - - # Parameters of the test - n_labels = 15 - n_samples = 1000 - n_folds = 5 - - # Construct the test data - tolerance = 0.05 * n_samples # 5 percent error allowed - labels = rng.randint(0, n_labels, n_samples) - folds = cval.LabelKFold(labels, n_folds=n_folds).idxs - ideal_n_labels_per_fold = n_samples // n_folds - - # Check that folds have approximately the same size - assert_equal(len(folds), len(labels)) - for i in np.unique(folds): - assert_greater_equal(tolerance, - abs(sum(folds == i) - ideal_n_labels_per_fold)) - - # Check that each label appears only in 1 fold - for label in np.unique(labels): - assert_equal(len(np.unique(folds[labels == label])), 1) - - # Check that no label is on both sides of the split - labels = np.asarray(labels, dtype=object) - for train, test in cval.LabelKFold(labels, n_folds=n_folds): - assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) - - # Construct the test data - labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', - 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', - 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', - 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', - 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', - 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', - 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] - labels = np.asarray(labels, dtype=object) - - n_labels = len(np.unique(labels)) - n_samples = len(labels) - n_folds = 5 - tolerance = 0.05 * n_samples # 5 percent error allowed - folds = cval.LabelKFold(labels, n_folds=n_folds).idxs - ideal_n_labels_per_fold = n_samples // n_folds - - # Check that folds have approximately the same size - assert_equal(len(folds), len(labels)) - for i in np.unique(folds): - assert_greater_equal(tolerance, - abs(sum(folds == i) - ideal_n_labels_per_fold)) - - # Check that each label appears only in 1 fold - for label in np.unique(labels): - assert_equal(len(np.unique(folds[labels == label])), 1) - - # Check that no label is on both sides of the split - for train, test in cval.LabelKFold(labels, n_folds=n_folds): - assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) - - # Should 
fail if there are more folds than labels - labels = np.array([1, 1, 1, 2, 2]) - assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3) - - -def test_shuffle_split(): - ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0) - ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0) - ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0) - for typ in six.integer_types: - ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0) - for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): - assert_array_equal(t1[0], t2[0]) - assert_array_equal(t2[0], t3[0]) - assert_array_equal(t3[0], t4[0]) - assert_array_equal(t1[1], t2[1]) - assert_array_equal(t2[1], t3[1]) - assert_array_equal(t3[1], t4[1]) - - -def test_stratified_shuffle_split_init(): - y = np.asarray([0, 1, 1, 1, 2, 2, 2]) - # Check that error is raised if there is a class with only one sample - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2) - - # Check that error is raised if the test set size is smaller than n_classes - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2) - # Check that error is raised if the train set size is smaller than - # n_classes - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2) - - y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) - # Check that errors are raised if there is not enough samples - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6) - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6) - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8) - - # Train size or test size too small - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2) - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2) - - -def test_stratified_shuffle_split_iter(): - ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), - np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), - np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), - np.array([-1] * 800 + [1] * 50) - ] - - for y in ys: - sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33, - random_state=0) - test_size = np.ceil(0.33 * len(y)) - train_size = len(y) - test_size - for train, test in sss: - assert_array_equal(np.unique(y[train]), np.unique(y[test])) - # Checks if folds keep classes proportions - p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / - float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], - return_inverse=True)[1]) / - float(len(y[test]))) - assert_array_almost_equal(p_train, p_test, 1) - assert_equal(len(train) + len(test), y.size) - assert_equal(len(train), train_size) - assert_equal(len(test), test_size) - assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) - - -def test_stratified_shuffle_split_even(): - # Test the StratifiedShuffleSplit, indices are drawn with a - # equal chance - n_folds = 5 - n_iter = 1000 - - def assert_counts_are_ok(idx_counts, p): - # Here we test that the distribution of the counts - # per index is close enough to a binomial - threshold = 0.05 / n_splits - bf = stats.binom(n_splits, p) - for count in idx_counts: - p = bf.pmf(count) - assert_true(p > threshold, - "An index is not drawn with chance corresponding " - "to even draws") - - for n_samples in (6, 22): - labels = np.array((n_samples // 2) * [0, 1]) - splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter, - test_size=1. 
/ n_folds, - random_state=0) - - train_counts = [0] * n_samples - test_counts = [0] * n_samples - n_splits = 0 - for train, test in splits: - n_splits += 1 - for counter, ids in [(train_counts, train), (test_counts, test)]: - for id in ids: - counter[id] += 1 - assert_equal(n_splits, n_iter) - - assert_equal(len(train), splits.n_train) - assert_equal(len(test), splits.n_test) - assert_equal(len(set(train).intersection(test)), 0) - - label_counts = np.unique(labels) - assert_equal(splits.test_size, 1.0 / n_folds) - assert_equal(splits.n_train + splits.n_test, len(labels)) - assert_equal(len(label_counts), 2) - ex_test_p = float(splits.n_test) / n_samples - ex_train_p = float(splits.n_train) / n_samples - - assert_counts_are_ok(train_counts, ex_train_p) - assert_counts_are_ok(test_counts, ex_test_p) - - -def test_stratified_shuffle_split_overlap_train_test_bug(): - # See https://github.com/scikit-learn/scikit-learn/issues/6121 for - # the original bug report - labels = [0, 1, 2, 3] * 3 + [4, 5] * 5 - - splits = cval.StratifiedShuffleSplit(labels, n_iter=1, - test_size=0.5, random_state=0) - train, test = next(iter(splits)) - - assert_array_equal(np.intersect1d(train, test), []) - - -def test_predefinedsplit_with_kfold_split(): - # Check that PredefinedSplit can reproduce a split generated by Kfold. - folds = -1 * np.ones(10) - kf_train = [] - kf_test = [] - for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)): - kf_train.append(train_ind) - kf_test.append(test_ind) - folds[test_ind] = i - ps_train = [] - ps_test = [] - ps = cval.PredefinedSplit(folds) - for train_ind, test_ind in ps: - ps_train.append(train_ind) - ps_test.append(test_ind) - assert_array_equal(ps_train, kf_train) - assert_array_equal(ps_test, kf_test) - - -def test_label_shuffle_split(): - ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), - np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), - np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), - ] - - for y in ys: - n_iter = 6 - test_size = 1. 
/ 3 - slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size, - random_state=0) - - # Make sure the repr works - repr(slo) - - # Test that the length is correct - assert_equal(len(slo), n_iter) - - y_unique = np.unique(y) - - for train, test in slo: - # First test: no train label is in the test set and vice versa - y_train_unique = np.unique(y[train]) - y_test_unique = np.unique(y[test]) - assert_false(np.any(np.in1d(y[train], y_test_unique))) - assert_false(np.any(np.in1d(y[test], y_train_unique))) - - # Second test: train and test add up to all the data - assert_equal(y[train].size + y[test].size, y.size) - - # Third test: train and test are disjoint - assert_array_equal(np.intersect1d(train, test), []) - - # Fourth test: # unique train and test labels are correct, - # +- 1 for rounding error - assert_true(abs(len(y_test_unique) - - round(test_size * len(y_unique))) <= 1) - assert_true(abs(len(y_train_unique) - - round((1.0 - test_size) * len(y_unique))) <= 1) - - -def test_leave_label_out_changing_labels(): - # Check that LeaveOneLabelOut and LeavePLabelOut work normally if - # the labels variable is changed before calling __iter__ - labels = np.array([0, 1, 2, 1, 1, 2, 0, 0]) - labels_changing = np.array(labels, copy=True) - lolo = cval.LeaveOneLabelOut(labels) - lolo_changing = cval.LeaveOneLabelOut(labels_changing) - lplo = cval.LeavePLabelOut(labels, p=2) - lplo_changing = cval.LeavePLabelOut(labels_changing, p=2) - labels_changing[:] = 0 - for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: - for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): - assert_array_equal(train, train_chan) - assert_array_equal(test, test_chan) - - -def test_cross_val_score(): - clf = MockClassifier() - for a in range(-10, 10): - clf.a = a - # Smoke test - scores = cval.cross_val_score(clf, X, y) - assert_array_equal(scores, clf.score(X, y)) - - # test with multioutput y - scores = cval.cross_val_score(clf, X_sparse, X) - assert_array_equal(scores, clf.score(X_sparse, X)) - - scores = cval.cross_val_score(clf, X_sparse, y) - assert_array_equal(scores, clf.score(X_sparse, y)) - - # test with multioutput y - scores = cval.cross_val_score(clf, X_sparse, X) - assert_array_equal(scores, clf.score(X_sparse, X)) - - # test with X and y as list - list_check = lambda x: isinstance(x, list) - clf = CheckingClassifier(check_X=list_check) - scores = cval.cross_val_score(clf, X.tolist(), y.tolist()) - - clf = CheckingClassifier(check_y=list_check) - scores = cval.cross_val_score(clf, X, y.tolist()) - - assert_raises(ValueError, cval.cross_val_score, clf, X, y, - scoring="sklearn") - - # test with 3d X and - X_3d = X[:, :, np.newaxis] - clf = MockClassifier(allow_nd=True) - scores = cval.cross_val_score(clf, X_3d, y) - - clf = MockClassifier(allow_nd=False) - assert_raises(ValueError, cval.cross_val_score, clf, X_3d, y) - - -def test_cross_val_score_pandas(): - # check cross_val_score doesn't destroy pandas dataframe - types = [(MockDataFrame, MockDataFrame)] - try: - from pandas import Series, DataFrame - types.append((Series, DataFrame)) - except ImportError: - pass - for TargetType, InputFeatureType in types: - # X dataframe, y series - X_df, y_ser = InputFeatureType(X), TargetType(y) - check_df = lambda x: isinstance(x, InputFeatureType) - check_series = lambda x: isinstance(x, TargetType) - clf = CheckingClassifier(check_X=check_df, check_y=check_series) - cval.cross_val_score(clf, X_df, y_ser) - - -def test_cross_val_score_mask(): - # test that cross_val_score works with 
boolean masks - svm = SVC(kernel="linear") - iris = load_iris() - X, y = iris.data, iris.target - cv_indices = cval.KFold(len(y), 5) - scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices) - cv_indices = cval.KFold(len(y), 5) - cv_masks = [] - for train, test in cv_indices: - mask_train = np.zeros(len(y), dtype=np.bool) - mask_test = np.zeros(len(y), dtype=np.bool) - mask_train[train] = 1 - mask_test[test] = 1 - cv_masks.append((train, test)) - scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks) - assert_array_equal(scores_indices, scores_masks) - - -def test_cross_val_score_precomputed(): - # test for svm with precomputed kernel - svm = SVC(kernel="precomputed") - iris = load_iris() - X, y = iris.data, iris.target - linear_kernel = np.dot(X, X.T) - score_precomputed = cval.cross_val_score(svm, linear_kernel, y) - svm = SVC(kernel="linear") - score_linear = cval.cross_val_score(svm, X, y) - assert_array_equal(score_precomputed, score_linear) - - # Error raised for non-square X - svm = SVC(kernel="precomputed") - assert_raises(ValueError, cval.cross_val_score, svm, X, y) - - # test error is raised when the precomputed kernel is not array-like - # or sparse - assert_raises(ValueError, cval.cross_val_score, svm, - linear_kernel.tolist(), y) - - -def test_cross_val_score_fit_params(): - clf = MockClassifier() - n_samples = X.shape[0] - n_classes = len(np.unique(y)) - - DUMMY_INT = 42 - DUMMY_STR = '42' - DUMMY_OBJ = object() - - def assert_fit_params(clf): - # Function to test that the values are passed correctly to the - # classifier arguments for non-array type - - assert_equal(clf.dummy_int, DUMMY_INT) - assert_equal(clf.dummy_str, DUMMY_STR) - assert_equal(clf.dummy_obj, DUMMY_OBJ) - - fit_params = {'sample_weight': np.ones(n_samples), - 'class_prior': np.ones(n_classes) / n_classes, - 'sparse_sample_weight': W_sparse, - 'sparse_param': P_sparse, - 'dummy_int': DUMMY_INT, - 'dummy_str': DUMMY_STR, - 'dummy_obj': DUMMY_OBJ, - 'callback': assert_fit_params} - cval.cross_val_score(clf, X, y, fit_params=fit_params) - - -def test_cross_val_score_score_func(): - clf = MockClassifier() - _score_func_args = [] - - def score_func(y_test, y_predict): - _score_func_args.append((y_test, y_predict)) - return 1.0 - - with warnings.catch_warnings(record=True): - scoring = make_scorer(score_func) - score = cval.cross_val_score(clf, X, y, scoring=scoring) - assert_array_equal(score, [1.0, 1.0, 1.0]) - assert len(_score_func_args) == 3 - - -def test_cross_val_score_errors(): - class BrokenEstimator: - pass - - assert_raises(TypeError, cval.cross_val_score, BrokenEstimator(), X) - - -def test_train_test_split_errors(): - assert_raises(ValueError, cval.train_test_split) - assert_raises(ValueError, cval.train_test_split, range(3), train_size=1.1) - assert_raises(ValueError, cval.train_test_split, range(3), test_size=0.6, - train_size=0.6) - assert_raises(ValueError, cval.train_test_split, range(3), - test_size=np.float32(0.6), train_size=np.float32(0.6)) - assert_raises(ValueError, cval.train_test_split, range(3), - test_size="wrong_type") - assert_raises(ValueError, cval.train_test_split, range(3), test_size=2, - train_size=4) - assert_raises(TypeError, cval.train_test_split, range(3), - some_argument=1.1) - assert_raises(ValueError, cval.train_test_split, range(3), range(42)) - - -def test_train_test_split(): - X = np.arange(100).reshape((10, 10)) - X_s = coo_matrix(X) - y = np.arange(10) - - # simple test - split = cval.train_test_split(X, y, test_size=None, train_size=.5) - X_train, 
X_test, y_train, y_test = split - assert_equal(len(y_test), len(y_train)) - # test correspondence of X and y - assert_array_equal(X_train[:, 0], y_train * 10) - assert_array_equal(X_test[:, 0], y_test * 10) - - # conversion of lists to arrays (deprecated?) - with warnings.catch_warnings(record=True): - split = cval.train_test_split(X, X_s, y.tolist()) - X_train, X_test, X_s_train, X_s_test, y_train, y_test = split - assert_array_equal(X_train, X_s_train.toarray()) - assert_array_equal(X_test, X_s_test.toarray()) - - # don't convert lists to anything else by default - split = cval.train_test_split(X, X_s, y.tolist()) - X_train, X_test, X_s_train, X_s_test, y_train, y_test = split - assert_true(isinstance(y_train, list)) - assert_true(isinstance(y_test, list)) - - # allow nd-arrays - X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) - y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) - split = cval.train_test_split(X_4d, y_3d) - assert_equal(split[0].shape, (7, 5, 3, 2)) - assert_equal(split[1].shape, (3, 5, 3, 2)) - assert_equal(split[2].shape, (7, 7, 11)) - assert_equal(split[3].shape, (3, 7, 11)) - - # test stratification option - y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) - for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75], - [2, 4, 2, 4, 6]): - train, test = cval.train_test_split(y, - test_size=test_size, - stratify=y, - random_state=0) - assert_equal(len(test), exp_test_size) - assert_equal(len(test) + len(train), len(y)) - # check the 1:1 ratio of ones and twos in the data is preserved - assert_equal(np.sum(train == 1), np.sum(train == 2)) - - -def train_test_split_pandas(): - # check cross_val_score doesn't destroy pandas dataframe - types = [MockDataFrame] - try: - from pandas import DataFrame - types.append(DataFrame) - except ImportError: - pass - for InputFeatureType in types: - # X dataframe - X_df = InputFeatureType(X) - X_train, X_test = cval.train_test_split(X_df) - assert_true(isinstance(X_train, InputFeatureType)) - assert_true(isinstance(X_test, InputFeatureType)) - -def train_test_split_mock_pandas(): - # X mock dataframe - X_df = MockDataFrame(X) - X_train, X_test = cval.train_test_split(X_df) - assert_true(isinstance(X_train, MockDataFrame)) - assert_true(isinstance(X_test, MockDataFrame)) - - -def test_cross_val_score_with_score_func_classification(): - iris = load_iris() - clf = SVC(kernel='linear') - - # Default score (should be the accuracy score) - scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5) - assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2) - - # Correct classification score (aka. zero / one score) - should be the - # same as the default estimator score - zo_scores = cval.cross_val_score(clf, iris.data, iris.target, - scoring="accuracy", cv=5) - assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2) - - # F1 score (class are balanced so f1_score should be equal to zero/one - # score - f1_scores = cval.cross_val_score(clf, iris.data, iris.target, - scoring="f1_weighted", cv=5) - assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2) - - -def test_cross_val_score_with_score_func_regression(): - X, y = make_regression(n_samples=30, n_features=20, n_informative=5, - random_state=0) - reg = Ridge() - - # Default score of the Ridge regression estimator - scores = cval.cross_val_score(reg, X, y, cv=5) - assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) - - # R2 score (aka. 
determination coefficient) - should be the - # same as the default estimator score - r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5) - assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) - - # Mean squared error; this is a loss function, so "scores" are negative - neg_mse_scores = cval.cross_val_score(reg, X, y, cv=5, - scoring="neg_mean_squared_error") - expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]) - assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2) - - # Explained variance - scoring = make_scorer(explained_variance_score) - ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring) - assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) - - -def test_permutation_score(): - iris = load_iris() - X = iris.data - X_sparse = coo_matrix(X) - y = iris.target - svm = SVC(kernel='linear') - cv = cval.StratifiedKFold(y, 2) - - score, scores, pvalue = cval.permutation_test_score( - svm, X, y, n_permutations=30, cv=cv, scoring="accuracy") - assert_greater(score, 0.9) - assert_almost_equal(pvalue, 0.0, 1) - - score_label, _, pvalue_label = cval.permutation_test_score( - svm, X, y, n_permutations=30, cv=cv, scoring="accuracy", - labels=np.ones(y.size), random_state=0) - assert_true(score_label == score) - assert_true(pvalue_label == pvalue) - - # check that we obtain the same results with a sparse representation - svm_sparse = SVC(kernel='linear') - cv_sparse = cval.StratifiedKFold(y, 2) - score_label, _, pvalue_label = cval.permutation_test_score( - svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse, - scoring="accuracy", labels=np.ones(y.size), random_state=0) - - assert_true(score_label == score) - assert_true(pvalue_label == pvalue) - - # test with custom scoring object - def custom_score(y_true, y_pred): - return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) - / y_true.shape[0]) - - scorer = make_scorer(custom_score) - score, _, pvalue = cval.permutation_test_score( - svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0) - assert_almost_equal(score, .93, 2) - assert_almost_equal(pvalue, 0.01, 3) - - # set random y - y = np.mod(np.arange(len(y)), 3) - - score, scores, pvalue = cval.permutation_test_score( - svm, X, y, n_permutations=30, cv=cv, scoring="accuracy") - - assert_less(score, 0.5) - assert_greater(pvalue, 0.2) - - -def test_cross_val_generator_with_indices(): - X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - y = np.array([1, 1, 2, 2]) - labels = np.array([1, 2, 3, 4]) - # explicitly passing indices value is deprecated - loo = cval.LeaveOneOut(4) - lpo = cval.LeavePOut(4, 2) - kf = cval.KFold(4, 2) - skf = cval.StratifiedKFold(y, 2) - lolo = cval.LeaveOneLabelOut(labels) - lopo = cval.LeavePLabelOut(labels, 2) - ps = cval.PredefinedSplit([1, 1, 2, 2]) - ss = cval.ShuffleSplit(2) - for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]: - for train, test in cv: - assert_not_equal(np.asarray(train).dtype.kind, 'b') - assert_not_equal(np.asarray(train).dtype.kind, 'b') - X[train], X[test] - y[train], y[test] - - -@ignore_warnings -def test_cross_val_generator_with_default_indices(): - X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - y = np.array([1, 1, 2, 2]) - labels = np.array([1, 2, 3, 4]) - loo = cval.LeaveOneOut(4) - lpo = cval.LeavePOut(4, 2) - kf = cval.KFold(4, 2) - skf = cval.StratifiedKFold(y, 2) - lolo = cval.LeaveOneLabelOut(labels) - lopo = cval.LeavePLabelOut(labels, 2) - ss = cval.ShuffleSplit(2) - ps = cval.PredefinedSplit([1, 1, 2, 2]) - for cv 
in [loo, lpo, kf, skf, lolo, lopo, ss, ps]: - for train, test in cv: - assert_not_equal(np.asarray(train).dtype.kind, 'b') - assert_not_equal(np.asarray(train).dtype.kind, 'b') - X[train], X[test] - y[train], y[test] - - -def test_shufflesplit_errors(): - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1, - train_size=0.95) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) - assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, - train_size=None) - - -def test_shufflesplit_reproducible(): - # Check that iterating twice on the ShuffleSplit gives the same - # sequence of train-test when the random_state is given - ss = cval.ShuffleSplit(10, random_state=21) - assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) - - -def test_safe_split_with_precomputed_kernel(): - clf = SVC() - clfp = SVC(kernel="precomputed") - - iris = load_iris() - X, y = iris.data, iris.target - K = np.dot(X, X.T) - - cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0) - tr, te = list(cv)[0] - - X_tr, y_tr = cval._safe_split(clf, X, y, tr) - K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr) - assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T)) - - X_te, y_te = cval._safe_split(clf, X, y, te, tr) - K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr) - assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T)) - - -def test_cross_val_score_allow_nans(): - # Check that cross_val_score allows input data with NaNs - X = np.arange(200, dtype=np.float64).reshape(10, -1) - X[2, :] = np.nan - y = np.repeat([0, 1], X.shape[0] / 2) - p = Pipeline([ - ('imputer', Imputer(strategy='mean', missing_values='NaN')), - ('classifier', MockClassifier()), - ]) - cval.cross_val_score(p, X, y, cv=5) - - -def test_train_test_split_allow_nans(): - # Check that train_test_split allows input data with NaNs - X = np.arange(200, dtype=np.float64).reshape(10, -1) - X[2, :] = np.nan - y = np.repeat([0, 1], X.shape[0] / 2) - cval.train_test_split(X, y, test_size=0.2, random_state=42) - - -def test_permutation_test_score_allow_nans(): - # Check that permutation_test_score allows input data with NaNs - X = np.arange(200, dtype=np.float64).reshape(10, -1) - X[2, :] = np.nan - y = np.repeat([0, 1], X.shape[0] / 2) - p = Pipeline([ - ('imputer', Imputer(strategy='mean', missing_values='NaN')), - ('classifier', MockClassifier()), - ]) - cval.permutation_test_score(p, X, y, cv=5) - - -def test_check_cv_return_types(): - X = np.ones((9, 2)) - cv = cval.check_cv(3, X, classifier=False) - assert_true(isinstance(cv, cval.KFold)) - - y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) - cv = cval.check_cv(3, X, y_binary, classifier=True) - assert_true(isinstance(cv, cval.StratifiedKFold)) - - y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) - cv = cval.check_cv(3, X, y_multiclass, classifier=True) - assert_true(isinstance(cv, cval.StratifiedKFold)) - - X = np.ones((5, 2)) - y_multilabel = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 0]] - cv = cval.check_cv(3, X, y_multilabel, classifier=True) - assert_true(isinstance(cv, cval.KFold)) - - y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) - cv = cval.check_cv(3, X, y_multioutput, classifier=True) 
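
    # Aside: the kernel bookkeeping checked in
    # test_safe_split_with_precomputed_kernel above, sketched with plain
    # NumPy -- for a precomputed K = X.X^T the test block must be indexed
    # as K[test, train] so it matches dot(X_test, X_train.T).
    import numpy as np
    rng = np.random.RandomState(0)
    A = rng.rand(6, 3)
    K = np.dot(A, A.T)
    tr, te = np.array([0, 1, 2, 3]), np.array([4, 5])
    assert np.allclose(K[np.ix_(te, tr)], np.dot(A[te], A[tr].T))
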
- assert_true(isinstance(cv, cval.KFold)) - - -def test_cross_val_score_multilabel(): - X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1], - [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]]) - y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1], - [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]]) - clf = KNeighborsClassifier(n_neighbors=1) - scoring_micro = make_scorer(precision_score, average='micro') - scoring_macro = make_scorer(precision_score, average='macro') - scoring_samples = make_scorer(precision_score, average='samples') - score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5) - score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5) - score_samples = cval.cross_val_score(clf, X, y, - scoring=scoring_samples, cv=5) - assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]) - assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) - assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) - - -def test_cross_val_predict(): - boston = load_boston() - X, y = boston.data, boston.target - cv = cval.KFold(len(boston.target)) - - est = Ridge() - - # Naive loop (should be same as cross_val_predict): - preds2 = np.zeros_like(y) - for train, test in cv: - est.fit(X[train], y[train]) - preds2[test] = est.predict(X[test]) - - preds = cval.cross_val_predict(est, X, y, cv=cv) - assert_array_almost_equal(preds, preds2) - - preds = cval.cross_val_predict(est, X, y) - assert_equal(len(preds), len(y)) - - cv = cval.LeaveOneOut(len(y)) - preds = cval.cross_val_predict(est, X, y, cv=cv) - assert_equal(len(preds), len(y)) - - Xsp = X.copy() - Xsp *= (Xsp > np.median(Xsp)) - Xsp = coo_matrix(Xsp) - preds = cval.cross_val_predict(est, Xsp, y) - assert_array_almost_equal(len(preds), len(y)) - - preds = cval.cross_val_predict(KMeans(), X) - assert_equal(len(preds), len(y)) - - def bad_cv(): - for i in range(4): - yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) - - assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv()) - - -def test_cross_val_predict_input_types(): - clf = Ridge() - # Smoke test - predictions = cval.cross_val_predict(clf, X, y) - assert_equal(predictions.shape, (10,)) - - # test with multioutput y - with ignore_warnings(category=ConvergenceWarning): - predictions = cval.cross_val_predict(clf, X_sparse, X) - assert_equal(predictions.shape, (10, 2)) - - predictions = cval.cross_val_predict(clf, X_sparse, y) - assert_array_equal(predictions.shape, (10,)) - - # test with multioutput y - with ignore_warnings(category=ConvergenceWarning): - predictions = cval.cross_val_predict(clf, X_sparse, X) - assert_array_equal(predictions.shape, (10, 2)) - - # test with X and y as list - list_check = lambda x: isinstance(x, list) - clf = CheckingClassifier(check_X=list_check) - predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist()) - - clf = CheckingClassifier(check_y=list_check) - predictions = cval.cross_val_predict(clf, X, y.tolist()) - - # test with 3d X and - X_3d = X[:, :, np.newaxis] - check_3d = lambda x: x.ndim == 3 - clf = CheckingClassifier(check_X=check_3d) - predictions = cval.cross_val_predict(clf, X_3d, y) - assert_array_equal(predictions.shape, (10,)) - - -def test_cross_val_predict_pandas(): - # check cross_val_score doesn't destroy pandas dataframe - types = [(MockDataFrame, MockDataFrame)] - try: - from pandas import Series, DataFrame - types.append((Series, DataFrame)) - except ImportError: - pass - for TargetType, InputFeatureType in types: - # X dataframe, y series - X_df, y_ser = 
InputFeatureType(X), TargetType(y) - check_df = lambda x: isinstance(x, InputFeatureType) - check_series = lambda x: isinstance(x, TargetType) - clf = CheckingClassifier(check_X=check_df, check_y=check_series) - cval.cross_val_predict(clf, X_df, y_ser) - - -def test_sparse_fit_params(): - iris = load_iris() - X, y = iris.data, iris.target - clf = MockClassifier() - fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))} - a = cval.cross_val_score(clf, X, y, fit_params=fit_params) - assert_array_equal(a, np.ones(3)) - - -def test_check_is_partition(): - p = np.arange(100) - assert_true(cval._check_is_partition(p, 100)) - assert_false(cval._check_is_partition(np.delete(p, 23), 100)) - - p[0] = 23 - assert_false(cval._check_is_partition(p, 100)) - - -def test_cross_val_predict_sparse_prediction(): - # check that cross_val_predict gives same result for sparse and dense input - X, y = make_multilabel_classification(n_classes=2, n_labels=1, - allow_unlabeled=False, - return_indicator=True, - random_state=1) - X_sparse = csr_matrix(X) - y_sparse = csr_matrix(y) - classif = OneVsRestClassifier(SVC(kernel='linear')) - preds = cval.cross_val_predict(classif, X, y, cv=10) - preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10) - preds_sparse = preds_sparse.toarray() - assert_array_almost_equal(preds_sparse, preds) From 6dfe9aa732a6860ea0d24489b62efe98b289cd06 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 15:44:44 +0100 Subject: [PATCH 02/14] Fix imports (from corss_validation module to model_selection module) --- sklearn/feature_selection/rfe.py | 3 ++- sklearn/grid_search.py | 4 ++-- sklearn/learning_curve.py | 5 +++-- sklearn/tests/test_grid_search.py | 2 +- sklearn/tests/test_learning_curve.py | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 5bde9e57c3f9f..576c872982f5a 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -9,6 +9,7 @@ import numpy as np from ..utils import check_X_y, safe_sqr from ..utils.metaestimators import if_delegate_has_method +from ..utils.metaestimators import _safe_split from ..utils.validation import check_is_fitted from ..base import BaseEstimator from ..base import MetaEstimatorMixin @@ -16,7 +17,7 @@ from ..base import is_classifier from ..externals.joblib import Parallel, delayed from ..model_selection import check_cv -from ..model_selection._validation import _safe_split, _score +from ..model_selection._validation import _score from ..metrics.scorer import check_scoring from .base import SelectorMixin diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 76cdaa7cb1de5..e36d22c501621 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -21,8 +21,8 @@ from .base import BaseEstimator, is_classifier, clone from .base import MetaEstimatorMixin -from .cross_validation import check_cv -from .cross_validation import _fit_and_score +from .model_selection import check_cv +from .model_selection._validation import _fit_and_score from .externals.joblib import Parallel, delayed from .externals import six from .utils import check_random_state diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 5571138d68d83..0bb24046680ec 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -9,9 +9,10 @@ import numpy as np from .base import is_classifier, clone -from .cross_validation import check_cv +from .model_selection import check_cv from 
.externals.joblib import Parallel, delayed -from .cross_validation import _safe_split, _score, _fit_and_score +from .utils.metaestimators import _safe_split +from .model_selection._validation import _fit_and_score, _score from .metrics.scorer import check_scoring from .utils import indexable diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index f3c003e8c5be5..3605da1613e13 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -45,12 +45,12 @@ from sklearn.linear_model import Ridge from sklearn.exceptions import FitFailedWarning +from sklearn.model_selection import KFold, StratifiedKFold with warnings.catch_warnings(): warnings.simplefilter('ignore') from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV, ParameterGrid, ParameterSampler) - from sklearn.cross_validation import KFold, StratifiedKFold from sklearn.preprocessing import Imputer from sklearn.pipeline import Pipeline diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index afaae84b92b04..d75e6bc82f6b3 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -14,11 +14,11 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_false from sklearn.datasets import make_classification +from sklearn.model_selection import KFold with warnings.catch_warnings(): warnings.simplefilter('ignore') from sklearn.learning_curve import learning_curve, validation_curve - from sklearn.cross_validation import KFold from sklearn.linear_model import PassiveAggressiveClassifier From af424240be12734ef2a365fb4205892d32acd72d Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 15:45:40 +0100 Subject: [PATCH 03/14] Remove tests checking old implementation --- sklearn/model_selection/tests/test_split.py | 26 --------------------- 1 file changed, 26 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 3f54aaf3c66fc..0071129d8ce73 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1210,36 +1210,10 @@ def test_check_cv(): cv = check_cv(3, y_multioutput, classifier=True) np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) - # Check if the old style classes are wrapped to have a split method - X = np.ones(9) - y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) - cv1 = check_cv(3, y_multiclass, classifier=True) - - with warnings.catch_warnings(record=True): - from sklearn.cross_validation import StratifiedKFold as OldSKF - - cv2 = check_cv(OldSKF(y_multiclass, n_folds=3)) - np.testing.assert_equal(list(cv1.split(X, y_multiclass)), - list(cv2.split())) - assert_raises(ValueError, check_cv, cv="lolo") def test_cv_iterable_wrapper(): - y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) - - with warnings.catch_warnings(record=True): - from sklearn.cross_validation import StratifiedKFold as OldSKF - - cv = OldSKF(y_multiclass, n_folds=3) - wrapped_old_skf = _CVIterableWrapper(cv) - - # Check if split works correctly - np.testing.assert_equal(list(cv), list(wrapped_old_skf.split())) - - # Check if get_n_splits works correctly - assert_equal(len(cv), wrapped_old_skf.get_n_splits()) - kf_iter = KFold(n_splits=5).split(X, y) kf_iter_wrapped = check_cv(kf_iter) # Since the wrapped iterable is enlisted and stored, From 2362011efcbf6651ed6ce4c3cea2cafab67857e2 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: 
Thu, 9 Nov 2017 16:47:37 +0100 Subject: [PATCH 04/14] Remove grid_search and learning_curve also deprecated --- sklearn/__init__.py | 13 +- sklearn/grid_search.py | 1046 -------------------------- sklearn/learning_curve.py | 361 --------- sklearn/tests/test_grid_search.py | 815 -------------------- sklearn/tests/test_learning_curve.py | 312 -------- 5 files changed, 6 insertions(+), 2541 deletions(-) delete mode 100644 sklearn/grid_search.py delete mode 100644 sklearn/learning_curve.py delete mode 100644 sklearn/tests/test_grid_search.py delete mode 100644 sklearn/tests/test_learning_curve.py diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 27879e16be363..4c1f6f8e829e0 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -137,13 +137,12 @@ def config_context(**new_config): __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'externals', 'feature_extraction', 'feature_selection', - 'gaussian_process', 'grid_search', 'isotonic', - 'kernel_approximation', 'kernel_ridge', 'learning_curve', - 'linear_model', 'manifold', 'metrics', 'mixture', - 'model_selection', 'multiclass', 'multioutput', 'naive_bayes', - 'neighbors', 'neural_network', 'pipeline', 'preprocessing', - 'random_projection', 'semi_supervised', 'svm', 'tree', - 'discriminant_analysis', + 'gaussian_process', 'isotonic', 'kernel_approximation', + 'kernel_ridge', 'linear_model', 'manifold', 'metrics', + 'mixture', 'model_selection', 'multiclass', 'multioutput', + 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', + 'preprocessing', 'random_projection', 'semi_supervised', 'svm', + 'tree', 'discriminant_analysis', # Non-modules: 'clone'] diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py deleted file mode 100644 index e36d22c501621..0000000000000 --- a/sklearn/grid_search.py +++ /dev/null @@ -1,1046 +0,0 @@ -""" -The :mod:`sklearn.grid_search` includes utilities to fine-tune the parameters -of an estimator. -""" -from __future__ import print_function - -# Author: Alexandre Gramfort , -# Gael Varoquaux -# Andreas Mueller -# Olivier Grisel -# License: BSD 3 clause - -from abc import ABCMeta, abstractmethod -from collections import Mapping, namedtuple, Sized -from functools import partial, reduce -from itertools import product -import operator -import warnings - -import numpy as np - -from .base import BaseEstimator, is_classifier, clone -from .base import MetaEstimatorMixin -from .model_selection import check_cv -from .model_selection._validation import _fit_and_score -from .externals.joblib import Parallel, delayed -from .externals import six -from .utils import check_random_state -from .utils.random import sample_without_replacement -from .utils.validation import _num_samples, indexable -from .utils.metaestimators import if_delegate_has_method -from .metrics.scorer import check_scoring - - -__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', - 'ParameterSampler', 'RandomizedSearchCV'] - - -warnings.warn("This module was deprecated in version 0.18 in favor of the " - "model_selection module into which all the refactored classes " - "and functions are moved. This module will be removed in 0.20.", - DeprecationWarning) - - -class ParameterGrid(object): - """Grid of parameters with a discrete number of values for each. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.ParameterGrid` instead. 
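
(A minimal sketch of the drop-in replacement named in the deprecation note,
assuming scikit-learn >= 0.18; behaviour matches the examples below:)

    from sklearn.model_selection import ParameterGrid

    grid = ParameterGrid({'a': [1, 2], 'b': [True, False]})
    assert len(grid) == 4
    assert list(grid) == [{'a': 1, 'b': True}, {'a': 1, 'b': False},
                          {'a': 2, 'b': True}, {'a': 2, 'b': False}]
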
- - Can be used to iterate over parameter value combinations with the - Python built-in function iter. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - param_grid : dict of string to sequence, or sequence of such - The parameter grid to explore, as a dictionary mapping estimator - parameters to sequences of allowed values. - - An empty dict signifies default parameters. - - A sequence of dicts signifies a sequence of grids to search, and is - useful to avoid exploring parameter combinations that make no sense - or have no effect. See the examples below. - - Examples - -------- - >>> from sklearn.grid_search import ParameterGrid - >>> param_grid = {'a': [1, 2], 'b': [True, False]} - >>> list(ParameterGrid(param_grid)) == ( - ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, - ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) - True - - >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] - >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, - ... {'kernel': 'rbf', 'gamma': 1}, - ... {'kernel': 'rbf', 'gamma': 10}] - True - >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1} - True - - See also - -------- - :class:`GridSearchCV`: - uses ``ParameterGrid`` to perform a full parallelized parameter search. - """ - - def __init__(self, param_grid): - if isinstance(param_grid, Mapping): - # wrap dictionary in a singleton list to support either dict - # or list of dicts - param_grid = [param_grid] - self.param_grid = param_grid - - def __iter__(self): - """Iterate over the points in the grid. - - Returns - ------- - params : iterator over dict of string to any - Yields dictionaries mapping each estimator parameter to one of its - allowed values. - """ - for p in self.param_grid: - # Always sort the keys of a dictionary, for reproducibility - items = sorted(p.items()) - if not items: - yield {} - else: - keys, values = zip(*items) - for v in product(*values): - params = dict(zip(keys, v)) - yield params - - def __len__(self): - """Number of points on the grid.""" - # Product function that can handle iterables (np.product can't). - product = partial(reduce, operator.mul) - return sum(product(len(v) for v in p.values()) if p else 1 - for p in self.param_grid) - - def __getitem__(self, ind): - """Get the parameters that would be ``ind``th in iteration - - Parameters - ---------- - ind : int - The iteration index - - Returns - ------- - params : dict of string to any - Equal to list(self)[ind] - """ - # This is used to make discrete sampling without replacement memory - # efficient. - for sub_grid in self.param_grid: - # XXX: could memoize information used here - if not sub_grid: - if ind == 0: - return {} - else: - ind -= 1 - continue - - # Reverse so most frequent cycling parameter comes first - keys, values_lists = zip(*sorted(sub_grid.items())[::-1]) - sizes = [len(v_list) for v_list in values_lists] - total = np.product(sizes) - - if ind >= total: - # Try the next grid - ind -= total - else: - out = {} - for key, v_list, n in zip(keys, values_lists, sizes): - ind, offset = divmod(ind, n) - out[key] = v_list[offset] - return out - - raise IndexError('ParameterGrid index out of range') - - -class ParameterSampler(object): - """Generator on parameters sampled from given distributions. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.ParameterSampler` instead. - - Non-deterministic iterable over random candidate combinations for hyper- - parameter search. 
If all parameters are presented as a list, - sampling without replacement is performed. If at least one parameter - is given as a distribution, sampling with replacement is used. - It is highly recommended to use continuous distributions for continuous - parameters. - - Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept - a custom RNG instance and always use the singleton RNG from - ``numpy.random``. Hence setting ``random_state`` will not guarantee a - deterministic iteration whenever ``scipy.stats`` distributions are used to - define the parameter search space. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - param_distributions : dict - Dictionary where the keys are parameters and values - are distributions from which a parameter is to be sampled. - Distributions either have to provide a ``rvs`` function - to sample from them, or can be given as a list of values, - where a uniform distribution is assumed. - - n_iter : integer - Number of parameter settings that are produced. - - random_state : int, RandomState instance or None, optional (default=None) - Pseudo random number generator state used for random uniform sampling - from lists of possible values instead of scipy.stats distributions. - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Returns - ------- - params : dict of string to any - **Yields** dictionaries mapping each estimator parameter to - as sampled value. - - Examples - -------- - >>> from sklearn.grid_search import ParameterSampler - >>> from scipy.stats.distributions import expon - >>> import numpy as np - >>> np.random.seed(0) - >>> param_grid = {'a':[1, 2], 'b': expon()} - >>> param_list = list(ParameterSampler(param_grid, n_iter=4)) - >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) - ... for d in param_list] - >>> rounded_list == [{'b': 0.89856, 'a': 1}, - ... {'b': 0.923223, 'a': 1}, - ... {'b': 1.878964, 'a': 2}, - ... {'b': 1.038159, 'a': 2}] - True - """ - def __init__(self, param_distributions, n_iter, random_state=None): - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - - def __iter__(self): - # check if all distributions are given as lists - # in this case we want to sample without replacement - all_lists = np.all([not hasattr(v, "rvs") - for v in self.param_distributions.values()]) - rnd = check_random_state(self.random_state) - - if all_lists: - # look up sampled parameter settings in parameter grid - param_grid = ParameterGrid(self.param_distributions) - grid_size = len(param_grid) - - if grid_size < self.n_iter: - raise ValueError( - "The total space of parameters %d is smaller " - "than n_iter=%d." 
% (grid_size, self.n_iter) - + " For exhaustive searches, use GridSearchCV.") - for i in sample_without_replacement(grid_size, self.n_iter, - random_state=rnd): - yield param_grid[i] - - else: - # Always sort the keys of a dictionary, for reproducibility - items = sorted(self.param_distributions.items()) - for _ in six.moves.range(self.n_iter): - params = dict() - for k, v in items: - if hasattr(v, "rvs"): - params[k] = v.rvs() - else: - params[k] = v[rnd.randint(len(v))] - yield params - - def __len__(self): - """Number of points that will be sampled.""" - return self.n_iter - - -def fit_grid_point(X, y, estimator, parameters, train, test, scorer, - verbose, error_score='raise', **fit_params): - """Run fit on one set of parameters. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.fit_grid_point` instead. - - Parameters - ---------- - X : array-like, sparse matrix or list - Input data. - - y : array-like or None - Targets for input data. - - estimator : estimator object - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - parameters : dict - Parameters to be set on estimator for this grid point. - - train : ndarray, dtype int or bool - Boolean mask or indices for training set. - - test : ndarray, dtype int or bool - Boolean mask or indices for test set. - - scorer : callable or None. - If provided must be a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - verbose : int - Verbosity level. - - **fit_params : kwargs - Additional parameter passed to the fit function of the estimator. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - Returns - ------- - score : float - Score of this parameter setting on given training / test split. - - parameters : dict - The parameters that have been evaluated. - - n_samples_test : int - Number of test samples in this split. - """ - score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train, - test, verbose, parameters, - fit_params, error_score) - return score, parameters, n_samples_test - - -def _check_param_grid(param_grid): - if hasattr(param_grid, 'items'): - param_grid = [param_grid] - - for p in param_grid: - for name, v in p.items(): - if isinstance(v, np.ndarray) and v.ndim > 1: - raise ValueError("Parameter array should be one-dimensional.") - - check = [isinstance(v, k) for k in (list, tuple, np.ndarray)] - if True not in check: - raise ValueError("Parameter values for parameter ({0}) need " - "to be a sequence.".format(name)) - - if len(v) == 0: - raise ValueError("Parameter values for parameter ({0}) need " - "to be a non-empty sequence.".format(name)) - - -class _CVScoreTuple (namedtuple('_CVScoreTuple', - ('parameters', - 'mean_validation_score', - 'cv_validation_scores'))): - # A raw namedtuple is very memory efficient as it packs the attributes - # in a struct to get rid of the __dict__ of attributes in particular it - # does not copy the string for the keys on each instance. - # By deriving a namedtuple class just to introduce the __repr__ method we - # would also reintroduce the __dict__ on the instance. 
By telling the - # Python interpreter that this subclass uses static __slots__ instead of - # dynamic attributes. Furthermore we don't need any additional slot in the - # subclass so we set __slots__ to the empty tuple. - __slots__ = () - - def __repr__(self): - """Simple custom repr to summarize the main info""" - return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format( - self.mean_validation_score, - np.std(self.cv_validation_scores), - self.parameters) - - -class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, - MetaEstimatorMixin)): - """Base class for hyper parameter search with cross-validation.""" - - @abstractmethod - def __init__(self, estimator, scoring=None, - fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score='raise'): - - self.scoring = scoring - self.estimator = estimator - self.n_jobs = n_jobs - self.fit_params = fit_params if fit_params is not None else {} - self.iid = iid - self.refit = refit - self.cv = cv - self.verbose = verbose - self.pre_dispatch = pre_dispatch - self.error_score = error_score - - @property - def _estimator_type(self): - return self.estimator._estimator_type - - @property - def classes_(self): - return self.best_estimator_.classes_ - - def score(self, X, y=None): - """Returns the score on the given data, if the estimator has been refit. - - This uses the score defined by ``scoring`` where provided, and the - ``best_estimator_.score`` method otherwise. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Input data, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - Returns - ------- - score : float - - Notes - ----- - * The long-standing behavior of this method changed in version 0.16. - * It no longer uses the metric provided by ``estimator.score`` if the - ``scoring`` parameter was set when fitting. - - """ - if self.scorer_ is None: - raise ValueError("No score function explicitly defined, " - "and the estimator doesn't provide one %s" - % self.best_estimator_) - return self.scorer_(self.best_estimator_, X, y) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def predict(self, X): - """Call predict on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``predict``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.predict(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def predict_proba(self, X): - """Call predict_proba on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``predict_proba``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.predict_proba(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def predict_log_proba(self, X): - """Call predict_log_proba on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``predict_log_proba``. 
- - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.predict_log_proba(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def decision_function(self, X): - """Call decision_function on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``decision_function``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.decision_function(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def transform(self, X): - """Call transform on the estimator with the best found parameters. - - Only available if the underlying estimator supports ``transform`` and - ``refit=True``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.transform(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def inverse_transform(self, Xt): - """Call inverse_transform on the estimator with the best found parameters. - - Only available if the underlying estimator implements ``inverse_transform`` and - ``refit=True``. - - Parameters - ----------- - Xt : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.inverse_transform(Xt) - - def _fit(self, X, y, parameter_iterable): - """Actual fitting, performing the search over parameters.""" - - estimator = self.estimator - cv = self.cv - self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) - - n_samples = _num_samples(X) - X, y = indexable(X, y) - - if y is not None: - if len(y) != n_samples: - raise ValueError('Target variable (y) has a different number ' - 'of samples (%i) than data (X: %i samples)' - % (len(y), n_samples)) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - - if self.verbose > 0: - if isinstance(parameter_iterable, Sized): - n_candidates = len(parameter_iterable) - print("Fitting {0} folds for each of {1} candidates, totalling" - " {2} fits".format(len(cv), n_candidates, - n_candidates * len(cv))) - - base_estimator = clone(self.estimator) - - pre_dispatch = self.pre_dispatch - - out = Parallel( - n_jobs=self.n_jobs, verbose=self.verbose, - pre_dispatch=pre_dispatch - )( - delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, - train, test, self.verbose, parameters, - self.fit_params, return_parameters=True, - error_score=self.error_score) - for parameters in parameter_iterable - for train, test in cv) - - # Out is a list of triplet: score, estimator, n_test_samples - n_fits = len(out) - n_folds = len(cv) - - scores = list() - grid_scores = list() - for grid_start in range(0, n_fits, n_folds): - n_test_samples = 0 - score = 0 - all_scores = [] - for this_score, this_n_test_samples, _, parameters in \ - out[grid_start:grid_start + n_folds]: - all_scores.append(this_score) - if self.iid: - this_score *= this_n_test_samples - n_test_samples += this_n_test_samples - score += this_score - if self.iid: - score /= float(n_test_samples) - else: - score /= float(n_folds) - scores.append((score, parameters)) - # TODO: shall we also store the test_fold_sizes? 
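(A quick worked example of the ``iid`` weighting just above, with illustrative numbers: for two folds with test sizes 20 and 60 scoring 1.0 and 1/3, ``iid=True`` gives the sample-weighted mean (20 * 1.0 + 60 * 1/3) / 80 = 0.5, while ``iid=False`` gives the plain fold mean (1.0 + 1/3) / 2 ≈ 0.67.)
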
- grid_scores.append(_CVScoreTuple( - parameters, - score, - np.array(all_scores))) - # Store the computed scores - self.grid_scores_ = grid_scores - - # Find the best parameters by comparing on the mean validation score: - # note that `sorted` is deterministic in the way it breaks ties - best = sorted(grid_scores, key=lambda x: x.mean_validation_score, - reverse=True)[0] - self.best_params_ = best.parameters - self.best_score_ = best.mean_validation_score - - if self.refit: - # fit the best estimator using the entire dataset - # clone first to work around broken estimators - best_estimator = clone(base_estimator).set_params( - **best.parameters) - if y is not None: - best_estimator.fit(X, y, **self.fit_params) - else: - best_estimator.fit(X, **self.fit_params) - self.best_estimator_ = best_estimator - return self - - -class GridSearchCV(BaseSearchCV): - """Exhaustive search over specified parameter values for an estimator. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.GridSearchCV` instead. - - Important members are fit, predict. - - GridSearchCV implements a "fit" and a "score" method. - It also implements "predict", "predict_proba", "decision_function", - "transform" and "inverse_transform" if they are implemented in the - estimator used. - - The parameters of the estimator used to apply these methods are optimized - by cross-validated grid-search over a parameter grid. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object. - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - param_grid : dict or list of dictionaries - Dictionary with parameters names (string) as keys and lists of - parameter settings to try as values, or a list of such - dictionaries, in which case the grids spanned by each dictionary - in the list are explored. This enables searching over any sequence - of parameter settings. - - scoring : string, callable or None, default=None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - If ``None``, the ``score`` method of the estimator is used. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs: int, default: 1 : - The maximum number of estimators fit in parallel. - - - If -1 all CPUs are used. - - - If 1 is given, no parallel computing code is used at all, - which is useful for debugging. - - - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used. - For example, with ``n_jobs = -2`` all CPUs but one are used. - - .. versionchanged:: 0.17 - Upgraded to joblib 0.9.3. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, default=True - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - refit : boolean, default=True - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this GridSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - - Examples - -------- - >>> from sklearn import svm, grid_search, datasets - >>> iris = datasets.load_iris() - >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} - >>> svr = svm.SVC() - >>> clf = grid_search.GridSearchCV(svr, parameters) - >>> clf.fit(iris.data, iris.target) - ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - GridSearchCV(cv=None, error_score=..., - estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., - decision_function_shape='ovr', degree=..., gamma=..., - kernel='rbf', max_iter=-1, probability=False, - random_state=None, shrinking=True, tol=..., - verbose=False), - fit_params={}, iid=..., n_jobs=1, - param_grid=..., pre_dispatch=..., refit=..., - scoring=..., verbose=...) - - - Attributes - ---------- - grid_scores_ : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. - Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - best_estimator_ : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. Not available if refit=False. - - best_score_ : float - Score of best_estimator on the left out data. - - best_params_ : dict - Parameter setting that gave the best results on the hold out data. - - scorer_ : function - Scorer function used on the held out data to choose the best - parameters for the model. - - Notes - ------ - The parameters selected are those that maximize the score of the left out - data, unless an explicit score is passed in which case it is used instead. 
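The replacement class is a near drop-in; a rough sketch mirroring the doctest above, with the caveat that ``grid_scores_`` does not exist on the ``model_selection`` version, which reports per-candidate statistics through the ``cv_results_`` dict instead:

    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV

    iris = datasets.load_iris()
    clf = GridSearchCV(svm.SVC(), {'kernel': ('linear', 'rbf'), 'C': [1, 10]})
    clf.fit(iris.data, iris.target)
    print(clf.best_params_, clf.best_score_)
    print(clf.cv_results_['mean_test_score'])  # one mean score per candidate
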
- - If `n_jobs` was set to a value higher than one, the data is copied for each - point in the grid (and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - --------- - :class:`ParameterGrid`: - generates all the combinations of a hyperparameter grid. - - :func:`sklearn.cross_validation.train_test_split`: - utility function to split the data into a development set usable - for fitting a GridSearchCV instance and an evaluation set for - its final evaluation. - - :func:`sklearn.metrics.make_scorer`: - Make a scorer from a performance metric or loss function. - - """ - - def __init__(self, estimator, param_grid, scoring=None, fit_params=None, - n_jobs=1, iid=True, refit=True, cv=None, verbose=0, - pre_dispatch='2*n_jobs', error_score='raise'): - - super(GridSearchCV, self).__init__( - estimator, scoring, fit_params, n_jobs, iid, - refit, cv, verbose, pre_dispatch, error_score) - self.param_grid = param_grid - _check_param_grid(param_grid) - - def fit(self, X, y=None): - """Run fit with all sets of parameters. - - Parameters - ---------- - - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - return self._fit(X, y, ParameterGrid(self.param_grid)) - - -class RandomizedSearchCV(BaseSearchCV): - """Randomized search on hyper parameters. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.RandomizedSearchCV` instead. - - RandomizedSearchCV implements a "fit" and a "score" method. - It also implements "predict", "predict_proba", "decision_function", - "transform" and "inverse_transform" if they are implemented in the - estimator used. - - The parameters of the estimator used to apply these methods are optimized - by cross-validated search over parameter settings. - - In contrast to GridSearchCV, not all parameter values are tried out, but - rather a fixed number of parameter settings is sampled from the specified - distributions. The number of parameter settings that are tried is - given by n_iter. - - If all parameters are presented as a list, - sampling without replacement is performed. If at least one parameter - is given as a distribution, sampling with replacement is used. - It is highly recommended to use continuous distributions for continuous - parameters. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object. - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - param_distributions : dict - Dictionary with parameters names (string) as keys and distributions - or lists of parameters to try. Distributions must provide a ``rvs`` - method for sampling (such as those from scipy.stats.distributions). - If a list is given, it is sampled uniformly. - - n_iter : int, default=10 - Number of parameter settings that are sampled. 
n_iter trades - off runtime vs quality of the solution. - - scoring : string, callable or None, default=None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - If ``None``, the ``score`` method of the estimator is used. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs: int, default: 1 : - The maximum number of estimators fit in parallel. - - - If -1 all CPUs are used. - - - If 1 is given, no parallel computing code is used at all, - which is useful for debugging. - - - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used. - For example, with ``n_jobs = -2`` all CPUs but one are used. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, default=True - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - refit : boolean, default=True - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this RandomizedSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - random_state : int, RandomState instance or None, optional, default=None - Pseudo random number generator state used for random uniform sampling - from lists of possible values instead of scipy.stats distributions. - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - - Attributes - ---------- - grid_scores_ : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. 
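As with the grid search, migration is an import change; a hedged sketch of the ``model_selection`` replacement (estimator, dataset and distribution are illustrative), exercising the sampling-with-replacement path described above:

    from scipy.stats import expon
    from sklearn.datasets import load_iris
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    # a continuous distribution for C, so settings are sampled with replacement
    search = RandomizedSearchCV(SVC(), {'C': expon(scale=10)}, n_iter=5,
                                random_state=0)
    search.fit(X, y)
    print(search.best_params_)
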
- Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - best_estimator_ : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. Not available if refit=False. - - best_score_ : float - Score of best_estimator on the left out data. - - best_params_ : dict - Parameter setting that gave the best results on the hold out data. - - Notes - ----- - The parameters selected are those that maximize the score of the held-out - data, according to the scoring parameter. - - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - -------- - :class:`GridSearchCV`: - Does exhaustive search over a grid of parameters. - - :class:`ParameterSampler`: - A generator over parameter settings, constructed from - param_distributions. - - """ - - def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, - fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, - verbose=0, pre_dispatch='2*n_jobs', random_state=None, - error_score='raise'): - - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - super(RandomizedSearchCV, self).__init__( - estimator=estimator, scoring=scoring, fit_params=fit_params, - n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, - pre_dispatch=pre_dispatch, error_score=error_score) - - def fit(self, X, y=None): - """Run fit on the estimator with randomly drawn parameters. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples in the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - sampled_params = ParameterSampler(self.param_distributions, - self.n_iter, - random_state=self.random_state) - return self._fit(X, y, sampled_params) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py deleted file mode 100644 index 0bb24046680ec..0000000000000 --- a/sklearn/learning_curve.py +++ /dev/null @@ -1,361 +0,0 @@ -"""Utilities to evaluate models with respect to a variable -""" -# Author: Alexander Fabisch -# -# License: BSD 3 clause - -import warnings - -import numpy as np - -from .base import is_classifier, clone -from .model_selection import check_cv -from .externals.joblib import Parallel, delayed -from .utils.metaestimators import _safe_split -from .model_selection._validation import _fit_and_score, _score -from .metrics.scorer import check_scoring -from .utils import indexable - - -warnings.warn("This module was deprecated in version 0.18 in favor of the " - "model_selection module into which all the functions are moved." 
- " This module will be removed in 0.20", - DeprecationWarning) - - -__all__ = ['learning_curve', 'validation_curve'] - - -def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), - cv=None, scoring=None, exploit_incremental_learning=False, - n_jobs=1, pre_dispatch="all", verbose=0, - error_score='raise'): - """Learning curve. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.learning_curve` instead. - - Determines cross-validated training and test scores for different training - set sizes. - - A cross-validation generator splits the whole dataset k times in training - and test data. Subsets of the training set with varying sizes will be used - to train the estimator and a score for each training subset size and the - test set will be computed. Afterwards, the scores will be averaged over - all k runs for each training subset size. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - train_sizes : array-like, shape (n_ticks,), dtype float or int - Relative or absolute numbers of training examples that will be used to - generate the learning curve. If the dtype is float, it is regarded as a - fraction of the maximum size of the training set (that is determined - by the selected validation method), i.e. it has to be within (0, 1]. - Otherwise it is interpreted as absolute sizes of the training sets. - Note that for classification the number of samples usually have to - be big enough to contain at least one sample from each class. - (default: np.linspace(0.1, 1.0, 5)) - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - exploit_incremental_learning : boolean, optional, default: False - If the estimator supports incremental learning, this will be - used to speed up fitting for different training set sizes. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. 
- - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - Returns - ------- - train_sizes_abs : array, shape = (n_unique_ticks,), dtype int - Numbers of training examples that has been used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. - - Notes - ----- - See :ref:`examples/model_selection/plot_learning_curve.py - ` - """ - if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): - raise ValueError("An estimator must support the partial_fit interface " - "to exploit incremental learning") - - X, y = indexable(X, y) - # Make a list since we will be iterating multiple times over the folds - cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator))) - scorer = check_scoring(estimator, scoring=scoring) - - # HACK as long as boolean indices are allowed in cv generators - if cv[0][0].dtype == bool: - new_cv = [] - for i in range(len(cv)): - new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0])) - cv = new_cv - - n_max_training_samples = len(cv[0][0]) - # Because the lengths of folds can be significantly different, it is - # not guaranteed that we use all of the available training data when we - # use the first 'n_max_training_samples' samples. - train_sizes_abs = _translate_train_sizes(train_sizes, - n_max_training_samples) - n_unique_ticks = train_sizes_abs.shape[0] - if verbose > 0: - print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - if exploit_incremental_learning: - classes = np.unique(y) if is_classifier(estimator) else None - out = parallel(delayed(_incremental_fit_estimator)( - clone(estimator), X, y, classes, train, test, train_sizes_abs, - scorer, verbose) for train, test in cv) - else: - out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, scorer, train[:n_train_samples], test, - verbose, parameters=None, fit_params=None, return_train_score=True, - error_score=error_score) - for train, test in cv for n_train_samples in train_sizes_abs) - out = np.array(out)[:, :2] - n_cv_folds = out.shape[0] // n_unique_ticks - out = out.reshape(n_cv_folds, n_unique_ticks, 2) - - out = np.asarray(out).transpose((2, 1, 0)) - - return train_sizes_abs, out[0], out[1] - - -def _translate_train_sizes(train_sizes, n_max_training_samples): - """Determine absolute sizes of training subsets and validate 'train_sizes'. - - Examples: - _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] - _translate_train_sizes([5, 10], 10) -> [5, 10] - - Parameters - ---------- - train_sizes : array-like, shape (n_ticks,), dtype float or int - Numbers of training examples that will be used to generate the - learning curve. If the dtype is float, it is regarded as a - fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. - - n_max_training_samples : int - Maximum number of training samples (upper bound of 'train_sizes'). 
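The same fraction-versus-absolute translation is observable through the public replacement; a small sketch, assuming iris under 3-fold stratified CV so that the largest training split holds 100 samples:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import learning_curve
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    # floats in (0, 1] are scaled by the largest training-split size (100 here);
    # integers would be used as absolute sizes, then deduplicated
    sizes, train_scores, test_scores = learning_curve(
        DecisionTreeClassifier(random_state=0), X, y,
        train_sizes=np.linspace(0.1, 1.0, 5), cv=3)
    print(sizes)  # e.g. [ 10  32  55  77 100]
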
- - Returns - ------- - train_sizes_abs : array, shape (n_unique_ticks,), dtype int - Numbers of training examples that will be used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - """ - train_sizes_abs = np.asarray(train_sizes) - n_ticks = train_sizes_abs.shape[0] - n_min_required_samples = np.min(train_sizes_abs) - n_max_required_samples = np.max(train_sizes_abs) - if np.issubdtype(train_sizes_abs.dtype, np.floating): - if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("train_sizes has been interpreted as fractions " - "of the maximum number of training samples and " - "must be within (0, 1], but is within [%f, %f]." - % (n_min_required_samples, - n_max_required_samples)) - train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( - dtype=np.int, copy=False) - train_sizes_abs = np.clip(train_sizes_abs, 1, - n_max_training_samples) - else: - if (n_min_required_samples <= 0 or - n_max_required_samples > n_max_training_samples): - raise ValueError("train_sizes has been interpreted as absolute " - "numbers of training samples and must be within " - "(0, %d], but is within [%d, %d]." - % (n_max_training_samples, - n_min_required_samples, - n_max_required_samples)) - - train_sizes_abs = np.unique(train_sizes_abs) - if n_ticks > train_sizes_abs.shape[0]: - warnings.warn("Removed duplicate entries from 'train_sizes'. Number " - "of ticks will be less than the size of " - "'train_sizes' %d instead of %d)." - % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) - - return train_sizes_abs - - -def _incremental_fit_estimator(estimator, X, y, classes, train, test, - train_sizes, scorer, verbose): - """Train estimator on training subsets incrementally and compute scores.""" - train_scores, test_scores = [], [] - partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) - for n_train_samples, partial_train in partitions: - train_subset = train[:n_train_samples] - X_train, y_train = _safe_split(estimator, X, y, train_subset) - X_partial_train, y_partial_train = _safe_split(estimator, X, y, - partial_train) - X_test, y_test = _safe_split(estimator, X, y, test, train_subset) - if y_partial_train is None: - estimator.partial_fit(X_partial_train, classes=classes) - else: - estimator.partial_fit(X_partial_train, y_partial_train, - classes=classes) - train_scores.append(_score(estimator, X_train, y_train, scorer)) - test_scores.append(_score(estimator, X_test, y_test, scorer)) - return np.array((train_scores, test_scores)).T - - -def validation_curve(estimator, X, y, param_name, param_range, cv=None, - scoring=None, n_jobs=1, pre_dispatch="all", verbose=0): - """Validation curve. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.validation_curve` instead. - - Determine training and test scores for varying parameter values. - - Compute scores for an estimator with different values of a specified - parameter. This is similar to grid search with one parameter. However, this - will also compute training scores and is merely a utility for plotting the - results. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. 
- - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - param_name : string - Name of the parameter that will be varied. - - param_range : array-like, shape (n_values,) - The values of the parameter that will be evaluated. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. - - Returns - ------- - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. 
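The ``model_selection`` replacement keeps the same core signature and the (n_ticks, n_cv_folds) return shapes; a brief sketch with an illustrative estimator and parameter range:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import validation_curve
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    train_scores, test_scores = validation_curve(
        SVC(), X, y, param_name='gamma',
        param_range=np.logspace(-6, -1, 5), cv=3)
    print(test_scores.mean(axis=1))  # one mean validation score per gamma
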
- - Notes - ----- - See - :ref:`examples/model_selection/plot_validation_curve.py - ` - """ - X, y = indexable(X, y) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, scorer, train, test, verbose, - parameters={param_name: v}, fit_params=None, return_train_score=True) - for train, test in cv for v in param_range) - - out = np.asarray(out)[:, :2] - n_params = len(param_range) - n_cv_folds = out.shape[0] // n_params - out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) - - return out[0], out[1] diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py deleted file mode 100644 index 3605da1613e13..0000000000000 --- a/sklearn/tests/test_grid_search.py +++ /dev/null @@ -1,815 +0,0 @@ -""" -Testing for grid search module (sklearn.grid_search) - -""" - -from collections import Iterable, Sized -from sklearn.externals.six.moves import cStringIO as StringIO -from sklearn.externals.six.moves import xrange -from itertools import chain, product -import pickle -import warnings -import sys - -import numpy as np -import scipy.sparse as sp - -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_raise_message -from sklearn.utils.testing import assert_false, assert_true -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_no_warnings -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.mocking import CheckingClassifier, MockDataFrame - -from scipy.stats import bernoulli, expon, uniform - -from sklearn.externals.six.moves import zip -from sklearn.base import BaseEstimator -from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs -from sklearn.datasets import make_multilabel_classification -from sklearn.svm import LinearSVC, SVC -from sklearn.tree import DecisionTreeRegressor -from sklearn.tree import DecisionTreeClassifier -from sklearn.cluster import KMeans -from sklearn.neighbors import KernelDensity -from sklearn.metrics import f1_score -from sklearn.metrics import make_scorer -from sklearn.metrics import roc_auc_score -from sklearn.linear_model import Ridge - -from sklearn.exceptions import FitFailedWarning -from sklearn.model_selection import KFold, StratifiedKFold - -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV, - ParameterGrid, ParameterSampler) - -from sklearn.preprocessing import Imputer -from sklearn.pipeline import Pipeline - - -# Neither of the following two estimators inherit from BaseEstimator, -# to test hyperparameter search on user-defined classifiers. 
-class MockClassifier(object): - """Dummy classifier to test the cross-validation""" - def __init__(self, foo_param=0): - self.foo_param = foo_param - - def fit(self, X, Y): - assert_true(len(X) == len(Y)) - return self - - def predict(self, T): - return T.shape[0] - - def transform(self, X): - return X - self.foo_param - - def inverse_transform(self, X): - return X + self.foo_param - - predict_proba = predict - decision_function = predict - - def score(self, X=None, Y=None): - if self.foo_param > 1: - score = 1. - else: - score = 0. - return score - - def get_params(self, deep=False): - return {'foo_param': self.foo_param} - - def set_params(self, **params): - self.foo_param = params['foo_param'] - return self - - -class LinearSVCNoScore(LinearSVC): - """An LinearSVC classifier that has no score method.""" - @property - def score(self): - raise AttributeError - -X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) -y = np.array([1, 1, 2, 2]) - - -def assert_grid_iter_equals_getitem(grid): - assert_equal(list(grid), [grid[i] for i in range(len(grid))]) - - -def test_parameter_grid(): - # Test basic properties of ParameterGrid. - params1 = {"foo": [1, 2, 3]} - grid1 = ParameterGrid(params1) - assert_true(isinstance(grid1, Iterable)) - assert_true(isinstance(grid1, Sized)) - assert_equal(len(grid1), 3) - assert_grid_iter_equals_getitem(grid1) - - params2 = {"foo": [4, 2], - "bar": ["ham", "spam", "eggs"]} - grid2 = ParameterGrid(params2) - assert_equal(len(grid2), 6) - - # loop to assert we can iterate over the grid multiple times - for i in xrange(2): - # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2) - points = set(tuple(chain(*(sorted(p.items())))) for p in grid2) - assert_equal(points, - set(("bar", x, "foo", y) - for x, y in product(params2["bar"], params2["foo"]))) - - assert_grid_iter_equals_getitem(grid2) - - # Special case: empty grid (useful to get default estimator settings) - empty = ParameterGrid({}) - assert_equal(len(empty), 1) - assert_equal(list(empty), [{}]) - assert_grid_iter_equals_getitem(empty) - assert_raises(IndexError, lambda: empty[1]) - - has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}]) - assert_equal(len(has_empty), 4) - assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}, {'C': .5}]) - assert_grid_iter_equals_getitem(has_empty) - - -def test_grid_search(): - # Test that the best estimator contains the right value for foo_param - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) - # make sure it selects the smallest parameter in case of ties - old_stdout = sys.stdout - sys.stdout = StringIO() - grid_search.fit(X, y) - sys.stdout = old_stdout - assert_equal(grid_search.best_estimator_.foo_param, 2) - - for i, foo_i in enumerate([1, 2, 3]): - assert_true(grid_search.grid_scores_[i][0] - == {'foo_param': foo_i}) - # Smoke test the score etc: - grid_search.score(X, y) - grid_search.predict_proba(X) - grid_search.decision_function(X) - grid_search.transform(X) - - # Test exception handling on scoring - grid_search.scoring = 'sklearn' - assert_raises(ValueError, grid_search.fit, X, y) - - -def test_transform_inverse_transform_round_trip(): - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) - grid_search.fit(X, y) - X_round_trip = grid_search.inverse_transform(grid_search.transform(X)) - assert_array_equal(X, X_round_trip) - - -@ignore_warnings -def test_grid_search_no_score(): - # Test grid-search on classifier that has no score function. 
- clf = LinearSVC(random_state=0) - X, y = make_blobs(random_state=0, centers=2) - Cs = [.1, 1, 10] - clf_no_score = LinearSVCNoScore(random_state=0) - grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy') - grid_search.fit(X, y) - - grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs}, - scoring='accuracy') - # smoketest grid search - grid_search_no_score.fit(X, y) - - # check that best params are equal - assert_equal(grid_search_no_score.best_params_, grid_search.best_params_) - # check that we can call score and that it gives the correct result - assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y)) - - # giving no scoring function raises an error - grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs}) - assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit, - [[1]]) - - -def test_grid_search_score_method(): - X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, - random_state=0) - clf = LinearSVC(random_state=0) - grid = {'C': [.1]} - - search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) - search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y) - search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid, - scoring='roc_auc').fit(X, y) - search_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y) - - # ChangedBehaviourWarning occurred previously (prior to #9005) - score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y) - score_accuracy = assert_no_warnings(search_accuracy.score, X, y) - score_no_score_auc = assert_no_warnings(search_no_score_method_auc.score, - X, y) - score_auc = assert_no_warnings(search_auc.score, X, y) - - # ensure the test is sane - assert_true(score_auc < 1.0) - assert_true(score_accuracy < 1.0) - assert_not_equal(score_auc, score_accuracy) - - assert_almost_equal(score_accuracy, score_no_scoring) - assert_almost_equal(score_auc, score_no_score_auc) - - -def test_trivial_grid_scores(): - # Test search over a "grid" with only one point. - # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV. - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1]}) - grid_search.fit(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1) - random_search.fit(X, y) - assert_true(hasattr(random_search, "grid_scores_")) - - -def test_no_refit(): - # Test that grid search can be used for model selection only - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False) - grid_search.fit(X, y) - assert_true(hasattr(grid_search, "best_params_")) - - -def test_grid_search_error(): - # Test that grid search will capture errors on data with different - # length - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - assert_raises(ValueError, cv.fit, X_[:180], y_) - - -def test_grid_search_iid(): - # test the iid parameter - # noise-free simple 2d-data - X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, - cluster_std=0.1, shuffle=False, n_samples=80) - # split dataset into two folds that are not iid - # first one contains data of all 4 blobs, second only from two. 
- mask = np.ones(X.shape[0], dtype=np.bool) - mask[np.where(y == 1)[0][::2]] = 0 - mask[np.where(y == 2)[0][::2]] = 0 - # this leads to perfect classification on one fold and a score of 1/3 on - # the other - svm = SVC(kernel='linear') - # create "cv" for splits - cv = [[mask, ~mask], [~mask, mask]] - # once with iid=True (default) - grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv) - grid_search.fit(X, y) - first = grid_search.grid_scores_[0] - assert_equal(first.parameters['C'], 1) - assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.]) - # for first split, 1/4 of dataset is in test, for second 3/4. - # take weighted average - assert_almost_equal(first.mean_validation_score, - 1 * 1. / 4. + 1. / 3. * 3. / 4.) - - # once with iid=False - grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv, - iid=False) - grid_search.fit(X, y) - first = grid_search.grid_scores_[0] - assert_equal(first.parameters['C'], 1) - # scores are the same as above - assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.]) - # averaged score is just mean of scores - assert_almost_equal(first.mean_validation_score, - np.mean(first.cv_validation_scores)) - - -def test_grid_search_one_grid_point(): - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} - - clf = SVC() - cv = GridSearchCV(clf, param_dict) - cv.fit(X_, y_) - - clf = SVC(C=1.0, kernel="rbf", gamma=0.1) - clf.fit(X_, y_) - - assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_) - - -def test_grid_search_bad_param_grid(): - param_dict = {"C": 1.0} - clf = SVC() - assert_raises(ValueError, GridSearchCV, clf, param_dict) - - param_dict = {"C": []} - clf = SVC() - assert_raises(ValueError, GridSearchCV, clf, param_dict) - - param_dict = {"C": np.ones(6).reshape(3, 2)} - clf = SVC() - assert_raises(ValueError, GridSearchCV, clf, param_dict) - - -def test_grid_search_sparse(): - # Test that grid search works with both dense and sparse matrices - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - cv.fit(X_[:180], y_[:180]) - y_pred = cv.predict(X_[180:]) - C = cv.best_estimator_.C - - X_ = sp.csr_matrix(X_) - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - cv.fit(X_[:180].tocoo(), y_[:180]) - y_pred2 = cv.predict(X_[180:]) - C2 = cv.best_estimator_.C - - assert_true(np.mean(y_pred == y_pred2) >= .9) - assert_equal(C, C2) - - -def test_grid_search_sparse_scoring(): - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") - cv.fit(X_[:180], y_[:180]) - y_pred = cv.predict(X_[180:]) - C = cv.best_estimator_.C - - X_ = sp.csr_matrix(X_) - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") - cv.fit(X_[:180], y_[:180]) - y_pred2 = cv.predict(X_[180:]) - C2 = cv.best_estimator_.C - - assert_array_equal(y_pred, y_pred2) - assert_equal(C, C2) - # Smoke test the score - # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), - # cv.score(X_[:180], y[:180])) - - # test loss where greater is worse - def f1_loss(y_true_, y_pred_): - return -f1_score(y_true_, y_pred_) - F1Loss = make_scorer(f1_loss, greater_is_better=False) - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss) - cv.fit(X_[:180], y_[:180]) - y_pred3 = cv.predict(X_[180:]) - C3 = cv.best_estimator_.C - - 
assert_equal(C, C3) - assert_array_equal(y_pred, y_pred3) - - -def test_grid_search_precomputed_kernel(): - # Test that grid search works when the input features are given in the - # form of a precomputed kernel matrix - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - # compute the training kernel matrix corresponding to the linear kernel - K_train = np.dot(X_[:180], X_[:180].T) - y_train = y_[:180] - - clf = SVC(kernel='precomputed') - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - cv.fit(K_train, y_train) - - assert_true(cv.best_score_ >= 0) - - # compute the test kernel matrix - K_test = np.dot(X_[180:], X_[:180].T) - y_test = y_[180:] - - y_pred = cv.predict(K_test) - - assert_true(np.mean(y_pred == y_test) >= 0) - - # test error is raised when the precomputed kernel is not array-like - # or sparse - assert_raises(ValueError, cv.fit, K_train.tolist(), y_train) - - -def test_grid_search_precomputed_kernel_error_nonsquare(): - # Test that grid search returns an error with a non-square precomputed - # training kernel matrix - K_train = np.zeros((10, 20)) - y_train = np.ones((10, )) - clf = SVC(kernel='precomputed') - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - assert_raises(ValueError, cv.fit, K_train, y_train) - - -def test_grid_search_precomputed_kernel_error_kernel_function(): - # Test that grid search returns an error when using a kernel_function - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - kernel_function = lambda x1, x2: np.dot(x1, x2.T) - clf = SVC(kernel=kernel_function) - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - assert_raises(ValueError, cv.fit, X_, y_) - - -class BrokenClassifier(BaseEstimator): - """Broken classifier that cannot be fit twice""" - - def __init__(self, parameter=None): - self.parameter = parameter - - def fit(self, X, y): - assert_true(not hasattr(self, 'has_been_fit_')) - self.has_been_fit_ = True - - def predict(self, X): - return np.zeros(X.shape[0]) - - -@ignore_warnings -def test_refit(): - # Regression test for bug in refitting - # Simulates re-fitting a broken estimator; this used to break with - # sparse SVMs. 
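
An estimator whose ``fit`` may only run once survives a grid search because the search never reuses a single instance: every candidate/fold fit happens on a fresh ``sklearn.base.clone`` of the estimator, and the final refit uses one more clone. A minimal sketch of that contract::

    from sklearn.base import clone
    from sklearn.svm import LinearSVC

    est = LinearSVC(C=10)
    fresh = clone(est)  # unfitted copy carrying the same parameters

    assert fresh is not est
    assert fresh.get_params() == est.get_params()
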
- X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - clf = GridSearchCV(BrokenClassifier(), [{'parameter': [0, 1]}], - scoring="precision", refit=True) - clf.fit(X, y) - - -def test_gridsearch_nd(): - # Pass X as list in GridSearchCV - X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) - y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) - check_X = lambda x: x.shape[1:] == (5, 3, 2) - check_y = lambda x: x.shape[1:] == (7, 11) - clf = CheckingClassifier(check_X=check_X, check_y=check_y) - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}) - grid_search.fit(X_4d, y_3d).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_X_as_list(): - # Pass X as list in GridSearchCV - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - clf = CheckingClassifier(check_X=lambda x: isinstance(x, list)) - cv = KFold(n=len(X), n_folds=3) - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) - grid_search.fit(X.tolist(), y).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_y_as_list(): - # Pass y as list in GridSearchCV - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - clf = CheckingClassifier(check_y=lambda x: isinstance(x, list)) - cv = KFold(n=len(X), n_folds=3) - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) - grid_search.fit(X, y.tolist()).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_pandas_input(): - # check cross_val_score doesn't destroy pandas dataframe - types = [(MockDataFrame, MockDataFrame)] - try: - from pandas import Series, DataFrame - types.append((DataFrame, Series)) - except ImportError: - pass - - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - for InputFeatureType, TargetType in types: - # X dataframe, y series - X_df, y_ser = InputFeatureType(X), TargetType(y) - check_df = lambda x: isinstance(x, InputFeatureType) - check_series = lambda x: isinstance(x, TargetType) - clf = CheckingClassifier(check_X=check_df, check_y=check_series) - - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}) - grid_search.fit(X_df, y_ser).score(X_df, y_ser) - grid_search.predict(X_df) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_unsupervised_grid_search(): - # test grid-search with unsupervised estimator - X, y = make_blobs(random_state=0) - km = KMeans(random_state=0) - grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]), - scoring='adjusted_rand_score') - grid_search.fit(X, y) - # ARI can find the right number :) - assert_equal(grid_search.best_params_["n_clusters"], 3) - - # Now without a score, and without y - grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4])) - grid_search.fit(X) - assert_equal(grid_search.best_params_["n_clusters"], 4) - - -def test_gridsearch_no_predict(): - # test grid-search with an estimator without predict. 
- # slight duplication of a test from KDE - def custom_scoring(estimator, X): - return 42 if estimator.bandwidth == .1 else 0 - X, _ = make_blobs(cluster_std=.1, random_state=1, - centers=[[0, 1], [1, 0], [0, 0]]) - search = GridSearchCV(KernelDensity(), - param_grid=dict(bandwidth=[.01, .1, 1]), - scoring=custom_scoring) - search.fit(X) - assert_equal(search.best_params_['bandwidth'], .1) - assert_equal(search.best_score_, 42) - - -def test_param_sampler(): - # test basic properties of param sampler - param_distributions = {"kernel": ["rbf", "linear"], - "C": uniform(0, 1)} - sampler = ParameterSampler(param_distributions=param_distributions, - n_iter=10, random_state=0) - samples = [x for x in sampler] - assert_equal(len(samples), 10) - for sample in samples: - assert_true(sample["kernel"] in ["rbf", "linear"]) - assert_true(0 <= sample["C"] <= 1) - - -def test_randomized_search_grid_scores(): - # Make a dataset with a lot of noise to get various kind of prediction - # errors across CV folds and parameter settings - X, y = make_classification(n_samples=200, n_features=100, n_informative=3, - random_state=0) - - # XXX: as of today (scipy 0.12) it's not possible to set the random seed - # of scipy.stats distributions: the assertions in this test should thus - # not depend on the randomization - params = dict(C=expon(scale=10), - gamma=expon(scale=0.1)) - n_cv_iter = 3 - n_search_iter = 30 - search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter, - param_distributions=params, iid=False) - search.fit(X, y) - assert_equal(len(search.grid_scores_), n_search_iter) - - # Check consistency of the structure of each cv_score item - for cv_score in search.grid_scores_: - assert_equal(len(cv_score.cv_validation_scores), n_cv_iter) - # Because we set iid to False, the mean_validation score is the - # mean of the fold mean scores instead of the aggregate sample-wise - # mean score - assert_almost_equal(np.mean(cv_score.cv_validation_scores), - cv_score.mean_validation_score) - assert_equal(list(sorted(cv_score.parameters.keys())), - list(sorted(params.keys()))) - - # Check the consistency with the best_score_ and best_params_ attributes - sorted_grid_scores = list(sorted(search.grid_scores_, - key=lambda x: x.mean_validation_score)) - best_score = sorted_grid_scores[-1].mean_validation_score - assert_equal(search.best_score_, best_score) - - tied_best_params = [s.parameters for s in sorted_grid_scores - if s.mean_validation_score == best_score] - assert_true(search.best_params_ in tied_best_params, - "best_params_={0} is not part of the" - " tied best models: {1}".format( - search.best_params_, tied_best_params)) - - -def test_grid_search_score_consistency(): - # test that correct scores are used - clf = LinearSVC(random_state=0) - X, y = make_blobs(random_state=0, centers=2) - Cs = [.1, 1, 10] - for score in ['f1', 'roc_auc']: - grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score) - grid_search.fit(X, y) - cv = StratifiedKFold(n_folds=3, y=y) - for C, scores in zip(Cs, grid_search.grid_scores_): - clf.set_params(C=C) - scores = scores[2] # get the separate runs from grid scores - i = 0 - for train, test in cv: - clf.fit(X[train], y[train]) - if score == "f1": - correct_score = f1_score(y[test], clf.predict(X[test])) - elif score == "roc_auc": - dec = clf.decision_function(X[test]) - correct_score = roc_auc_score(y[test], dec) - assert_almost_equal(correct_score, scores[i]) - i += 1 - - -def test_pickle(): - # Test that a fit search can be pickled - clf = MockClassifier() - 
grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True) - grid_search.fit(X, y) - pickle.dumps(grid_search) # smoke test - - random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]}, - refit=True, n_iter=3) - random_search.fit(X, y) - pickle.dumps(random_search) # smoke test - - -def test_grid_search_with_multioutput_data(): - # Test search with multi-output estimator - - X, y = make_multilabel_classification(random_state=0) - - est_parameters = {"max_depth": [1, 2, 3, 4]} - cv = KFold(y.shape[0], random_state=0) - - estimators = [DecisionTreeRegressor(random_state=0), - DecisionTreeClassifier(random_state=0)] - - # Test with grid search cv - for est in estimators: - grid_search = GridSearchCV(est, est_parameters, cv=cv) - grid_search.fit(X, y) - for parameters, _, cv_validation_scores in grid_search.grid_scores_: - est.set_params(**parameters) - - for i, (train, test) in enumerate(cv): - est.fit(X[train], y[train]) - correct_score = est.score(X[test], y[test]) - assert_almost_equal(correct_score, - cv_validation_scores[i]) - - # Test with a randomized search - for est in estimators: - random_search = RandomizedSearchCV(est, est_parameters, - cv=cv, n_iter=3) - random_search.fit(X, y) - for parameters, _, cv_validation_scores in random_search.grid_scores_: - est.set_params(**parameters) - - for i, (train, test) in enumerate(cv): - est.fit(X[train], y[train]) - correct_score = est.score(X[test], y[test]) - assert_almost_equal(correct_score, - cv_validation_scores[i]) - - -def test_predict_proba_disabled(): - # Test predict_proba when disabled on estimator. - X = np.arange(20).reshape(5, -1) - y = [0, 0, 1, 1, 1] - clf = SVC(probability=False) - gs = GridSearchCV(clf, {}, cv=2).fit(X, y) - assert_false(hasattr(gs, "predict_proba")) - - -def test_grid_search_allows_nans(): - # Test GridSearchCV with Imputer - X = np.arange(20, dtype=np.float64).reshape(5, -1) - X[2, :] = np.nan - y = [0, 0, 1, 1, 1] - p = Pipeline([ - ('imputer', Imputer(strategy='mean', missing_values='NaN')), - ('classifier', MockClassifier()), - ]) - GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y) - - -class FailingClassifier(BaseEstimator): - """Classifier that raises a ValueError on fit()""" - - FAILING_PARAMETER = 2 - - def __init__(self, parameter=None): - self.parameter = parameter - - def fit(self, X, y=None): - if self.parameter == FailingClassifier.FAILING_PARAMETER: - raise ValueError("Failing classifier failed as required") - - def predict(self, X): - return np.zeros(X.shape[0]) - - -def test_grid_search_failing_classifier(): - # GridSearchCV with on_error != 'raise' - # Ensures that a warning is raised and score reset where appropriate. - - X, y = make_classification(n_samples=20, n_features=10, random_state=0) - - clf = FailingClassifier() - - # refit=False because we only want to check that errors caused by fits - # to individual folds will be caught and warnings raised instead. If - # refit was done, then an exception would be raised on refit and not - # caught by grid_search (expected behavior), and this would cause an - # error in this test. - gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', - refit=False, error_score=0.0) - - assert_warns(FitFailedWarning, gs.fit, X, y) - - # Ensure that grid scores were set to zero as required for those fits - # that are expected to fail. 
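
Concretely, ``error_score`` substitutes the given value for the score of every failed fit while a ``FitFailedWarning`` is emitted, and the same contract carries over to the replacement module. A minimal sketch against ``sklearn.model_selection.GridSearchCV``, with a hypothetical ``Failing`` estimator mirroring the one above::

    import warnings

    import numpy as np
    from sklearn.base import BaseEstimator
    from sklearn.model_selection import GridSearchCV

    class Failing(BaseEstimator):
        # hypothetical estimator that fails for one parameter value
        def __init__(self, parameter=None):
            self.parameter = parameter

        def fit(self, X, y=None):
            if self.parameter == 2:
                raise ValueError("failing as required")
            return self

        def score(self, X, y=None):
            return 1.0

    X = np.arange(20).reshape(10, 2)
    y = np.array([0] * 5 + [1] * 5)

    gs = GridSearchCV(Failing(), {'parameter': [0, 1, 2]}, cv=2,
                      refit=False, error_score=0.0)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')  # one FitFailedWarning per failed fit
        gs.fit(X, y)

    # the failing candidate carries the substituted score
    assert gs.cv_results_['mean_test_score'][2] == 0.0
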
- assert all(np.all(this_point.cv_validation_scores == 0.0) - for this_point in gs.grid_scores_ - if this_point.parameters['parameter'] == - FailingClassifier.FAILING_PARAMETER) - - gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', - refit=False, error_score=float('nan')) - assert_warns(FitFailedWarning, gs.fit, X, y) - assert all(np.all(np.isnan(this_point.cv_validation_scores)) - for this_point in gs.grid_scores_ - if this_point.parameters['parameter'] == - FailingClassifier.FAILING_PARAMETER) - - -def test_grid_search_failing_classifier_raise(): - # GridSearchCV with on_error == 'raise' raises the error - - X, y = make_classification(n_samples=20, n_features=10, random_state=0) - - clf = FailingClassifier() - - # refit=False because we want to test the behaviour of the grid search part - gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', - refit=False, error_score='raise') - - # FailingClassifier issues a ValueError so this is what we look for. - assert_raises(ValueError, gs.fit, X, y) - - -def test_parameters_sampler_replacement(): - # raise error if n_iter too large - params = {'first': [0, 1], 'second': ['a', 'b', 'c']} - sampler = ParameterSampler(params, n_iter=7) - assert_raises(ValueError, list, sampler) - # degenerates to GridSearchCV if n_iter the same as grid_size - sampler = ParameterSampler(params, n_iter=6) - samples = list(sampler) - assert_equal(len(samples), 6) - for values in ParameterGrid(params): - assert_true(values in samples) - - # test sampling without replacement in a large grid - params = {'a': range(10), 'b': range(10), 'c': range(10)} - sampler = ParameterSampler(params, n_iter=99, random_state=42) - samples = list(sampler) - assert_equal(len(samples), 99) - hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c']) - for p in samples] - assert_equal(len(set(hashable_samples)), 99) - - # doesn't go into infinite loops - params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']} - sampler = ParameterSampler(params_distribution, n_iter=7) - samples = list(sampler) - assert_equal(len(samples), 7) - - -def test_classes__property(): - # Test that classes_ property matches best_esimator_.classes_ - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - Cs = [.1, 1, 10] - - grid_search = GridSearchCV(LinearSVC(random_state=0), {'C': Cs}) - grid_search.fit(X, y) - assert_array_equal(grid_search.best_estimator_.classes_, - grid_search.classes_) - - # Test that regressors do not have a classes_ attribute - grid_search = GridSearchCV(Ridge(), {'alpha': [1.0, 2.0]}) - grid_search.fit(X, y) - assert_false(hasattr(grid_search, 'classes_')) diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py deleted file mode 100644 index d75e6bc82f6b3..0000000000000 --- a/sklearn/tests/test_learning_curve.py +++ /dev/null @@ -1,312 +0,0 @@ -# Author: Alexander Fabisch -# -# License: BSD 3 clause - -import sys -from sklearn.externals.six.moves import cStringIO as StringIO -import numpy as np -import warnings -from sklearn.base import BaseEstimator -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_false -from sklearn.datasets import make_classification -from sklearn.model_selection import KFold - -with warnings.catch_warnings(): - 
warnings.simplefilter('ignore') - from sklearn.learning_curve import learning_curve, validation_curve - -from sklearn.linear_model import PassiveAggressiveClassifier - - -class MockImprovingEstimator(BaseEstimator): - """Dummy classifier to test the learning curve""" - def __init__(self, n_max_train_sizes): - self.n_max_train_sizes = n_max_train_sizes - self.train_sizes = 0 - self.X_subset = None - - def fit(self, X_subset, y_subset=None): - self.X_subset = X_subset - self.train_sizes = X_subset.shape[0] - return self - - def predict(self, X): - raise NotImplementedError - - def score(self, X=None, Y=None): - # training score becomes worse (2 -> 1), test error better (0 -> 1) - if self._is_training_data(X): - return 2. - float(self.train_sizes) / self.n_max_train_sizes - else: - return float(self.train_sizes) / self.n_max_train_sizes - - def _is_training_data(self, X): - return X is self.X_subset - - -class MockIncrementalImprovingEstimator(MockImprovingEstimator): - """Dummy classifier that provides partial_fit""" - def __init__(self, n_max_train_sizes): - super(MockIncrementalImprovingEstimator, - self).__init__(n_max_train_sizes) - self.x = None - - def _is_training_data(self, X): - return self.x in X - - def partial_fit(self, X, y=None, **params): - self.train_sizes += X.shape[0] - self.x = X[0] - - -class MockEstimatorWithParameter(BaseEstimator): - """Dummy classifier to test the validation curve""" - def __init__(self, param=0.5): - self.X_subset = None - self.param = param - - def fit(self, X_subset, y_subset): - self.X_subset = X_subset - self.train_sizes = X_subset.shape[0] - return self - - def predict(self, X): - raise NotImplementedError - - def score(self, X=None, y=None): - return self.param if self._is_training_data(X) else 1 - self.param - - def _is_training_data(self, X): - return X is self.X_subset - - -class MockEstimatorFailing(BaseEstimator): - """Dummy classifier to test error_score in learning curve""" - def fit(self, X_subset, y_subset): - raise ValueError() - - def score(self, X=None, y=None): - return None - - -class MockEstimatorWithSingleFitCallAllowed(MockEstimatorWithParameter): - """Dummy classifier that disallows repeated calls of fit method""" - - def fit(self, X_subset, y_subset): - assert_false( - hasattr(self, 'fit_called_'), - 'fit is called the second time' - ) - self.fit_called_ = True - return super(type(self), self).fit(X_subset, y_subset) - - -def test_learning_curve(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - with warnings.catch_warnings(record=True) as w: - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10)) - if len(w) > 0: - raise RuntimeError("Unexpected warning: %r" % w[0].message) - assert_equal(train_scores.shape, (10, 3)) - assert_equal(test_scores.shape, (10, 3)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_unsupervised(): - X, _ = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y=None, cv=3, 
train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_verbose(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - train_sizes, train_scores, test_scores = \ - learning_curve(estimator, X, y, cv=3, verbose=1) - finally: - out = sys.stdout.getvalue() - sys.stdout.close() - sys.stdout = old_stdout - - assert("[learning_curve]" in out) - - -def test_learning_curve_error_score(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockEstimatorFailing() - _, _, test_scores = learning_curve(estimator, X, y, cv=3, error_score=0) - all_zeros = not np.any(test_scores) - assert(all_zeros) - - -def test_learning_curve_error_score_default_raise(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockEstimatorFailing() - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3) - - -def test_learning_curve_incremental_learning_not_possible(): - X, y = make_classification(n_samples=2, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - # The mockup does not have partial_fit() - estimator = MockImprovingEstimator(1) - assert_raises(ValueError, learning_curve, estimator, X, y, - exploit_incremental_learning=True) - - -def test_learning_curve_incremental_learning(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockIncrementalImprovingEstimator(20) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y, cv=3, exploit_incremental_learning=True, - train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_incremental_learning_unsupervised(): - X, _ = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockIncrementalImprovingEstimator(20) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y=None, cv=3, exploit_incremental_learning=True, - train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_batch_and_incremental_learning_are_equal(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - train_sizes = np.linspace(0.2, 1.0, 5) - estimator = PassiveAggressiveClassifier(max_iter=1, tol=None, - shuffle=False) - - train_sizes_inc, train_scores_inc, test_scores_inc = \ - learning_curve( - estimator, 
X, y, train_sizes=train_sizes, - cv=3, exploit_incremental_learning=True) - train_sizes_batch, train_scores_batch, test_scores_batch = \ - learning_curve( - estimator, X, y, cv=3, train_sizes=train_sizes, - exploit_incremental_learning=False) - - assert_array_equal(train_sizes_inc, train_sizes_batch) - assert_array_almost_equal(train_scores_inc.mean(axis=1), - train_scores_batch.mean(axis=1)) - assert_array_almost_equal(test_scores_inc.mean(axis=1), - test_scores_batch.mean(axis=1)) - - -def test_learning_curve_n_sample_range_out_of_bounds(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.0, 1.0]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.1, 1.1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 20]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[1, 21]) - - -def test_learning_curve_remove_duplicate_sample_sizes(): - X, y = make_classification(n_samples=3, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(2) - train_sizes, _, _ = assert_warns( - RuntimeWarning, learning_curve, estimator, X, y, cv=3, - train_sizes=np.linspace(0.33, 1.0, 3)) - assert_array_equal(train_sizes, [1, 2]) - - -def test_learning_curve_with_boolean_indices(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - cv = KFold(n=30, n_folds=3) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_validation_curve(): - X, y = make_classification(n_samples=2, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - param_range = np.linspace(0, 1, 10) - with warnings.catch_warnings(record=True) as w: - train_scores, test_scores = validation_curve( - MockEstimatorWithParameter(), X, y, param_name="param", - param_range=param_range, cv=2 - ) - if len(w) > 0: - raise RuntimeError("Unexpected warning: %r" % w[0].message) - - assert_array_almost_equal(train_scores.mean(axis=1), param_range) - assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range) - - -def test_validation_curve_clone_estimator(): - X, y = make_classification(n_samples=2, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - - param_range = np.linspace(1, 0, 10) - _, _ = validation_curve( - MockEstimatorWithSingleFitCallAllowed(), X, y, - param_name="param", param_range=param_range, cv=2 - ) From 776bba1248ebfa98edcf5eebbb35b5e6fa79ecd1 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 17:30:14 +0100 Subject: [PATCH 05/14] Remove gaussian_process --- sklearn/gaussian_process/__init__.py | 3 +- sklearn/gaussian_process/gaussian_process.py | 882 ------------------ 
.../tests/test_gaussian_process.py | 175 ---- sklearn/utils/estimator_checks.py | 4 +- 4 files changed, 2 insertions(+), 1062 deletions(-) delete mode 100644 sklearn/gaussian_process/gaussian_process.py delete mode 100644 sklearn/gaussian_process/tests/test_gaussian_process.py diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py index 48d9aa05aaf84..377f15795ee58 100644 --- a/sklearn/gaussian_process/__init__.py +++ b/sklearn/gaussian_process/__init__.py @@ -14,10 +14,9 @@ from .gpc import GaussianProcessClassifier from . import kernels -from .gaussian_process import GaussianProcess from . import correlation_models from . import regression_models -__all__ = ['GaussianProcess', 'correlation_models', 'regression_models', +__all__ = ['correlation_models', 'regression_models', 'GaussianProcessRegressor', 'GaussianProcessClassifier', 'kernels'] diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py deleted file mode 100644 index 8c7491e648d31..0000000000000 --- a/sklearn/gaussian_process/gaussian_process.py +++ /dev/null @@ -1,882 +0,0 @@ -# -*- coding: utf-8 -*- - -# Author: Vincent Dubourg -# (mostly translation, see implementation details) -# License: BSD 3 clause - -from __future__ import print_function - -import numpy as np -from scipy import linalg, optimize - -from ..base import BaseEstimator, RegressorMixin -from ..metrics.pairwise import manhattan_distances -from ..utils import check_random_state, check_array, check_X_y -from ..utils.validation import check_is_fitted -from . import regression_models as regression -from . import correlation_models as correlation -from ..utils import deprecated - -MACHINE_EPSILON = np.finfo(np.double).eps - - -@deprecated("l1_cross_distances was deprecated in version 0.18 " - "and will be removed in 0.20.") -def l1_cross_distances(X): - """ - Computes the nonzero componentwise L1 cross-distances between the vectors - in X. - - Parameters - ---------- - - X : array_like - An array with shape (n_samples, n_features) - - Returns - ------- - - D : array with shape (n_samples * (n_samples - 1) / 2, n_features) - The array of componentwise L1 cross-distances. - - ij : arrays with shape (n_samples * (n_samples - 1) / 2, 2) - The indices i and j of the vectors in X associated to the cross- - distances in D: D[k] = np.abs(X[ij[k, 0]] - Y[ij[k, 1]]). - """ - X = check_array(X) - n_samples, n_features = X.shape - n_nonzero_cross_dist = n_samples * (n_samples - 1) // 2 - ij = np.zeros((n_nonzero_cross_dist, 2), dtype=np.int) - D = np.zeros((n_nonzero_cross_dist, n_features)) - ll_1 = 0 - for k in range(n_samples - 1): - ll_0 = ll_1 - ll_1 = ll_0 + n_samples - k - 1 - ij[ll_0:ll_1, 0] = k - ij[ll_0:ll_1, 1] = np.arange(k + 1, n_samples) - D[ll_0:ll_1] = np.abs(X[k] - X[(k + 1):n_samples]) - - return D, ij - - -@deprecated("GaussianProcess was deprecated in version 0.18 and will be " - "removed in 0.20. Use the GaussianProcessRegressor instead.") -class GaussianProcess(BaseEstimator, RegressorMixin): - """The legacy Gaussian Process model class. - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use the :class:`GaussianProcessRegressor` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - regr : string or callable, optional - A regression function returning an array of outputs of the linear - regression functional basis. The number of observations n_samples - should be greater than the size p of this basis. 
- Default assumes a simple constant regression trend. - Available built-in regression models are:: - - 'constant', 'linear', 'quadratic' - - corr : string or callable, optional - A stationary autocorrelation function returning the autocorrelation - between two points x and x'. - Default assumes a squared-exponential autocorrelation model. - Built-in correlation models are:: - - 'absolute_exponential', 'squared_exponential', - 'generalized_exponential', 'cubic', 'linear' - - beta0 : double array_like, optional - The regression weight vector to perform Ordinary Kriging (OK). - Default assumes Universal Kriging (UK) so that the vector beta of - regression weights is estimated using the maximum likelihood - principle. - - storage_mode : string, optional - A string specifying whether the Cholesky decomposition of the - correlation matrix should be stored in the class (storage_mode = - 'full') or not (storage_mode = 'light'). - Default assumes storage_mode = 'full', so that the - Cholesky decomposition of the correlation matrix is stored. - This might be a useful parameter when one is not interested in the - MSE and only plan to estimate the BLUP, for which the correlation - matrix is not required. - - verbose : boolean, optional - A boolean specifying the verbose level. - Default is verbose = False. - - theta0 : double array_like, optional - An array with shape (n_features, ) or (1, ). - The parameters in the autocorrelation model. - If thetaL and thetaU are also specified, theta0 is considered as - the starting point for the maximum likelihood estimation of the - best set of parameters. - Default assumes isotropic autocorrelation model with theta0 = 1e-1. - - thetaL : double array_like, optional - An array with shape matching theta0's. - Lower bound on the autocorrelation parameters for maximum - likelihood estimation. - Default is None, so that it skips maximum likelihood estimation and - it uses theta0. - - thetaU : double array_like, optional - An array with shape matching theta0's. - Upper bound on the autocorrelation parameters for maximum - likelihood estimation. - Default is None, so that it skips maximum likelihood estimation and - it uses theta0. - - normalize : boolean, optional - Input X and observations y are centered and reduced wrt - means and standard deviations estimated from the n_samples - observations provided. - Default is normalize = True so that data is normalized to ease - maximum likelihood estimation. - - nugget : double or ndarray, optional - Introduce a nugget effect to allow smooth predictions from noisy - data. If nugget is an ndarray, it must be the same length as the - number of data points used for the fit. - The nugget is added to the diagonal of the assumed training covariance; - in this way it acts as a Tikhonov regularization in the problem. In - the special case of the squared exponential correlation function, the - nugget mathematically represents the variance of the input values. - Default assumes a nugget close to machine precision for the sake of - robustness (nugget = 10. * MACHINE_EPSILON). - - optimizer : string, optional - A string specifying the optimization algorithm to be used. - Default uses 'fmin_cobyla' algorithm from scipy.optimize. - Available optimizers are:: - - 'fmin_cobyla', 'Welch' - - 'Welch' optimizer is dued to Welch et al., see reference [WBSWM1992]_. - It consists in iterating over several one-dimensional optimizations - instead of running one single multi-dimensional optimization. 
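
Schematically, that coordinate-wise scheme amounts to the following toy sketch (plain numpy, a grid search standing in for COBYLA, purely illustrative)::

    import numpy as np

    def coordinate_maximize(f, theta0, candidates, n_sweeps=2):
        # optimize one dimension at a time, holding the others fixed
        theta = np.asarray(theta0, dtype=float).copy()
        for _ in range(n_sweeps):
            for i in range(theta.size):
                trials = []
                for c in candidates:
                    t = theta.copy()
                    t[i] = c
                    trials.append((f(t), c))
                theta[i] = max(trials)[1]  # keep the best value found
        return theta

    f = lambda t: -(t[0] - 0.3) ** 2 - (t[1] - 0.7) ** 2
    print(coordinate_maximize(f, [1., 0.], np.linspace(0., 1., 11)))
    # -> [ 0.3  0.7], the maximizer of f
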
- - random_start : int, optional - The number of times the Maximum Likelihood Estimation should be - performed from a random starting point. - The first MLE always uses the specified starting point (theta0), - the next starting points are picked at random according to an - exponential distribution (log-uniform on [thetaL, thetaU]). - Default does not use random starting point (random_start = 1). - - random_state : int, RandomState instance or None, optional (default=None) - The generator used to shuffle the sequence of coordinates of theta in - the Welch optimizer. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is the - random number generator; If None, the random number generator is the - RandomState instance used by `np.random`. - - Attributes - ---------- - theta_ : array - Specified theta OR the best set of autocorrelation parameters (the \ - sought maximizer of the reduced likelihood function). - - reduced_likelihood_function_value_ : array - The optimal reduced likelihood function value. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.gaussian_process import GaussianProcess - >>> X = np.array([[1., 3., 5., 6., 7., 8.]]).T - >>> y = (X * np.sin(X)).ravel() - >>> gp = GaussianProcess(theta0=0.1, thetaL=.001, thetaU=1.) - >>> gp.fit(X, y) # doctest: +ELLIPSIS - GaussianProcess(beta0=None... - ... - - Notes - ----- - The presentation implementation is based on a translation of the DACE - Matlab toolbox, see reference [NLNS2002]_. - - References - ---------- - - .. [NLNS2002] `H.B. Nielsen, S.N. Lophaven, H. B. Nielsen and J. - Sondergaard. DACE - A MATLAB Kriging Toolbox.` (2002) - http://imedea.uib-csic.es/master/cambioglobal/Modulo_V_cod101615/Lab/lab_maps/krigging/DACE-krigingsoft/dace/dace.pdf - - .. [WBSWM1992] `W.J. Welch, R.J. Buck, J. Sacks, H.P. Wynn, T.J. Mitchell, - and M.D. Morris (1992). Screening, predicting, and computer - experiments. Technometrics, 34(1) 15--25.` - http://www.jstor.org/stable/1269548 - """ - - _regression_types = { - 'constant': regression.constant, - 'linear': regression.linear, - 'quadratic': regression.quadratic} - - _correlation_types = { - 'absolute_exponential': correlation.absolute_exponential, - 'squared_exponential': correlation.squared_exponential, - 'generalized_exponential': correlation.generalized_exponential, - 'cubic': correlation.cubic, - 'linear': correlation.linear} - - _optimizer_types = [ - 'fmin_cobyla', - 'Welch'] - - def __init__(self, regr='constant', corr='squared_exponential', beta0=None, - storage_mode='full', verbose=False, theta0=1e-1, - thetaL=None, thetaU=None, optimizer='fmin_cobyla', - random_start=1, normalize=True, - nugget=10. * MACHINE_EPSILON, random_state=None): - - self.regr = regr - self.corr = corr - self.beta0 = beta0 - self.storage_mode = storage_mode - self.verbose = verbose - self.theta0 = theta0 - self.thetaL = thetaL - self.thetaU = thetaU - self.normalize = normalize - self.nugget = nugget - self.optimizer = optimizer - self.random_start = random_start - self.random_state = random_state - - def fit(self, X, y): - """ - The Gaussian Process model fitting method. - - Parameters - ---------- - X : double array_like - An array with shape (n_samples, n_features) with the input at which - observations were made. - - y : double array_like - An array with shape (n_samples, ) or shape (n_samples, n_targets) - with the observations of the output to be predicted. 
- - Returns - ------- - gp : self - A fitted Gaussian Process model object awaiting data to perform - predictions. - """ - # Run input checks - self._check_params() - - self.random_state = check_random_state(self.random_state) - - # Force data to 2D numpy.array - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) - self.y_ndim_ = y.ndim - if y.ndim == 1: - y = y[:, np.newaxis] - - # Check shapes of DOE & observations - n_samples, n_features = X.shape - _, n_targets = y.shape - - # Run input checks - self._check_params(n_samples) - - # Normalize data or don't - if self.normalize: - X_mean = np.mean(X, axis=0) - X_std = np.std(X, axis=0) - y_mean = np.mean(y, axis=0) - y_std = np.std(y, axis=0) - X_std[X_std == 0.] = 1. - y_std[y_std == 0.] = 1. - # center and scale X if necessary - X = (X - X_mean) / X_std - y = (y - y_mean) / y_std - else: - X_mean = np.zeros(1) - X_std = np.ones(1) - y_mean = np.zeros(1) - y_std = np.ones(1) - - # Calculate matrix of distances D between samples - D, ij = l1_cross_distances(X) - if (np.min(np.sum(D, axis=1)) == 0. - and self.corr != correlation.pure_nugget): - raise Exception("Multiple input features cannot have the same" - " target value.") - - # Regression matrix and parameters - F = self.regr(X) - n_samples_F = F.shape[0] - if F.ndim > 1: - p = F.shape[1] - else: - p = 1 - if n_samples_F != n_samples: - raise Exception("Number of rows in F and X do not match. Most " - "likely something is going wrong with the " - "regression model.") - if p > n_samples_F: - raise Exception(("Ordinary least squares problem is undetermined " - "n_samples=%d must be greater than the " - "regression model size p=%d.") % (n_samples, p)) - if self.beta0 is not None: - if self.beta0.shape[0] != p: - raise Exception("Shapes of beta0 and F do not match.") - - # Set attributes - self.X = X - self.y = y - self.D = D - self.ij = ij - self.F = F - self.X_mean, self.X_std = X_mean, X_std - self.y_mean, self.y_std = y_mean, y_std - - # Determine Gaussian Process model parameters - if self.thetaL is not None and self.thetaU is not None: - # Maximum Likelihood Estimation of the parameters - if self.verbose: - print("Performing Maximum Likelihood Estimation of the " - "autocorrelation parameters...") - self.theta_, self.reduced_likelihood_function_value_, par = \ - self._arg_max_reduced_likelihood_function() - if np.isinf(self.reduced_likelihood_function_value_): - raise Exception("Bad parameter region. " - "Try increasing upper bound") - - else: - # Given parameters - if self.verbose: - print("Given autocorrelation parameters. " - "Computing Gaussian Process model parameters...") - self.theta_ = self.theta0 - self.reduced_likelihood_function_value_, par = \ - self.reduced_likelihood_function() - if np.isinf(self.reduced_likelihood_function_value_): - raise Exception("Bad point. Try increasing theta0.") - - self.beta = par['beta'] - self.gamma = par['gamma'] - self.sigma2 = par['sigma2'] - self.C = par['C'] - self.Ft = par['Ft'] - self.G = par['G'] - - if self.storage_mode == 'light': - # Delete heavy data (it will be computed again if required) - # (it is required only when MSE is wanted in self.predict) - if self.verbose: - print("Light storage mode specified. " - "Flushing autocorrelation matrix...") - self.D = None - self.ij = None - self.F = None - self.C = None - self.Ft = None - self.G = None - - return self - - def predict(self, X, eval_MSE=False, batch_size=None): - """ - This function evaluates the Gaussian Process model at x. 
- - Parameters - ---------- - X : array_like - An array with shape (n_eval, n_features) giving the point(s) at - which the prediction(s) should be made. - - eval_MSE : boolean, optional - A boolean specifying whether the Mean Squared Error should be - evaluated or not. - Default assumes evalMSE = False and evaluates only the BLUP (mean - prediction). - - batch_size : integer, optional - An integer giving the maximum number of points that can be - evaluated simultaneously (depending on the available memory). - Default is None so that all given points are evaluated at the same - time. - - Returns - ------- - y : array_like, shape (n_samples, ) or (n_samples, n_targets) - An array with shape (n_eval, ) if the Gaussian Process was trained - on an array of shape (n_samples, ) or an array with shape - (n_eval, n_targets) if the Gaussian Process was trained on an array - of shape (n_samples, n_targets) with the Best Linear Unbiased - Prediction at x. - - MSE : array_like, optional (if eval_MSE == True) - An array with shape (n_eval, ) or (n_eval, n_targets) as with y, - with the Mean Squared Error at x. - """ - check_is_fitted(self, "X") - - # Check input shapes - X = check_array(X) - n_eval, _ = X.shape - n_samples, n_features = self.X.shape - n_samples_y, n_targets = self.y.shape - - # Run input checks - self._check_params(n_samples) - - if X.shape[1] != n_features: - raise ValueError(("The number of features in X (X.shape[1] = %d) " - "should match the number of features used " - "for fit() " - "which is %d.") % (X.shape[1], n_features)) - - if batch_size is None: - # No memory management - # (evaluates all given points in a single batch run) - - # Normalize input - X = (X - self.X_mean) / self.X_std - - # Get pairwise componentwise L1-distances to the input training set - dx = manhattan_distances(X, Y=self.X, sum_over_features=False) - # Get regression function and correlation - f = self.regr(X) - r = self.corr(self.theta_, dx).reshape(n_eval, n_samples) - - # Scaled predictor - y_ = np.dot(f, self.beta) + np.dot(r, self.gamma) - - # Predictor - y = (self.y_mean + self.y_std * y_).reshape(n_eval, n_targets) - - if self.y_ndim_ == 1: - y = y.ravel() - - # Mean Squared Error - if eval_MSE: - C = self.C - if C is None: - # Light storage mode (need to recompute C, F, Ft and G) - if self.verbose: - print("This GaussianProcess used 'light' storage mode " - "at instantiation. Need to recompute " - "autocorrelation matrix...") - reduced_likelihood_function_value, par = \ - self.reduced_likelihood_function() - self.C = par['C'] - self.Ft = par['Ft'] - self.G = par['G'] - - rt = linalg.solve_triangular(self.C, r.T, lower=True) - - if self.beta0 is None: - # Universal Kriging - u = linalg.solve_triangular(self.G.T, - np.dot(self.Ft.T, rt) - f.T, - lower=True) - else: - # Ordinary Kriging - u = np.zeros((n_targets, n_eval)) - - MSE = np.dot(self.sigma2.reshape(n_targets, 1), - (1. - (rt ** 2.).sum(axis=0) - + (u ** 2.).sum(axis=0))[np.newaxis, :]) - MSE = np.sqrt((MSE ** 2.).sum(axis=0) / n_targets) - - # Mean Squared Error might be slightly negative depending on - # machine precision: force to zero! - MSE[MSE < 0.] = 0. 
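
For reference, the predictor and error estimate computed above are the standard BLUP formulas, written in the notation of this code::

    y_hat(x) = f(x) . beta + r(x) . gamma
    MSE(x)   = sigma2 * (1 - ||rt||^2 + ||u||^2),  with  rt = C^{-1} r(x)
    u        = G^{-T} (Ft^T rt - f(x))   (Universal Kriging; u = 0 otherwise)

The final clipping only guards against small negative values caused by floating-point round-off.
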
- - if self.y_ndim_ == 1: - MSE = MSE.ravel() - - return y, MSE - - else: - - return y - - else: - # Memory management - - if type(batch_size) is not int or batch_size <= 0: - raise Exception("batch_size must be a positive integer") - - if eval_MSE: - - y, MSE = np.zeros(n_eval), np.zeros(n_eval) - for k in range(max(1, int(n_eval / batch_size))): - batch_from = k * batch_size - batch_to = min([(k + 1) * batch_size + 1, n_eval + 1]) - y[batch_from:batch_to], MSE[batch_from:batch_to] = \ - self.predict(X[batch_from:batch_to], - eval_MSE=eval_MSE, batch_size=None) - - return y, MSE - - else: - - y = np.zeros(n_eval) - for k in range(max(1, int(n_eval / batch_size))): - batch_from = k * batch_size - batch_to = min([(k + 1) * batch_size + 1, n_eval + 1]) - y[batch_from:batch_to] = \ - self.predict(X[batch_from:batch_to], - eval_MSE=eval_MSE, batch_size=None) - - return y - - def reduced_likelihood_function(self, theta=None): - """ - This function determines the BLUP parameters and evaluates the reduced - likelihood function for the given autocorrelation parameters theta. - - Maximizing this function wrt the autocorrelation parameters theta is - equivalent to maximizing the likelihood of the assumed joint Gaussian - distribution of the observations y evaluated onto the design of - experiments X. - - Parameters - ---------- - theta : array_like, optional - An array containing the autocorrelation parameters at which the - Gaussian Process model parameters should be determined. - Default uses the built-in autocorrelation parameters - (ie ``theta = self.theta_``). - - Returns - ------- - reduced_likelihood_function_value : double - The value of the reduced likelihood function associated to the - given autocorrelation parameters theta. - - par : dict - A dictionary containing the requested Gaussian Process model - parameters: - - - ``sigma2`` is the Gaussian Process variance. - - ``beta`` is the generalized least-squares regression weights for - Universal Kriging or given beta0 for Ordinary Kriging. - - ``gamma`` is the Gaussian Process weights. - - ``C`` is the Cholesky decomposition of the correlation - matrix [R]. - - ``Ft`` is the solution of the linear equation system - [R] x Ft = F - - ``G`` is the QR decomposition of the matrix Ft. - """ - check_is_fitted(self, "X") - - if theta is None: - # Use built-in autocorrelation parameters - theta = self.theta_ - - # Initialize output - reduced_likelihood_function_value = - np.inf - par = {} - - # Retrieve data - n_samples = self.X.shape[0] - D = self.D - ij = self.ij - F = self.F - - if D is None: - # Light storage mode (need to recompute D, ij and F) - D, ij = l1_cross_distances(self.X) - if (np.min(np.sum(D, axis=1)) == 0. - and self.corr != correlation.pure_nugget): - raise Exception("Multiple X are not allowed") - F = self.regr(self.X) - - # Set up R - r = self.corr(theta, D) - R = np.eye(n_samples) * (1. + self.nugget) - R[ij[:, 0], ij[:, 1]] = r - R[ij[:, 1], ij[:, 0]] = r - - # Cholesky decomposition of R - try: - C = linalg.cholesky(R, lower=True) - except linalg.LinAlgError: - return reduced_likelihood_function_value, par - - # Get generalized least squares solution - Ft = linalg.solve_triangular(C, F, lower=True) - Q, G = linalg.qr(Ft, mode='economic') - - sv = linalg.svd(G, compute_uv=False) - rcondG = sv[-1] / sv[0] - if rcondG < 1e-10: - # Check F - sv = linalg.svd(F, compute_uv=False) - condF = sv[0] / sv[-1] - if condF > 1e15: - raise Exception("F is too ill conditioned. 
Poor combination " - "of regression model and observations.") - else: - # Ft is too ill conditioned, get out (try different theta) - return reduced_likelihood_function_value, par - - Yt = linalg.solve_triangular(C, self.y, lower=True) - if self.beta0 is None: - # Universal Kriging - beta = linalg.solve_triangular(G, np.dot(Q.T, Yt)) - else: - # Ordinary Kriging - beta = np.array(self.beta0) - - rho = Yt - np.dot(Ft, beta) - sigma2 = (rho ** 2.).sum(axis=0) / n_samples - # The determinant of R is equal to the squared product of the diagonal - # elements of its Cholesky decomposition C - detR = (np.diag(C) ** (2. / n_samples)).prod() - - # Compute/Organize output - reduced_likelihood_function_value = - sigma2.sum() * detR - par['sigma2'] = sigma2 * self.y_std ** 2. - par['beta'] = beta - par['gamma'] = linalg.solve_triangular(C.T, rho) - par['C'] = C - par['Ft'] = Ft - par['G'] = G - - return reduced_likelihood_function_value, par - - def _arg_max_reduced_likelihood_function(self): - """ - This function estimates the autocorrelation parameters theta as the - maximizer of the reduced likelihood function. - (Minimization of the opposite reduced likelihood function is used for - convenience) - - Parameters - ---------- - self : All parameters are stored in the Gaussian Process model object. - - Returns - ------- - optimal_theta : array_like - The best set of autocorrelation parameters (the sought maximizer of - the reduced likelihood function). - - optimal_reduced_likelihood_function_value : double - The optimal reduced likelihood function value. - - optimal_par : dict - The BLUP parameters associated to thetaOpt. - """ - - # Initialize output - best_optimal_theta = [] - best_optimal_rlf_value = [] - best_optimal_par = [] - - if self.verbose: - print("The chosen optimizer is: " + str(self.optimizer)) - if self.random_start > 1: - print(str(self.random_start) + " random starts are required.") - - percent_completed = 0. - - # Force optimizer to fmin_cobyla if the model is meant to be isotropic - if self.optimizer == 'Welch' and self.theta0.size == 1: - self.optimizer = 'fmin_cobyla' - - if self.optimizer == 'fmin_cobyla': - - def minus_reduced_likelihood_function(log10t): - return - self.reduced_likelihood_function( - theta=10. ** log10t)[0] - - constraints = [] - for i in range(self.theta0.size): - constraints.append(lambda log10t, i=i: - log10t[i] - np.log10(self.thetaL[0, i])) - constraints.append(lambda log10t, i=i: - np.log10(self.thetaU[0, i]) - log10t[i]) - - for k in range(self.random_start): - - if k == 0: - # Use specified starting point as first guess - theta0 = self.theta0 - else: - # Generate a random starting point log10-uniformly - # distributed between bounds - log10theta0 = (np.log10(self.thetaL) - + self.random_state.rand(*self.theta0.shape) - * np.log10(self.thetaU / self.thetaL)) - theta0 = 10. ** log10theta0 - - # Run Cobyla - try: - log10_optimal_theta = \ - optimize.fmin_cobyla(minus_reduced_likelihood_function, - np.log10(theta0).ravel(), - constraints, disp=0) - except ValueError as ve: - print("Optimization failed. Try increasing the ``nugget``") - raise ve - - optimal_theta = 10. 
** log10_optimal_theta - optimal_rlf_value, optimal_par = \ - self.reduced_likelihood_function(theta=optimal_theta) - - # Compare the new optimizer to the best previous one - if k > 0: - if optimal_rlf_value > best_optimal_rlf_value: - best_optimal_rlf_value = optimal_rlf_value - best_optimal_par = optimal_par - best_optimal_theta = optimal_theta - else: - best_optimal_rlf_value = optimal_rlf_value - best_optimal_par = optimal_par - best_optimal_theta = optimal_theta - if self.verbose and self.random_start > 1: - if (20 * k) / self.random_start > percent_completed: - percent_completed = (20 * k) / self.random_start - print("%s completed" % (5 * percent_completed)) - - optimal_rlf_value = best_optimal_rlf_value - optimal_par = best_optimal_par - optimal_theta = best_optimal_theta - - elif self.optimizer == 'Welch': - - # Backup of the given attributes - theta0, thetaL, thetaU = self.theta0, self.thetaL, self.thetaU - corr = self.corr - verbose = self.verbose - - # This will iterate over fmin_cobyla optimizer - self.optimizer = 'fmin_cobyla' - self.verbose = False - - # Initialize under isotropy assumption - if verbose: - print("Initialize under isotropy assumption...") - self.theta0 = check_array(self.theta0.min()) - self.thetaL = check_array(self.thetaL.min()) - self.thetaU = check_array(self.thetaU.max()) - theta_iso, optimal_rlf_value_iso, par_iso = \ - self._arg_max_reduced_likelihood_function() - optimal_theta = theta_iso + np.zeros(theta0.shape) - - # Iterate over all dimensions of theta allowing for anisotropy - if verbose: - print("Now improving allowing for anisotropy...") - for i in self.random_state.permutation(theta0.size): - if verbose: - print("Proceeding along dimension %d..." % (i + 1)) - self.theta0 = check_array(theta_iso) - self.thetaL = check_array(thetaL[0, i]) - self.thetaU = check_array(thetaU[0, i]) - - def corr_cut(t, d): - return corr(check_array(np.hstack([optimal_theta[0][0:i], - t[0], - optimal_theta[0][(i + - 1)::]])), - d) - - self.corr = corr_cut - optimal_theta[0, i], optimal_rlf_value, optimal_par = \ - self._arg_max_reduced_likelihood_function() - - # Restore the given attributes - self.theta0, self.thetaL, self.thetaU = theta0, thetaL, thetaU - self.corr = corr - self.optimizer = 'Welch' - self.verbose = verbose - - else: - - raise NotImplementedError("This optimizer ('%s') is not " - "implemented yet. Please contribute!" - % self.optimizer) - - return optimal_theta, optimal_rlf_value, optimal_par - - def _check_params(self, n_samples=None): - - # Check regression model - if not callable(self.regr): - if self.regr in self._regression_types: - self.regr = self._regression_types[self.regr] - else: - raise ValueError("regr should be one of %s or callable, " - "%s was given." - % (self._regression_types.keys(), self.regr)) - - # Check regression weights if given (Ordinary Kriging) - if self.beta0 is not None: - self.beta0 = np.atleast_2d(self.beta0) - if self.beta0.shape[1] != 1: - # Force to column vector - self.beta0 = self.beta0.T - - # Check correlation model - if not callable(self.corr): - if self.corr in self._correlation_types: - self.corr = self._correlation_types[self.corr] - else: - raise ValueError("corr should be one of %s or callable, " - "%s was given." - % (self._correlation_types.keys(), self.corr)) - - # Check storage mode - if self.storage_mode != 'full' and self.storage_mode != 'light': - raise ValueError("Storage mode should either be 'full' or " - "'light', %s was given." 
% self.storage_mode) - - # Check correlation parameters - self.theta0 = np.atleast_2d(self.theta0) - lth = self.theta0.size - - if self.thetaL is not None and self.thetaU is not None: - self.thetaL = np.atleast_2d(self.thetaL) - self.thetaU = np.atleast_2d(self.thetaU) - if self.thetaL.size != lth or self.thetaU.size != lth: - raise ValueError("theta0, thetaL and thetaU must have the " - "same length.") - if np.any(self.thetaL <= 0) or np.any(self.thetaU < self.thetaL): - raise ValueError("The bounds must satisfy O < thetaL <= " - "thetaU.") - - elif self.thetaL is None and self.thetaU is None: - if np.any(self.theta0 <= 0): - raise ValueError("theta0 must be strictly positive.") - - elif self.thetaL is None or self.thetaU is None: - raise ValueError("thetaL and thetaU should either be both or " - "neither specified.") - - # Force verbose type to bool - self.verbose = bool(self.verbose) - - # Force normalize type to bool - self.normalize = bool(self.normalize) - - # Check nugget value - self.nugget = np.asarray(self.nugget) - if np.any(self.nugget) < 0.: - raise ValueError("nugget must be positive or zero.") - if (n_samples is not None - and self.nugget.shape not in [(), (n_samples,)]): - raise ValueError("nugget must be either a scalar " - "or array of length n_samples.") - - # Check optimizer - if self.optimizer not in self._optimizer_types: - raise ValueError("optimizer should be one of %s" - % self._optimizer_types) - - # Force random_start type to int - self.random_start = int(self.random_start) diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py deleted file mode 100644 index 37d872fc99fb5..0000000000000 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ /dev/null @@ -1,175 +0,0 @@ -""" -Testing for Gaussian Process module (sklearn.gaussian_process) -""" - -# Author: Vincent Dubourg -# License: BSD 3 clause - -import numpy as np - -from sklearn.gaussian_process import GaussianProcess -from sklearn.gaussian_process import regression_models as regression -from sklearn.gaussian_process import correlation_models as correlation -from sklearn.datasets import make_regression -from sklearn.utils.testing import assert_greater, assert_true, assert_raises - - -f = lambda x: x * np.sin(x) -X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T -X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T -y = f(X).ravel() - - -def test_1d(regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): - # MLE estimation of a one-dimensional Gaussian Process model. - # Check random start optimization. - # Test the interpolating property. - gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=1e-2, thetaL=1e-4, thetaU=1e-1, - random_start=random_start, verbose=False).fit(X, y) - y_pred, MSE = gp.predict(X, eval_MSE=True) - y2_pred, MSE2 = gp.predict(X2, eval_MSE=True) - - assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.) - and np.allclose(MSE2, 0., atol=10)) - - -def test_2d(regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): - # MLE estimation of a two-dimensional Gaussian Process model accounting for - # anisotropy. Check random start optimization. - # Test the interpolating property. - b, kappa, e = 5., .5, .1 - g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. 
- X = np.array([[-4.61611719, -6.00099547], - [4.10469096, 5.32782448], - [0.00000000, -0.50000000], - [-6.17289014, -4.6984743], - [1.3109306, -6.93271427], - [-5.03823144, 3.10584743], - [-2.87600388, 6.74310541], - [5.21301203, 4.26386883]]) - y = g(X).ravel() - - thetaL = [1e-4] * 2 - thetaU = [1e-1] * 2 - gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=[1e-2] * 2, thetaL=thetaL, - thetaU=thetaU, - random_start=random_start, verbose=False) - gp.fit(X, y) - y_pred, MSE = gp.predict(X, eval_MSE=True) - - assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)) - - eps = np.finfo(gp.theta_.dtype).eps - assert_true(np.all(gp.theta_ >= thetaL - eps)) # Lower bounds of hyperparameters - assert_true(np.all(gp.theta_ <= thetaU + eps)) # Upper bounds of hyperparameters - - -def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): - # MLE estimation of a two-dimensional Gaussian Process model accounting for - # anisotropy. Check random start optimization. - # Test the GP interpolation for 2D output - b, kappa, e = 5., .5, .1 - g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. - f = lambda x: np.vstack((g(x), g(x))).T - X = np.array([[-4.61611719, -6.00099547], - [4.10469096, 5.32782448], - [0.00000000, -0.50000000], - [-6.17289014, -4.6984743], - [1.3109306, -6.93271427], - [-5.03823144, 3.10584743], - [-2.87600388, 6.74310541], - [5.21301203, 4.26386883]]) - y = f(X) - gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=[1e-2] * 2, thetaL=[1e-4] * 2, - thetaU=[1e-1] * 2, - random_start=random_start, verbose=False) - gp.fit(X, y) - y_pred, MSE = gp.predict(X, eval_MSE=True) - - assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)) - - -def test_wrong_number_of_outputs(): - gp = GaussianProcess() - assert_raises(ValueError, gp.fit, [[1, 2, 3], [4, 5, 6]], [1, 2, 3]) - - -def test_more_builtin_correlation_models(random_start=1): - # Repeat test_1d and test_2d for several built-in correlation - # models specified as strings. - all_corr = ['absolute_exponential', 'squared_exponential', 'cubic', - 'linear'] - - for corr in all_corr: - test_1d(regr='constant', corr=corr, random_start=random_start) - test_2d(regr='constant', corr=corr, random_start=random_start) - test_2d_2d(regr='constant', corr=corr, random_start=random_start) - - -def test_ordinary_kriging(): - # Repeat test_1d and test_2d with given regression weights (beta0) for - # different regression models (Ordinary Kriging). - test_1d(regr='linear', beta0=[0., 0.5]) - test_1d(regr='quadratic', beta0=[0., 0.5, 0.5]) - test_2d(regr='linear', beta0=[0., 0.5, 0.5]) - test_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) - test_2d_2d(regr='linear', beta0=[0., 0.5, 0.5]) - test_2d_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) - - -def test_no_normalize(): - gp = GaussianProcess(normalize=False).fit(X, y) - y_pred = gp.predict(X) - assert_true(np.allclose(y_pred, y)) - - -def test_batch_size(): - # TypeError when using batch_size on Python 3, see - # https://github.com/scikit-learn/scikit-learn/issues/7329 for more - # details - gp = GaussianProcess() - gp.fit(X, y) - gp.predict(X, batch_size=1) - gp.predict(X, batch_size=1, eval_MSE=True) - - -def test_random_starts(): - # Test that an increasing number of random-starts of GP fitting only - # increases the reduced likelihood function of the optimal theta. 
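
The additional starting points are drawn log-uniformly between ``thetaL`` and ``thetaU`` (see the deleted ``fit`` code above), so with a fixed seed each extra restart can only improve the best reduced likelihood found. Drawing such starts is simply::

    import numpy as np

    rng = np.random.RandomState(0)
    thetaL, thetaU = 1e-4, 1e+1
    theta0 = 10. ** (np.log10(thetaL)
                     + rng.rand(5) * np.log10(thetaU / thetaL))
    assert np.all((theta0 >= thetaL) & (theta0 <= thetaU))
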
- n_samples, n_features = 50, 3 - rng = np.random.RandomState(0) - X = rng.randn(n_samples, n_features) * 2 - 1 - y = np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1) - best_likelihood = -np.inf - for random_start in range(1, 5): - gp = GaussianProcess(regr="constant", corr="squared_exponential", - theta0=[1e-0] * n_features, - thetaL=[1e-4] * n_features, - thetaU=[1e+1] * n_features, - random_start=random_start, random_state=0, - verbose=False).fit(X, y) - rlf = gp.reduced_likelihood_function()[0] - assert_greater(rlf, best_likelihood - np.finfo(np.float32).eps) - best_likelihood = rlf - - -def test_mse_solving(): - # test the MSE estimate to be sane. - # non-regression test for ignoring off-diagonals of feature covariance, - # testing with nugget that renders covariance useless, only - # using the mean function, with low effective rank of data - gp = GaussianProcess(corr='absolute_exponential', theta0=1e-4, - thetaL=1e-12, thetaU=1e-2, nugget=1e-2, - optimizer='Welch', regr="linear", random_state=0) - - X, y = make_regression(n_informative=3, n_features=60, noise=50, - random_state=0, effective_rank=1) - - gp.fit(X, y) - assert_greater(1000, gp.predict(X, eval_MSE=True)[1].mean()) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fdbecc358be35..708fb8030de38 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -223,9 +223,7 @@ def _yield_all_checks(name, estimator): for check in _yield_clustering_checks(name, estimator): yield check yield check_fit2d_predict1d - if name != 'GaussianProcess': # FIXME - # XXX GaussianProcess deprecated in 0.20 - yield check_fit2d_1sample + yield check_fit2d_1sample yield check_fit2d_1feature yield check_fit1d yield check_get_params_invariance From 59e3f7d61852e255efdeaea16f088bffd293f7a2 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 17:59:44 +0100 Subject: [PATCH 06/14] remove code to be removed in 0.19 --- sklearn/multioutput.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 5b4389fd0f31b..1e0285db2f737 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -247,43 +247,6 @@ def partial_fit(self, X, y, sample_weight=None): super(MultiOutputRegressor, self).partial_fit( X, y, sample_weight=sample_weight) - def score(self, X, y, sample_weight=None): - """Returns the coefficient of determination R^2 of the prediction. - - The coefficient R^2 is defined as (1 - u/v), where u is the residual - sum of squares ((y_true - y_pred) ** 2).sum() and v is the regression - sum of squares ((y_true - y_true.mean()) ** 2).sum(). - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). A constant model that always - predicts the expected value of y, disregarding the input features, - would get a R^2 score of 0.0. - - Notes - ----- - R^2 is calculated by weighting all the targets equally using - `multioutput='uniform_average'`. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Test samples. - - y : array-like, shape (n_samples) or (n_samples, n_outputs) - True values for X. - - sample_weight : array-like, shape [n_samples], optional - Sample weights. - - Returns - ------- - score : float - R^2 of self.predict(X) wrt. y. 
- """ - # XXX remove in 0.19 when r2_score default for multioutput changes - from .metrics import r2_score - return r2_score(y, self.predict(X), sample_weight=sample_weight, - multioutput='uniform_average') - class MultiOutputClassifier(MultiOutputEstimator, ClassifierMixin): """Multi target classification From 2ec39c0b8ff31441d575055fc28095b0f11698bd Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 18:04:39 +0100 Subject: [PATCH 07/14] remove ransac's residual_metric --- sklearn/linear_model/ransac.py | 32 ++------------------ sklearn/linear_model/tests/test_ransac.py | 33 -------------------- sklearn/multioutput.py | 37 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 62 deletions(-) diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index fa3923dbebb14..322f9923b4925 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -135,17 +135,6 @@ class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin): as 0.99 (the default) and e is the current fraction of inliers w.r.t. the total number of samples. - residual_metric : callable, optional - Metric to reduce the dimensionality of the residuals to 1 for - multi-dimensional target values ``y.shape[1] > 1``. By default the sum - of absolute differences is used:: - - lambda dy: np.sum(np.abs(dy), axis=1) - - .. deprecated:: 0.18 - ``residual_metric`` is deprecated from 0.18 and will be removed in - 0.20. Use ``loss`` instead. - loss : string, callable, optional, default "absolute_loss" String inputs, "absolute_loss" and "squared_loss" are supported which find the absolute loss and squared loss per sample @@ -205,8 +194,8 @@ def __init__(self, base_estimator=None, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, - stop_probability=0.99, residual_metric=None, - loss='absolute_loss', random_state=None): + stop_probability=0.99, loss='absolute_loss', + random_state=None): self.base_estimator = base_estimator self.min_samples = min_samples @@ -218,7 +207,6 @@ def __init__(self, base_estimator=None, min_samples=None, self.stop_n_inliers = stop_n_inliers self.stop_score = stop_score self.stop_probability = stop_probability - self.residual_metric = residual_metric self.random_state = random_state self.loss = loss @@ -281,12 +269,6 @@ def fit(self, X, y, sample_weight=None): else: residual_threshold = self.residual_threshold - if self.residual_metric is not None: - warnings.warn( - "'residual_metric' was deprecated in version 0.18 and " - "will be removed in version 0.20. 
Use 'loss' instead.", - DeprecationWarning) - if self.loss == "absolute_loss": if y.ndim == 1: loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred) @@ -379,15 +361,7 @@ def fit(self, X, y, sample_weight=None): # residuals of all data for current random sample model y_pred = base_estimator.predict(X) - - # XXX: Deprecation: Remove this if block in 0.20 - if self.residual_metric is not None: - diff = y_pred - y - if diff.ndim == 1: - diff = diff.reshape(-1, 1) - residuals_subset = self.residual_metric(diff) - else: - residuals_subset = loss_function(y, y_pred) + residuals_subset = loss_function(y, y_pred) # classify data into inliers and outliers inlier_mask_subset = residuals_subset < residual_threshold diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 6f8e716f9ad19..176d3348246be 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -352,39 +352,6 @@ def test_ransac_multi_dimensional_targets(): assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) -# XXX: Remove in 0.20 -def test_ransac_residual_metric(): - residual_metric1 = lambda dy: np.sum(np.abs(dy), axis=1) - residual_metric2 = lambda dy: np.sum(dy ** 2, axis=1) - - yyy = np.column_stack([y, y, y]) - - base_estimator = LinearRegression() - ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2, - residual_threshold=5, random_state=0) - ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2, - residual_threshold=5, random_state=0, - residual_metric=residual_metric1) - ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2, - residual_threshold=5, random_state=0, - residual_metric=residual_metric2) - - # multi-dimensional - ransac_estimator0.fit(X, yyy) - assert_warns(DeprecationWarning, ransac_estimator1.fit, X, yyy) - assert_warns(DeprecationWarning, ransac_estimator2.fit, X, yyy) - assert_array_almost_equal(ransac_estimator0.predict(X), - ransac_estimator1.predict(X)) - assert_array_almost_equal(ransac_estimator0.predict(X), - ransac_estimator2.predict(X)) - - # one-dimensional - ransac_estimator0.fit(X, y) - assert_warns(DeprecationWarning, ransac_estimator2.fit, X, y) - assert_array_almost_equal(ransac_estimator0.predict(X), - ransac_estimator2.predict(X)) - - def test_ransac_residual_loss(): loss_multi1 = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1) loss_multi2 = lambda y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 1e0285db2f737..5b4389fd0f31b 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -247,6 +247,43 @@ def partial_fit(self, X, y, sample_weight=None): super(MultiOutputRegressor, self).partial_fit( X, y, sample_weight=sample_weight) + def score(self, X, y, sample_weight=None): + """Returns the coefficient of determination R^2 of the prediction. + + The coefficient R^2 is defined as (1 - u/v), where u is the residual + sum of squares ((y_true - y_pred) ** 2).sum() and v is the regression + sum of squares ((y_true - y_true.mean()) ** 2).sum(). + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a R^2 score of 0.0. + + Notes + ----- + R^2 is calculated by weighting all the targets equally using + `multioutput='uniform_average'`. 
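A migration sketch for the ``residual_metric`` keyword removed above. The lambdas mirror the defaults documented by the old code, and ``LinearRegression`` stands in for any base estimator, as in the deleted test::

    import numpy as np
    from sklearn.linear_model import LinearRegression, RANSACRegressor

    base_estimator = LinearRegression()

    # Before (deprecated since 0.18):
    #   RANSACRegressor(base_estimator, min_samples=2,
    #                   residual_metric=lambda dy: np.sum(np.abs(dy), axis=1))

    # After: either a built-in loss string ...
    ransac = RANSACRegressor(base_estimator, min_samples=2,
                             loss='absolute_loss')
    # ... or a callable taking (y_true, y_pred), as in
    # test_ransac_residual_loss below
    ransac = RANSACRegressor(
        base_estimator, min_samples=2,
        loss=lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1))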
+ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples) or (n_samples, n_outputs) + True values for X. + + sample_weight : array-like, shape [n_samples], optional + Sample weights. + + Returns + ------- + score : float + R^2 of self.predict(X) wrt. y. + """ + # XXX remove in 0.19 when r2_score default for multioutput changes + from .metrics import r2_score + return r2_score(y, self.predict(X), sample_weight=sample_weight, + multioutput='uniform_average') + class MultiOutputClassifier(MultiOutputEstimator, ClassifierMixin): """Multi target classification From c444763e9139b3f4cb2ca976dd8e474ec3a22c4f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 8 Sep 2017 12:10:59 -0400 Subject: [PATCH 08/14] remove RandomizedPCA (also from docs references etc) fixup! remove RandomizedPCA from docs references etc --- benchmarks/bench_plot_incremental_pca.py | 15 +- doc/modules/preprocessing.rst | 5 +- sklearn/decomposition/__init__.py | 3 +- sklearn/decomposition/incremental_pca.py | 1 - sklearn/decomposition/pca.py | 245 ----------------------- sklearn/decomposition/tests/test_pca.py | 21 -- sklearn/decomposition/truncated_svd.py | 1 - 7 files changed, 6 insertions(+), 285 deletions(-) diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py index 495d58f0f43ee..43b6ff9452c78 100644 --- a/benchmarks/bench_plot_incremental_pca.py +++ b/benchmarks/bench_plot_incremental_pca.py @@ -13,7 +13,7 @@ from collections import defaultdict import matplotlib.pyplot as plt from sklearn.datasets import fetch_lfw_people -from sklearn.decomposition import IncrementalPCA, RandomizedPCA, PCA +from sklearn.decomposition import IncrementalPCA, PCA def plot_results(X, y, label): @@ -37,7 +37,6 @@ def plot_feature_times(all_times, batch_size, all_components, data): plot_results(all_components, all_times['pca'], label="PCA") plot_results(all_components, all_times['ipca'], label="IncrementalPCA, bsize=%i" % batch_size) - plot_results(all_components, all_times['rpca'], label="RandomizedPCA") plt.legend(loc="upper left") plt.suptitle("Algorithm runtime vs. n_components\n \ LFW, size %i x %i" % data.shape) @@ -50,7 +49,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data): plot_results(all_components, all_errors['pca'], label="PCA") plot_results(all_components, all_errors['ipca'], label="IncrementalPCA, bsize=%i" % batch_size) - plot_results(all_components, all_errors['rpca'], label="RandomizedPCA") plt.legend(loc="lower left") plt.suptitle("Algorithm error vs. n_components\n" "LFW, size %i x %i" % data.shape) @@ -61,7 +59,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data): def plot_batch_times(all_times, n_features, all_batch_sizes, data): plt.figure() plot_results(all_batch_sizes, all_times['pca'], label="PCA") - plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA") plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA") plt.legend(loc="lower left") plt.suptitle("Algorithm runtime vs. 
batch_size for n_components %i\n \ @@ -92,11 +89,9 @@ def fixed_batch_size_comparison(data): all_errors = defaultdict(list) for n_components in all_features: pca = PCA(n_components=n_components) - rpca = RandomizedPCA(n_components=n_components, random_state=1999) ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size) results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), - ('ipca', ipca), - ('rpca', rpca)]} + ('ipca', ipca)]} for k in sorted(results_dict.keys()): all_times[k].append(results_dict[k]['time']) @@ -116,9 +111,7 @@ def variable_batch_size_comparison(data): all_times = defaultdict(list) all_errors = defaultdict(list) pca = PCA(n_components=n_components) - rpca = RandomizedPCA(n_components=n_components, random_state=1999) - results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), - ('rpca', rpca)]} + results_dict = {k: benchmark(est, data) for k, est in [('pca', pca)]} # Create flat baselines to compare the variation over batch size all_times['pca'].extend([results_dict['pca']['time']] * @@ -138,8 +131,6 @@ def variable_batch_size_comparison(data): all_errors['ipca'].append(results_dict['ipca']['error']) plot_batch_times(all_times, n_components, batch_sizes, data) - # RandomizedPCA error is always worse (approx 100x) than other PCA - # tests plot_batch_errors(all_errors, n_components, batch_sizes, data) faces = fetch_lfw_people(resize=.2, min_faces_per_person=5) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 8bcb14363d69c..29c77f5c32851 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -235,9 +235,8 @@ data. independently, since a downstream model can further make some assumption on the linear independence of the features. - To address this issue you can use :class:`sklearn.decomposition.PCA` - or :class:`sklearn.decomposition.RandomizedPCA` with ``whiten=True`` - to further remove the linear correlation across features. + To address this issue you can use :class:`sklearn.decomposition.PCA` with + ``whiten=True`` to further remove the linear correlation across features. .. topic:: Scaling a 1D array diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index faca56b91b1d8..34ad76ca46074 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -5,7 +5,7 @@ """ from .nmf import NMF, non_negative_factorization -from .pca import PCA, RandomizedPCA +from .pca import PCA from .incremental_pca import IncrementalPCA from .kernel_pca import KernelPCA from .sparse_pca import SparsePCA, MiniBatchSparsePCA @@ -26,7 +26,6 @@ 'MiniBatchSparsePCA', 'NMF', 'PCA', - 'RandomizedPCA', 'SparseCoder', 'SparsePCA', 'dict_learning', diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index 13e51090dd82e..9ed75928cf90c 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -136,7 +136,6 @@ class IncrementalPCA(_BasePCA): See also -------- PCA - RandomizedPCA KernelPCA SparsePCA TruncatedSVD diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 2b715b7e06824..4d528e5994a58 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -591,248 +591,3 @@ def score(self, X, y=None): Average log-likelihood of the samples under the current model """ return np.mean(self.score_samples(X)) - - -@deprecated("RandomizedPCA was deprecated in 0.18 and will be removed in " - "0.20. 
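The preprocessing note above now points only at ``PCA(whiten=True)``; a minimal usage sketch, assuming ``X`` is any ``(n_samples, n_features)`` array::

    from sklearn.decomposition import PCA

    # Removes linear correlation across features and scales the
    # projected components to unit variance.
    X_white = PCA(whiten=True).fit_transform(X)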
" - "Use PCA(svd_solver='randomized') instead. The new implementation " - "DOES NOT store whiten ``components_``. Apply transform to get " - "them.") -class RandomizedPCA(BaseEstimator, TransformerMixin): - """Principal component analysis (PCA) using randomized SVD - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`PCA` with parameter svd_solver 'randomized' instead. - The new implementation DOES NOT store whiten ``components_``. - Apply transform to get them. - - Linear dimensionality reduction using approximated Singular Value - Decomposition of the data and keeping only the most significant - singular vectors to project the data to a lower dimensional space. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, optional - Maximum number of components to keep. When not given or None, this - is set to n_features (the second dimension of the training data). - - copy : bool - If False, data passed to fit are overwritten and running - fit(X).transform(X) will not yield the expected results, - use fit_transform(X) instead. - - iterated_power : int, default=2 - Number of iterations for the power method. - - .. versionchanged:: 0.18 - - whiten : bool, optional - When True (False by default) the `components_` vectors are multiplied - by the square root of (n_samples) and divided by the singular values to - ensure uncorrelated outputs with unit component-wise variances. - - Whitening will remove some information from the transformed signal - (the relative variance scales of the components) but can sometime - improve the predictive accuracy of the downstream estimators by - making their data respect some hard-wired assumptions. - - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - components_ : array, shape (n_components, n_features) - Components with maximum variance. - - explained_variance_ratio_ : array, shape (n_components,) - Percentage of variance explained by each of the selected components. - If k is not set then all components are stored and the sum of explained - variances is equal to 1.0. - - singular_values_ : array, shape (n_components,) - The singular values corresponding to each of the selected components. - The singular values are equal to the 2-norms of the ``n_components`` - variables in the lower-dimensional space. - - mean_ : array, shape (n_features,) - Per-feature empirical mean, estimated from the training set. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.decomposition import RandomizedPCA - >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - >>> pca = RandomizedPCA(n_components=2) - >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - RandomizedPCA(copy=True, iterated_power=2, n_components=2, - random_state=None, whiten=False) - >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS - [ 0.99244... 0.00755...] - >>> print(pca.singular_values_) # doctest: +ELLIPSIS - [ 6.30061... 0.54980...] - - See also - -------- - PCA - TruncatedSVD - - References - ---------- - - .. [Halko2009] `Finding structure with randomness: Stochastic algorithms - for constructing approximate matrix decompositions Halko, et al., 2009 - (arXiv:909)` - - .. 
[MRT] `A randomized algorithm for the decomposition of matrices - Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert` - - """ - - def __init__(self, n_components=None, copy=True, iterated_power=2, - whiten=False, random_state=None): - self.n_components = n_components - self.copy = copy - self.iterated_power = iterated_power - self.whiten = whiten - self.random_state = random_state - - def fit(self, X, y=None): - """Fit the model with X by extracting the first principal components. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Training data, where n_samples in the number of samples - and n_features is the number of features. - - y : Ignored - - Returns - ------- - self : object - Returns the instance itself. - """ - self._fit(check_array(X)) - return self - - def _fit(self, X): - """Fit the model to the data X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples in the number of samples and - n_features is the number of features. - - Returns - ------- - X : ndarray, shape (n_samples, n_features) - The input data, copied, centered and whitened when requested. - """ - random_state = check_random_state(self.random_state) - X = np.atleast_2d(as_float_array(X, copy=self.copy)) - - n_samples = X.shape[0] - - # Center data - self.mean_ = np.mean(X, axis=0) - X -= self.mean_ - if self.n_components is None: - n_components = X.shape[1] - else: - n_components = self.n_components - - U, S, V = randomized_svd(X, n_components, - n_iter=self.iterated_power, - random_state=random_state) - - self.explained_variance_ = exp_var = (S ** 2) / (n_samples - 1) - full_var = np.var(X, ddof=1, axis=0).sum() - self.explained_variance_ratio_ = exp_var / full_var - self.singular_values_ = S # Store the singular values. - - if self.whiten: - self.components_ = V / S[:, np.newaxis] * sqrt(n_samples) - else: - self.components_ = V - - return X - - def transform(self, X): - """Apply dimensionality reduction on X. - - X is projected on the first principal components previous extracted - from a training set. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - - """ - check_is_fitted(self, 'mean_') - - X = check_array(X) - if self.mean_ is not None: - X = X - self.mean_ - - X = np.dot(X, self.components_.T) - return X - - def fit_transform(self, X, y=None): - """Fit the model with X and apply the dimensionality reduction on X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. - - y : Ignored - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - - """ - X = check_array(X) - X = self._fit(X) - return np.dot(X, self.components_.T) - - def inverse_transform(self, X): - """Transform data back to its original space. - - Returns an array X_original whose transform would be X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_components) - New data, where n_samples in the number of samples - and n_components is the number of components. - - Returns - ------- - X_original array-like, shape (n_samples, n_features) - - Notes - ----- - If whitening is enabled, inverse_transform does not compute the - exact inverse operation of transform. 
- """ - check_is_fitted(self, 'mean_') - - X_original = np.dot(X, self.components_) - if self.mean_ is not None: - X_original = X_original + self.mean_ - return X_original diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index f1889d1462d2b..b3cf33a4b2176 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -17,7 +17,6 @@ from sklearn import datasets from sklearn.decomposition import PCA -from sklearn.decomposition import RandomizedPCA from sklearn.decomposition.pca import _assess_dimension_ from sklearn.decomposition.pca import _infer_dimension_ @@ -684,26 +683,6 @@ def test_svd_solver_auto(): assert_array_almost_equal(pca.components_, pca_test.components_) -def test_deprecation_randomized_pca(): - rng = np.random.RandomState(0) - X = rng.random_sample((5, 4)) - - depr_message = ("Class RandomizedPCA is deprecated; RandomizedPCA was " - "deprecated in 0.18 and will be " - "removed in 0.20. Use PCA(svd_solver='randomized') " - "instead. The new implementation DOES NOT store " - "whiten ``components_``. Apply transform to get them.") - - def fit_deprecated(X): - global Y - rpca = RandomizedPCA(random_state=0) - Y = rpca.fit_transform(X) - - assert_warns_message(DeprecationWarning, depr_message, fit_deprecated, X) - Y_pca = PCA(svd_solver='randomized', random_state=0).fit_transform(X) - assert_array_almost_equal(Y, Y_pca) - - def test_pca_sparse_input(): X = np.random.RandomState(0).rand(5, 4) X = sp.sparse.csr_matrix(X) diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index 726f9162eb925..268f8479f7a92 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -100,7 +100,6 @@ class TruncatedSVD(BaseEstimator, TransformerMixin): See also -------- PCA - RandomizedPCA References ---------- From a2e40d78eb088f3deff4ce099d0f45f781ce9665 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 8 Sep 2017 12:17:25 -0400 Subject: [PATCH 09/14] remove references to old GP, GMM and sparse_center_data Remove mixture/gmm --- sklearn/linear_model/tests/test_base.py | 74 -- sklearn/mixture/__init__.py | 14 +- sklearn/mixture/dpgmm.py | 869 ------------------------ sklearn/mixture/gmm.py | 853 ----------------------- sklearn/mixture/tests/test_dpgmm.py | 237 ------- sklearn/mixture/tests/test_gmm.py | 534 --------------- 6 files changed, 1 insertion(+), 2580 deletions(-) delete mode 100644 sklearn/mixture/dpgmm.py delete mode 100644 sklearn/mixture/gmm.py delete mode 100644 sklearn/mixture/tests/test_dpgmm.py delete mode 100644 sklearn/mixture/tests/test_gmm.py diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index ed53e1fbb4aa5..30e4cfdcced42 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -6,17 +6,14 @@ import numpy as np from scipy import sparse from scipy import linalg -from itertools import product from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import ignore_warnings from sklearn.linear_model.base import LinearRegression from sklearn.linear_model.base import _preprocess_data -from sklearn.linear_model.base import sparse_center_data, center_data from sklearn.linear_model.base import _rescale_data from sklearn.utils import check_random_state from sklearn.utils.testing import 
assert_greater @@ -402,74 +399,3 @@ def test_rescale_data(): rescaled_y2 = y * np.sqrt(sample_weight) assert_array_almost_equal(rescaled_X, rescaled_X2) assert_array_almost_equal(rescaled_y, rescaled_y2) - - -@ignore_warnings # all deprecation warnings -def test_deprecation_center_data(): - n_samples = 200 - n_features = 2 - - w = 1.0 + rng.rand(n_samples) - X = rng.rand(n_samples, n_features) - y = rng.rand(n_samples) - - param_grid = product([True, False], [True, False], [True, False], - [None, w]) - - for (fit_intercept, normalize, copy, sample_weight) in param_grid: - - XX = X.copy() # such that we can try copy=False as well - - X1, y1, X1_mean, X1_var, y1_mean = \ - center_data(XX, y, fit_intercept=fit_intercept, - normalize=normalize, copy=copy, - sample_weight=sample_weight) - - XX = X.copy() - - X2, y2, X2_mean, X2_var, y2_mean = \ - _preprocess_data(XX, y, fit_intercept=fit_intercept, - normalize=normalize, copy=copy, - sample_weight=sample_weight) - - assert_array_almost_equal(X1, X2) - assert_array_almost_equal(y1, y2) - assert_array_almost_equal(X1_mean, X2_mean) - assert_array_almost_equal(X1_var, X2_var) - assert_array_almost_equal(y1_mean, y2_mean) - - # Sparse cases - X = sparse.csr_matrix(X) - - for (fit_intercept, normalize, copy, sample_weight) in param_grid: - - X1, y1, X1_mean, X1_var, y1_mean = \ - center_data(X, y, fit_intercept=fit_intercept, normalize=normalize, - copy=copy, sample_weight=sample_weight) - - X2, y2, X2_mean, X2_var, y2_mean = \ - _preprocess_data(X, y, fit_intercept=fit_intercept, - normalize=normalize, copy=copy, - sample_weight=sample_weight, return_mean=False) - - assert_array_almost_equal(X1.toarray(), X2.toarray()) - assert_array_almost_equal(y1, y2) - assert_array_almost_equal(X1_mean, X2_mean) - assert_array_almost_equal(X1_var, X2_var) - assert_array_almost_equal(y1_mean, y2_mean) - - for (fit_intercept, normalize) in product([True, False], [True, False]): - - X1, y1, X1_mean, X1_var, y1_mean = \ - sparse_center_data(X, y, fit_intercept=fit_intercept, - normalize=normalize) - - X2, y2, X2_mean, X2_var, y2_mean = \ - _preprocess_data(X, y, fit_intercept=fit_intercept, - normalize=normalize, return_mean=True) - - assert_array_almost_equal(X1.toarray(), X2.toarray()) - assert_array_almost_equal(y1, y2) - assert_array_almost_equal(X1_mean, X2_mean) - assert_array_almost_equal(X1_var, X2_var) - assert_array_almost_equal(y1_mean, y2_mean) diff --git a/sklearn/mixture/__init__.py b/sklearn/mixture/__init__.py index 3622518352cae..08f55802e201e 100644 --- a/sklearn/mixture/__init__.py +++ b/sklearn/mixture/__init__.py @@ -2,21 +2,9 @@ The :mod:`sklearn.mixture` module implements mixture modeling algorithms. 
""" -from .gmm import sample_gaussian, log_multivariate_normal_density -from .gmm import GMM, distribute_covar_matrix_to_match_covariance_type -from .gmm import _validate_covars -from .dpgmm import DPGMM, VBGMM - from .gaussian_mixture import GaussianMixture from .bayesian_mixture import BayesianGaussianMixture -__all__ = ['DPGMM', - 'GMM', - 'VBGMM', - '_validate_covars', - 'distribute_covar_matrix_to_match_covariance_type', - 'log_multivariate_normal_density', - 'sample_gaussian', - 'GaussianMixture', +__all__ = ['GaussianMixture', 'BayesianGaussianMixture'] diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py deleted file mode 100644 index ddc861b4c19f0..0000000000000 --- a/sklearn/mixture/dpgmm.py +++ /dev/null @@ -1,869 +0,0 @@ -"""Bayesian Gaussian Mixture Models and -Dirichlet Process Gaussian Mixture Models""" -from __future__ import print_function - -# Author: Alexandre Passos (alexandre.tp@gmail.com) -# Bertrand Thirion -# -# Based on mixture.py by: -# Ron Weiss -# Fabian Pedregosa -# - -# Important note for the deprecation cleaning of 0.20 : -# All the function and classes of this file have been deprecated in 0.18. -# When you remove this file please also remove the related files -# - 'sklearn/mixture/gmm.py' -# - 'sklearn/mixture/test_dpgmm.py' -# - 'sklearn/mixture/test_gmm.py' - -import numpy as np -from scipy.special import digamma as _digamma, gammaln as _gammaln -from scipy import linalg -from scipy.linalg import pinvh -from scipy.spatial.distance import cdist - -from ..externals.six.moves import xrange -from ..utils import check_random_state, check_array, deprecated -from ..utils.fixes import logsumexp -from ..utils.extmath import squared_norm, stable_cumsum -from ..utils.validation import check_is_fitted -from .. import cluster -from .gmm import _GMMBase - - -@deprecated("The function digamma is deprecated in 0.18 and " - "will be removed in 0.20. Use scipy.special.digamma instead.") -def digamma(x): - return _digamma(x + np.finfo(np.float32).eps) - - -@deprecated("The function gammaln is deprecated in 0.18 and " - "will be removed in 0.20. Use scipy.special.gammaln instead.") -def gammaln(x): - return _gammaln(x + np.finfo(np.float32).eps) - - -@deprecated("The function log_normalize is deprecated in 0.18 and " - "will be removed in 0.20.") -def log_normalize(v, axis=0): - """Normalized probabilities from unnormalized log-probabilities""" - v = np.rollaxis(v, axis) - v = v.copy() - v -= v.max(axis=0) - out = logsumexp(v) - v = np.exp(v - out) - v += np.finfo(np.float32).eps - v /= np.sum(v, axis=0) - return np.swapaxes(v, 0, axis) - - -@deprecated("The function wishart_log_det is deprecated in 0.18 and " - "will be removed in 0.20.") -def wishart_log_det(a, b, detB, n_features): - """Expected value of the log of the determinant of a Wishart - - The expected value of the logarithm of the determinant of a - wishart-distributed random variable with the specified parameters.""" - l = np.sum(digamma(0.5 * (a - np.arange(-1, n_features - 1)))) - l += n_features * np.log(2) - return l + detB - - -@deprecated("The function wishart_logz is deprecated in 0.18 and " - "will be removed in 0.20.") -def wishart_logz(v, s, dets, n_features): - "The logarithm of the normalization constant for the wishart distribution" - z = 0. 
- z += 0.5 * v * n_features * np.log(2) - z += (0.25 * (n_features * (n_features - 1)) * np.log(np.pi)) - z += 0.5 * v * np.log(dets) - z += np.sum(gammaln(0.5 * (v - np.arange(n_features) + 1))) - return z - - -def _bound_wishart(a, B, detB): - """Returns a function of the dof, scale matrix and its determinant - used as an upper bound in variational approximation of the evidence""" - n_features = B.shape[0] - logprior = wishart_logz(a, B, detB, n_features) - logprior -= wishart_logz(n_features, - np.identity(n_features), - 1, n_features) - logprior += 0.5 * (a - 1) * wishart_log_det(a, B, detB, n_features) - logprior += 0.5 * a * np.trace(B) - return logprior - - -############################################################################## -# Variational bound on the log likelihood of each class -############################################################################## - - -def _sym_quad_form(x, mu, A): - """helper function to calculate symmetric quadratic form x.T * A * x""" - q = (cdist(x, mu[np.newaxis], "mahalanobis", VI=A) ** 2).reshape(-1) - return q - - -def _bound_state_log_lik(X, initial_bound, precs, means, covariance_type): - """Update the bound with likelihood terms, for standard covariance types""" - n_components, n_features = means.shape - n_samples = X.shape[0] - bound = np.empty((n_samples, n_components)) - bound[:] = initial_bound - if covariance_type in ['diag', 'spherical']: - for k in range(n_components): - d = X - means[k] - bound[:, k] -= 0.5 * np.sum(d * d * precs[k], axis=1) - elif covariance_type == 'tied': - for k in range(n_components): - bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs) - elif covariance_type == 'full': - for k in range(n_components): - bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs[k]) - return bound - - -class _DPGMMBase(_GMMBase): - """Variational Inference for the Infinite Gaussian Mixture Model. - - DPGMM stands for Dirichlet Process Gaussian Mixture Model, and it - is an infinite mixture model with the Dirichlet Process as a prior - distribution on the number of clusters. In practice the - approximate inference algorithm uses a truncated distribution with - a fixed maximum number of components, but almost always the number - of components actually used depends on the data. - - Stick-breaking Representation of a Gaussian mixture model - probability distribution. This class allows for easy and efficient - inference of an approximate posterior distribution over the - parameters of a Gaussian mixture model with a variable number of - components (smaller than the truncation parameter n_components). - - Initialization is with normally-distributed means and identity - covariance, for proper convergence. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, default 1 - Number of mixture components. - - covariance_type : string, default 'diag' - String describing the type of covariance parameters to - use. Must be one of 'spherical', 'tied', 'diag', 'full'. - - alpha : float, default 1 - Real number representing the concentration parameter of - the dirichlet process. Intuitively, the Dirichlet Process - is as likely to start a new cluster for a point as it is - to add that point to a cluster with alpha elements. A - higher alpha means more clusters, as the expected number - of clusters is ``alpha*log(N)``. - - tol : float, default 1e-3 - Convergence threshold. - - n_iter : int, default 10 - Maximum number of iterations to perform before convergence. 
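``_sym_quad_form`` above leans on ``scipy.spatial.distance.cdist``: with ``VI=A``, the squared Mahalanobis distance is exactly the quadratic form ``(x - mu)^T A (x - mu)``. A quick self-contained check of that identity::

    import numpy as np
    from scipy.spatial.distance import cdist

    rng = np.random.RandomState(0)
    x, mu = rng.randn(5, 3), rng.randn(3)
    A = np.eye(3)  # any symmetric positive-definite precision matrix

    q_cdist = (cdist(x, mu[np.newaxis], "mahalanobis", VI=A) ** 2).reshape(-1)
    d = x - mu
    q_direct = np.einsum('ij,jk,ik->i', d, A, d)
    assert np.allclose(q_cdist, q_direct)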
- - params : string, default 'wmc' - Controls which parameters are updated in the training - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. - - init_params : string, default 'wmc' - Controls which parameters are updated in the initialization - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - verbose : int, default 0 - Controls output verbosity. - - Attributes - ---------- - covariance_type : string - String describing the type of covariance parameters used by - the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'. - - n_components : int - Number of mixture components. - - weights_ : array, shape (`n_components`,) - Mixing weights for each mixture component. - - means_ : array, shape (`n_components`, `n_features`) - Mean parameters for each mixture component. - - precs_ : array - Precision (inverse covariance) parameters for each mixture - component. The shape depends on `covariance_type`:: - - (`n_components`, 'n_features') if 'spherical', - (`n_features`, `n_features`) if 'tied', - (`n_components`, `n_features`) if 'diag', - (`n_components`, `n_features`, `n_features`) if 'full' - - converged_ : bool - True when convergence was reached in fit(), False otherwise. - - See Also - -------- - GMM : Finite Gaussian mixture model fit with EM - - VBGMM : Finite Gaussian mixture model fit with a variational - algorithm, better for situations where there might be too little - data to get a good estimate of the covariance matrix. - """ - def __init__(self, n_components=1, covariance_type='diag', alpha=1.0, - random_state=None, tol=1e-3, verbose=0, min_covar=None, - n_iter=10, params='wmc', init_params='wmc'): - self.alpha = alpha - super(_DPGMMBase, self).__init__(n_components, covariance_type, - random_state=random_state, - tol=tol, min_covar=min_covar, - n_iter=n_iter, params=params, - init_params=init_params, - verbose=verbose) - - def _get_precisions(self): - """Return precisions as a full matrix.""" - if self.covariance_type == 'full': - return self.precs_ - elif self.covariance_type in ['diag', 'spherical']: - return [np.diag(cov) for cov in self.precs_] - elif self.covariance_type == 'tied': - return [self.precs_] * self.n_components - - def _get_covars(self): - return [pinvh(c) for c in self._get_precisions()] - - def _set_covars(self, covars): - raise NotImplementedError("""The variational algorithm does - not support setting the covariance parameters.""") - - def score_samples(self, X): - """Return the likelihood of the data under the model. - - Compute the bound on log probability of X under the model - and return the posterior distribution (responsibilities) of - each mixture component for each element of X. - - This is done by computing the parameters for the mean-field of - z for each observation. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. 
- - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X - responsibilities : array_like, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation - """ - check_is_fitted(self, 'gamma_') - - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - sd = digamma(self.gamma_.T[1] + self.gamma_.T[2]) - dgamma1 = digamma(self.gamma_.T[1]) - sd - dgamma2 = np.zeros(self.n_components) - dgamma2[0] = digamma(self.gamma_[0, 2]) - digamma(self.gamma_[0, 1] + - self.gamma_[0, 2]) - for j in range(1, self.n_components): - dgamma2[j] = dgamma2[j - 1] + digamma(self.gamma_[j - 1, 2]) - dgamma2[j] -= sd[j - 1] - dgamma = dgamma1 + dgamma2 - # Free memory and developers cognitive load: - del dgamma1, dgamma2, sd - - if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']: - raise NotImplementedError("This ctype is not implemented: %s" - % self.covariance_type) - p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_, - self.precs_, self.means_, - self.covariance_type) - z = p + dgamma - z = log_normalize(z, axis=-1) - bound = np.sum(z * p, axis=-1) - return bound, z - - def _update_concentration(self, z): - """Update the concentration parameters for each cluster""" - sz = np.sum(z, axis=0) - self.gamma_.T[1] = 1. + sz - self.gamma_.T[2].fill(0) - for i in range(self.n_components - 2, -1, -1): - self.gamma_[i, 2] = self.gamma_[i + 1, 2] + sz[i] - self.gamma_.T[2] += self.alpha - - def _update_means(self, X, z): - """Update the variational distributions for the means""" - n_features = X.shape[1] - for k in range(self.n_components): - if self.covariance_type in ['spherical', 'diag']: - num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0) - num *= self.precs_[k] - den = 1. + self.precs_[k] * np.sum(z.T[k]) - self.means_[k] = num / den - elif self.covariance_type in ['tied', 'full']: - if self.covariance_type == 'tied': - cov = self.precs_ - else: - cov = self.precs_[k] - den = np.identity(n_features) + cov * np.sum(z.T[k]) - num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0) - num = np.dot(cov, num) - self.means_[k] = linalg.lstsq(den, num)[0] - - def _update_precisions(self, X, z): - """Update the variational distributions for the precisions""" - n_features = X.shape[1] - if self.covariance_type == 'spherical': - self.dof_ = 0.5 * n_features * np.sum(z, axis=0) - for k in range(self.n_components): - # could be more memory efficient ? - sq_diff = np.sum((X - self.means_[k]) ** 2, axis=1) - self.scale_[k] = 1. - self.scale_[k] += 0.5 * np.sum(z.T[k] * (sq_diff + n_features)) - self.bound_prec_[k] = ( - 0.5 * n_features * ( - digamma(self.dof_[k]) - np.log(self.scale_[k]))) - self.precs_ = np.tile(self.dof_ / self.scale_, [n_features, 1]).T - - elif self.covariance_type == 'diag': - for k in range(self.n_components): - self.dof_[k].fill(1. 
+ 0.5 * np.sum(z.T[k], axis=0)) - sq_diff = (X - self.means_[k]) ** 2 # see comment above - self.scale_[k] = np.ones(n_features) + 0.5 * np.dot( - z.T[k], (sq_diff + 1)) - self.precs_[k] = self.dof_[k] / self.scale_[k] - self.bound_prec_[k] = 0.5 * np.sum(digamma(self.dof_[k]) - - np.log(self.scale_[k])) - self.bound_prec_[k] -= 0.5 * np.sum(self.precs_[k]) - - elif self.covariance_type == 'tied': - self.dof_ = 2 + X.shape[0] + n_features - self.scale_ = (X.shape[0] + 1) * np.identity(n_features) - for k in range(self.n_components): - diff = X - self.means_[k] - self.scale_ += np.dot(diff.T, z[:, k:k + 1] * diff) - self.scale_ = pinvh(self.scale_) - self.precs_ = self.dof_ * self.scale_ - self.det_scale_ = linalg.det(self.scale_) - self.bound_prec_ = 0.5 * wishart_log_det( - self.dof_, self.scale_, self.det_scale_, n_features) - self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_) - - elif self.covariance_type == 'full': - for k in range(self.n_components): - sum_resp = np.sum(z.T[k]) - self.dof_[k] = 2 + sum_resp + n_features - self.scale_[k] = (sum_resp + 1) * np.identity(n_features) - diff = X - self.means_[k] - self.scale_[k] += np.dot(diff.T, z[:, k:k + 1] * diff) - self.scale_[k] = pinvh(self.scale_[k]) - self.precs_[k] = self.dof_[k] * self.scale_[k] - self.det_scale_[k] = linalg.det(self.scale_[k]) - self.bound_prec_[k] = 0.5 * wishart_log_det( - self.dof_[k], self.scale_[k], self.det_scale_[k], - n_features) - self.bound_prec_[k] -= 0.5 * self.dof_[k] * np.trace( - self.scale_[k]) - - def _monitor(self, X, z, n, end=False): - """Monitor the lower bound during iteration - - Debug method to help see exactly when it is failing to converge as - expected. - - Note: this is very expensive and should not be used by default.""" - if self.verbose > 0: - print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z))) - if end: - print("Cluster proportions:", self.gamma_.T[1]) - print("covariance_type:", self.covariance_type) - - def _do_mstep(self, X, z, params): - """Maximize the variational lower bound - - Update each of the parameters to maximize the lower bound.""" - self._monitor(X, z, "z") - self._update_concentration(z) - self._monitor(X, z, "gamma") - if 'm' in params: - self._update_means(X, z) - self._monitor(X, z, "mu") - if 'c' in params: - self._update_precisions(X, z) - self._monitor(X, z, "a and b", end=True) - - def _initialize_gamma(self): - "Initializes the concentration parameters" - self.gamma_ = self.alpha * np.ones((self.n_components, 3)) - - def _bound_concentration(self): - """The variational lower bound for the concentration parameter.""" - logprior = gammaln(self.alpha) * self.n_components - logprior += np.sum((self.alpha - 1) * ( - digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] + - self.gamma_.T[2]))) - logprior += np.sum(- gammaln(self.gamma_.T[1] + self.gamma_.T[2])) - logprior += np.sum(gammaln(self.gamma_.T[1]) + - gammaln(self.gamma_.T[2])) - logprior -= np.sum((self.gamma_.T[1] - 1) * ( - digamma(self.gamma_.T[1]) - digamma(self.gamma_.T[1] + - self.gamma_.T[2]))) - logprior -= np.sum((self.gamma_.T[2] - 1) * ( - digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] + - self.gamma_.T[2]))) - return logprior - - def _bound_means(self): - "The variational lower bound for the mean parameters" - logprior = 0. - logprior -= 0.5 * squared_norm(self.means_) - logprior -= 0.5 * self.means_.shape[1] * self.n_components - return logprior - - def _bound_precisions(self): - """Returns the bound term related to precisions""" - logprior = 0. 
- if self.covariance_type == 'spherical': - logprior += np.sum(gammaln(self.dof_)) - logprior -= np.sum( - (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_))) - logprior += np.sum(- np.log(self.scale_) + self.dof_ - - self.precs_[:, 0]) - elif self.covariance_type == 'diag': - logprior += np.sum(gammaln(self.dof_)) - logprior -= np.sum( - (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_))) - logprior += np.sum(- np.log(self.scale_) + self.dof_ - self.precs_) - elif self.covariance_type == 'tied': - logprior += _bound_wishart(self.dof_, self.scale_, self.det_scale_) - elif self.covariance_type == 'full': - for k in range(self.n_components): - logprior += _bound_wishart(self.dof_[k], - self.scale_[k], - self.det_scale_[k]) - return logprior - - def _bound_proportions(self, z): - """Returns the bound term related to proportions""" - dg12 = digamma(self.gamma_.T[1] + self.gamma_.T[2]) - dg1 = digamma(self.gamma_.T[1]) - dg12 - dg2 = digamma(self.gamma_.T[2]) - dg12 - - cz = stable_cumsum(z[:, ::-1], axis=-1)[:, -2::-1] - logprior = np.sum(cz * dg2[:-1]) + np.sum(z * dg1) - del cz # Save memory - z_non_zeros = z[z > np.finfo(np.float32).eps] - logprior -= np.sum(z_non_zeros * np.log(z_non_zeros)) - return logprior - - def _logprior(self, z): - logprior = self._bound_concentration() - logprior += self._bound_means() - logprior += self._bound_precisions() - logprior += self._bound_proportions(z) - return logprior - - def lower_bound(self, X, z): - """returns a lower bound on model evidence based on X and membership""" - check_is_fitted(self, 'means_') - - if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']: - raise NotImplementedError("This ctype is not implemented: %s" - % self.covariance_type) - X = np.asarray(X) - if X.ndim == 1: - X = X[:, np.newaxis] - c = np.sum(z * _bound_state_log_lik(X, self._initial_bound + - self.bound_prec_, self.precs_, - self.means_, self.covariance_type)) - - return c + self._logprior(z) - - def _set_weights(self): - for i in xrange(self.n_components): - self.weights_[i] = self.gamma_[i, 1] / (self.gamma_[i, 1] - + self.gamma_[i, 2]) - self.weights_ /= np.sum(self.weights_) - - def _fit(self, X, y=None): - """Estimate model parameters with the variational - algorithm. - - For a full derivation and description of the algorithm see - doc/modules/dp-derivation.rst - or - http://scikit-learn.org/stable/modules/dp-derivation.html - - A initialization step is performed before entering the em - algorithm. If you want to avoid this step, set the keyword - argument init_params to the empty string '' when creating - the object. Likewise, if you would like just to do an - initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - responsibilities : array, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation. 
- """ - self.random_state_ = check_random_state(self.random_state) - - # initialization step - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - - n_samples, n_features = X.shape - z = np.ones((n_samples, self.n_components)) - z /= self.n_components - - self._initial_bound = - 0.5 * n_features * np.log(2 * np.pi) - self._initial_bound -= np.log(2 * np.pi * np.e) - - if (self.init_params != '') or not hasattr(self, 'gamma_'): - self._initialize_gamma() - - if 'm' in self.init_params or not hasattr(self, 'means_'): - self.means_ = cluster.KMeans( - n_clusters=self.n_components, - random_state=self.random_state_).fit(X).cluster_centers_[::-1] - - if 'w' in self.init_params or not hasattr(self, 'weights_'): - self.weights_ = np.tile(1.0 / self.n_components, self.n_components) - - if 'c' in self.init_params or not hasattr(self, 'precs_'): - if self.covariance_type == 'spherical': - self.dof_ = np.ones(self.n_components) - self.scale_ = np.ones(self.n_components) - self.precs_ = np.ones((self.n_components, n_features)) - self.bound_prec_ = 0.5 * n_features * ( - digamma(self.dof_) - np.log(self.scale_)) - elif self.covariance_type == 'diag': - self.dof_ = 1 + 0.5 * n_features - self.dof_ *= np.ones((self.n_components, n_features)) - self.scale_ = np.ones((self.n_components, n_features)) - self.precs_ = np.ones((self.n_components, n_features)) - self.bound_prec_ = 0.5 * (np.sum(digamma(self.dof_) - - np.log(self.scale_), 1)) - self.bound_prec_ -= 0.5 * np.sum(self.precs_, 1) - elif self.covariance_type == 'tied': - self.dof_ = 1. - self.scale_ = np.identity(n_features) - self.precs_ = np.identity(n_features) - self.det_scale_ = 1. - self.bound_prec_ = 0.5 * wishart_log_det( - self.dof_, self.scale_, self.det_scale_, n_features) - self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_) - elif self.covariance_type == 'full': - self.dof_ = (1 + self.n_components + n_samples) - self.dof_ *= np.ones(self.n_components) - self.scale_ = [2 * np.identity(n_features) - for _ in range(self.n_components)] - self.precs_ = [np.identity(n_features) - for _ in range(self.n_components)] - self.det_scale_ = np.ones(self.n_components) - self.bound_prec_ = np.zeros(self.n_components) - for k in range(self.n_components): - self.bound_prec_[k] = wishart_log_det( - self.dof_[k], self.scale_[k], self.det_scale_[k], - n_features) - self.bound_prec_[k] -= (self.dof_[k] * - np.trace(self.scale_[k])) - self.bound_prec_ *= 0.5 - - # EM algorithms - current_log_likelihood = None - # reset self.converged_ to False - self.converged_ = False - - for i in range(self.n_iter): - prev_log_likelihood = current_log_likelihood - # Expectation step - curr_logprob, z = self.score_samples(X) - - current_log_likelihood = ( - curr_logprob.mean() + self._logprior(z) / n_samples) - - # Check for convergence. - if prev_log_likelihood is not None: - change = abs(current_log_likelihood - prev_log_likelihood) - if change < self.tol: - self.converged_ = True - break - - # Maximization step - self._do_mstep(X, z, self.params) - - if self.n_iter == 0: - # Need to make sure that there is a z value to output - # Output zeros because it was just a quick initialization - z = np.zeros((X.shape[0], self.n_components)) - - self._set_weights() - - return z - - -@deprecated("The `DPGMM` class is not working correctly and it's better " - "to use `sklearn.mixture.BayesianGaussianMixture` class with " - "parameter `weight_concentration_prior_type='dirichlet_process'` " - "instead. 
DPGMM is deprecated in 0.18 and will be " - "removed in 0.20.") -class DPGMM(_DPGMMBase): - """Dirichlet Process Gaussian Mixture Models - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`sklearn.mixture.BayesianGaussianMixture` with - parameter ``weight_concentration_prior_type='dirichlet_process'`` - instead. - - """ - - def __init__(self, n_components=1, covariance_type='diag', alpha=1.0, - random_state=None, tol=1e-3, verbose=0, min_covar=None, - n_iter=10, params='wmc', init_params='wmc'): - super(DPGMM, self).__init__( - n_components=n_components, covariance_type=covariance_type, - alpha=alpha, random_state=random_state, tol=tol, verbose=verbose, - min_covar=min_covar, n_iter=n_iter, params=params, - init_params=init_params) - - -@deprecated("The `VBGMM` class is not working correctly and it's better " - "to use `sklearn.mixture.BayesianGaussianMixture` class with " - "parameter `weight_concentration_prior_type=" - "'dirichlet_distribution'` instead. " - "VBGMM is deprecated in 0.18 and will be removed in 0.20.") -class VBGMM(_DPGMMBase): - """Variational Inference for the Gaussian Mixture Model - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`sklearn.mixture.BayesianGaussianMixture` with parameter - ``weight_concentration_prior_type='dirichlet_distribution'`` instead. - - Variational inference for a Gaussian mixture model probability - distribution. This class allows for easy and efficient inference - of an approximate posterior distribution over the parameters of a - Gaussian mixture model with a fixed number of components. - - Initialization is with normally-distributed means and identity - covariance, for proper convergence. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, default 1 - Number of mixture components. - - covariance_type : string, default 'diag' - String describing the type of covariance parameters to - use. Must be one of 'spherical', 'tied', 'diag', 'full'. - - alpha : float, default 1 - Real number representing the concentration parameter of - the dirichlet distribution. Intuitively, the higher the - value of alpha the more likely the variational mixture of - Gaussians model will use all components it can. - - tol : float, default 1e-3 - Convergence threshold. - - n_iter : int, default 10 - Maximum number of iterations to perform before convergence. - - params : string, default 'wmc' - Controls which parameters are updated in the training - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. - - init_params : string, default 'wmc' - Controls which parameters are updated in the initialization - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - verbose : int, default 0 - Controls output verbosity. - - Attributes - ---------- - covariance_type : string - String describing the type of covariance parameters used by - the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'. - - n_features : int - Dimensionality of the Gaussians. - - n_components : int (read-only) - Number of mixture components. - - weights_ : array, shape (`n_components`,) - Mixing weights for each mixture component. - - means_ : array, shape (`n_components`, `n_features`) - Mean parameters for each mixture component. - - precs_ : array - Precision (inverse covariance) parameters for each mixture - component. 
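A migration sketch covering both deprecated classes, following the deprecation messages above; note that the ``alpha`` concentration parameter maps only approximately onto ``weight_concentration_prior``::

    from sklearn.mixture import BayesianGaussianMixture

    # DPGMM(n_components=5, alpha=1.0)  ->
    dp = BayesianGaussianMixture(
        n_components=5,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=1.0)

    # VBGMM(n_components=5, alpha=1.0)  ->
    vb = BayesianGaussianMixture(
        n_components=5,
        weight_concentration_prior_type='dirichlet_distribution',
        weight_concentration_prior=1.0)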
The shape depends on `covariance_type`:: - - (`n_components`, 'n_features') if 'spherical', - (`n_features`, `n_features`) if 'tied', - (`n_components`, `n_features`) if 'diag', - (`n_components`, `n_features`, `n_features`) if 'full' - - converged_ : bool - True when convergence was reached in fit(), False - otherwise. - - See Also - -------- - GMM : Finite Gaussian mixture model fit with EM - DPGMM : Infinite Gaussian mixture model, using the dirichlet - process, fit with a variational algorithm - """ - - def __init__(self, n_components=1, covariance_type='diag', alpha=1.0, - random_state=None, tol=1e-3, verbose=0, - min_covar=None, n_iter=10, params='wmc', init_params='wmc'): - super(VBGMM, self).__init__( - n_components, covariance_type, random_state=random_state, - tol=tol, verbose=verbose, min_covar=min_covar, - n_iter=n_iter, params=params, init_params=init_params) - self.alpha = alpha - - def _fit(self, X, y=None): - """Estimate model parameters with the variational algorithm. - - For a full derivation and description of the algorithm see - doc/modules/dp-derivation.rst - or - http://scikit-learn.org/stable/modules/dp-derivation.html - - A initialization step is performed before entering the EM - algorithm. If you want to avoid this step, set the keyword - argument init_params to the empty string '' when creating - the object. Likewise, if you just would like to do an - initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - responsibilities : array, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation. - """ - self.alpha_ = float(self.alpha) / self.n_components - return super(VBGMM, self)._fit(X, y) - - def score_samples(self, X): - """Return the likelihood of the data under the model. - - Compute the bound on log probability of X under the model - and return the posterior distribution (responsibilities) of - each mixture component for each element of X. - - This is done by computing the parameters for the mean-field of - z for each observation. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X - responsibilities : array_like, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation - """ - check_is_fitted(self, 'gamma_') - - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - dg = digamma(self.gamma_) - digamma(np.sum(self.gamma_)) - - if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']: - raise NotImplementedError("This ctype is not implemented: %s" - % self.covariance_type) - p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_, - self.precs_, self.means_, - self.covariance_type) - - z = p + dg - z = log_normalize(z, axis=-1) - bound = np.sum(z * p, axis=-1) - return bound, z - - def _update_concentration(self, z): - for i in range(self.n_components): - self.gamma_[i] = self.alpha_ + np.sum(z.T[i]) - - def _initialize_gamma(self): - self.gamma_ = self.alpha_ * np.ones(self.n_components) - - def _bound_proportions(self, z): - logprior = 0. 
- dg = digamma(self.gamma_) - dg -= digamma(np.sum(self.gamma_)) - logprior += np.sum(dg.reshape((-1, 1)) * z.T) - z_non_zeros = z[z > np.finfo(np.float32).eps] - logprior -= np.sum(z_non_zeros * np.log(z_non_zeros)) - return logprior - - def _bound_concentration(self): - logprior = gammaln(np.sum(self.gamma_)) - gammaln(self.n_components - * self.alpha_) - logprior -= np.sum(gammaln(self.gamma_) - gammaln(self.alpha_)) - sg = digamma(np.sum(self.gamma_)) - logprior += np.sum((self.gamma_ - self.alpha_) - * (digamma(self.gamma_) - sg)) - return logprior - - def _monitor(self, X, z, n, end=False): - """Monitor the lower bound during iteration - - Debug method to help see exactly when it is failing to converge as - expected. - - Note: this is very expensive and should not be used by default.""" - if self.verbose > 0: - print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z))) - if end: - print("Cluster proportions:", self.gamma_) - print("covariance_type:", self.covariance_type) - - def _set_weights(self): - self.weights_[:] = self.gamma_ - self.weights_ /= np.sum(self.weights_) diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py deleted file mode 100644 index 207eff9f1502a..0000000000000 --- a/sklearn/mixture/gmm.py +++ /dev/null @@ -1,853 +0,0 @@ -""" -Gaussian Mixture Models. - -This implementation corresponds to frequentist (non-Bayesian) formulation -of Gaussian Mixture Models. -""" - -# Author: Ron Weiss -# Fabian Pedregosa -# Bertrand Thirion - -# Important note for the deprecation cleaning of 0.20 : -# All the functions and classes of this file have been deprecated in 0.18. -# When you remove this file please also remove the related files -# - 'sklearn/mixture/dpgmm.py' -# - 'sklearn/mixture/test_dpgmm.py' -# - 'sklearn/mixture/test_gmm.py' -from time import time - -import numpy as np -from scipy import linalg - -from ..base import BaseEstimator -from ..utils import check_random_state, check_array, deprecated -from ..utils.fixes import logsumexp -from ..utils.validation import check_is_fitted -from .. import cluster - -from sklearn.externals.six.moves import zip - -EPS = np.finfo(float).eps - -@deprecated("The function log_multivariate_normal_density is deprecated in 0.18" - " and will be removed in 0.20.") -def log_multivariate_normal_density(X, means, covars, covariance_type='diag'): - """Compute the log probability under a multivariate Gaussian distribution. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row corresponds to a - single data point. - - means : array_like, shape (n_components, n_features) - List of n_features-dimensional mean vectors for n_components Gaussians. - Each row corresponds to a single mean vector. - - covars : array_like - List of n_components covariance parameters for each Gaussian. The shape - depends on `covariance_type`: - (n_components, n_features) if 'spherical', - (n_features, n_features) if 'tied', - (n_components, n_features) if 'diag', - (n_components, n_features, n_features) if 'full' - - covariance_type : string - Type of the covariance parameters. Must be one of - 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'. - - Returns - ------- - lpr : array_like, shape (n_samples, n_components) - Array containing the log probabilities of each data point in - X under each of the n_components multivariate Gaussian distributions. 
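[Reviewer note: with this helper removed, its 'full'-covariance branch can be reproduced with scipy.stats. A sketch under that assumption only — the deprecated function also dispatched on 'spherical', 'tied' and 'diag':

import numpy as np
from scipy.stats import multivariate_normal

def log_density_full(X, means, covars):
    # One column of per-sample log-densities for each mixture component.
    return np.column_stack([
        multivariate_normal(mean=mu, cov=cv).logpdf(X)
        for mu, cv in zip(means, covars)])
]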
- """ - log_multivariate_normal_density_dict = { - 'spherical': _log_multivariate_normal_density_spherical, - 'tied': _log_multivariate_normal_density_tied, - 'diag': _log_multivariate_normal_density_diag, - 'full': _log_multivariate_normal_density_full} - return log_multivariate_normal_density_dict[covariance_type]( - X, means, covars) - - -@deprecated("The function sample_gaussian is deprecated in 0.18" - " and will be removed in 0.20." - " Use numpy.random.multivariate_normal instead.") -def sample_gaussian(mean, covar, covariance_type='diag', n_samples=1, - random_state=None): - """Generate random samples from a Gaussian distribution. - - Parameters - ---------- - mean : array_like, shape (n_features,) - Mean of the distribution. - - covar : array_like - Covariance of the distribution. The shape depends on `covariance_type`: - scalar if 'spherical', - (n_features) if 'diag', - (n_features, n_features) if 'tied', or 'full' - - covariance_type : string, optional - Type of the covariance parameters. Must be one of - 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'. - - n_samples : int, optional - Number of samples to generate. Defaults to 1. - - Returns - ------- - X : array - Randomly generated sample. The shape depends on `n_samples`: - (n_features,) if `1` - (n_features, n_samples) otherwise - """ - return _sample_gaussian(mean, covar, covariance_type='diag', n_samples=1, - random_state=None) - - -def _sample_gaussian(mean, covar, covariance_type='diag', n_samples=1, - random_state=None): - rng = check_random_state(random_state) - n_dim = len(mean) - rand = rng.randn(n_dim, n_samples) - if n_samples == 1: - rand.shape = (n_dim,) - - if covariance_type == 'spherical': - rand *= np.sqrt(covar) - elif covariance_type == 'diag': - rand = np.dot(np.diag(np.sqrt(covar)), rand) - else: - s, U = linalg.eigh(covar) - s.clip(0, out=s) # get rid of tiny negatives - np.sqrt(s, out=s) - U *= s - rand = np.dot(U, rand) - - return (rand.T + mean).T - - -class _GMMBase(BaseEstimator): - """Gaussian Mixture Model. - - Representation of a Gaussian mixture model probability distribution. - This class allows for easy evaluation of, sampling from, and - maximum-likelihood estimation of the parameters of a GMM distribution. - - Initializes parameters such that every mixture component has zero - mean and identity covariance. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, optional - Number of mixture components. Defaults to 1. - - covariance_type : string, optional - String describing the type of covariance parameters to - use. Must be one of 'spherical', 'tied', 'diag', 'full'. - Defaults to 'diag'. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - min_covar : float, optional - Floor on the diagonal of the covariance matrix to prevent - overfitting. Defaults to 1e-3. - - tol : float, optional - Convergence threshold. EM iterations will stop when average - gain in log-likelihood is below this threshold. Defaults to 1e-3. - - n_iter : int, optional - Number of EM iterations to perform. - - n_init : int, optional - Number of initializations to perform. The best results is kept. - - params : string, optional - Controls which parameters are updated in the training - process. 
Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - init_params : string, optional - Controls which parameters are updated in the initialization - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - verbose : int, default: 0 - Enable verbose output. If 1 then it always prints the current - initialization and iteration step. If greater than 1 then - it prints additionally the change and time needed for each step. - - Attributes - ---------- - weights_ : array, shape (`n_components`,) - This attribute stores the mixing weights for each mixture component. - - means_ : array, shape (`n_components`, `n_features`) - Mean parameters for each mixture component. - - covars_ : array - Covariance parameters for each mixture component. The shape - depends on `covariance_type`:: - - (n_components, n_features) if 'spherical', - (n_features, n_features) if 'tied', - (n_components, n_features) if 'diag', - (n_components, n_features, n_features) if 'full' - - converged_ : bool - True when convergence was reached in fit(), False otherwise. - - See Also - -------- - - DPGMM : Infinite gaussian mixture model, using the Dirichlet - process, fit with a variational algorithm - - - VBGMM : Finite gaussian mixture model fit with a variational - algorithm, better for situations where there might be too little - data to get a good estimate of the covariance matrix. - - Examples - -------- - - >>> import numpy as np - >>> from sklearn import mixture - >>> np.random.seed(1) - >>> g = mixture.GMM(n_components=2) - >>> # Generate random observations with two modes centered on 0 - >>> # and 10 to use for training. - >>> obs = np.concatenate((np.random.randn(100, 1), - ... 10 + np.random.randn(300, 1))) - >>> g.fit(obs) # doctest: +NORMALIZE_WHITESPACE - GMM(covariance_type='diag', init_params='wmc', min_covar=0.001, - n_components=2, n_init=1, n_iter=100, params='wmc', - random_state=None, tol=0.001, verbose=0) - >>> np.round(g.weights_, 2) - array([ 0.75, 0.25]) - >>> np.round(g.means_, 2) - array([[ 10.05], - [ 0.06]]) - >>> np.round(g.covars_, 2) # doctest: +SKIP - array([[[ 1.02]], - [[ 0.96]]]) - >>> g.predict([[0], [2], [9], [10]]) # doctest: +ELLIPSIS - array([1, 1, 0, 0]...) - >>> np.round(g.score([[0], [2], [9], [10]]), 2) - array([-2.19, -4.58, -1.75, -1.21]) - >>> # Refit the model on new data (initial parameters remain the - >>> # same), this time with an even split between the two modes. 
- >>> g.fit(20 * [[0]] + 20 * [[10]]) # doctest: +NORMALIZE_WHITESPACE - GMM(covariance_type='diag', init_params='wmc', min_covar=0.001, - n_components=2, n_init=1, n_iter=100, params='wmc', - random_state=None, tol=0.001, verbose=0) - >>> np.round(g.weights_, 2) - array([ 0.5, 0.5]) - - """ - - def __init__(self, n_components=1, covariance_type='diag', - random_state=None, tol=1e-3, min_covar=1e-3, - n_iter=100, n_init=1, params='wmc', init_params='wmc', - verbose=0): - self.n_components = n_components - self.covariance_type = covariance_type - self.tol = tol - self.min_covar = min_covar - self.random_state = random_state - self.n_iter = n_iter - self.n_init = n_init - self.params = params - self.init_params = init_params - self.verbose = verbose - - if covariance_type not in ['spherical', 'tied', 'diag', 'full']: - raise ValueError('Invalid value for covariance_type: %s' % - covariance_type) - - if n_init < 1: - raise ValueError('GMM estimation requires at least one run') - - def _get_covars(self): - """Covariance parameters for each mixture component. - - The shape depends on ``cvtype``:: - - (n_states, n_features) if 'spherical', - (n_features, n_features) if 'tied', - (n_states, n_features) if 'diag', - (n_states, n_features, n_features) if 'full' - - """ - if self.covariance_type == 'full': - return self.covars_ - elif self.covariance_type == 'diag': - return [np.diag(cov) for cov in self.covars_] - elif self.covariance_type == 'tied': - return [self.covars_] * self.n_components - elif self.covariance_type == 'spherical': - return [np.diag(cov) for cov in self.covars_] - - def _set_covars(self, covars): - """Provide values for covariance.""" - covars = np.asarray(covars) - _validate_covars(covars, self.covariance_type, self.n_components) - self.covars_ = covars - - def score_samples(self, X): - """Return the per-sample likelihood of the data under the model. - - Compute the log probability of X under the model and - return the posterior distribution (responsibilities) of each - mixture component for each element of X. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X. - - responsibilities : array_like, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation - """ - check_is_fitted(self, 'means_') - - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - if X.size == 0: - return np.array([]), np.empty((0, self.n_components)) - if X.shape[1] != self.means_.shape[1]: - raise ValueError('The shape of X is not compatible with self') - - lpr = (log_multivariate_normal_density(X, self.means_, self.covars_, - self.covariance_type) + - np.log(self.weights_)) - logprob = logsumexp(lpr, axis=1) - responsibilities = np.exp(lpr - logprob[:, np.newaxis]) - return logprob, responsibilities - - def score(self, X, y=None): - """Compute the log probability under the model. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X - """ - logprob, _ = self.score_samples(X) - return logprob - - def predict(self, X): - """Predict label for data. 
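[Reviewer note: the normalization at the heart of score_samples above is worth isolating — responsibilities are a softmax over per-component log densities plus log mixing weights. A standalone sketch, assuming a recent SciPy that exposes logsumexp in scipy.special:

import numpy as np
from scipy.special import logsumexp

def e_step(log_densities, weights):
    # logsumexp keeps the per-sample normalization numerically stable.
    lpr = log_densities + np.log(weights)        # (n_samples, n_components)
    logprob = logsumexp(lpr, axis=1)             # per-sample log-likelihood
    resp = np.exp(lpr - logprob[:, np.newaxis])  # rows sum to one
    return logprob, resp
]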
- - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - C : array, shape = (n_samples,) component memberships - """ - logprob, responsibilities = self.score_samples(X) - return responsibilities.argmax(axis=1) - - def predict_proba(self, X): - """Predict posterior probability of data under each Gaussian - in the model. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - responsibilities : array-like, shape = (n_samples, n_components) - Returns the probability of the sample for each Gaussian - (state) in the model. - """ - logprob, responsibilities = self.score_samples(X) - return responsibilities - - def sample(self, n_samples=1, random_state=None): - """Generate random samples from the model. - - Parameters - ---------- - n_samples : int, optional - Number of samples to generate. Defaults to 1. - - Returns - ------- - X : array_like, shape (n_samples, n_features) - List of samples - """ - check_is_fitted(self, 'means_') - - if random_state is None: - random_state = self.random_state - random_state = check_random_state(random_state) - weight_cdf = np.cumsum(self.weights_) - - X = np.empty((n_samples, self.means_.shape[1])) - rand = random_state.rand(n_samples) - # decide which component to use for each sample - comps = weight_cdf.searchsorted(rand) - # for each component, generate all needed samples - for comp in range(self.n_components): - # occurrences of current component in X - comp_in_X = (comp == comps) - # number of those occurrences - num_comp_in_X = comp_in_X.sum() - if num_comp_in_X > 0: - if self.covariance_type == 'tied': - cv = self.covars_ - elif self.covariance_type == 'spherical': - cv = self.covars_[comp][0] - else: - cv = self.covars_[comp] - X[comp_in_X] = _sample_gaussian( - self.means_[comp], cv, self.covariance_type, - num_comp_in_X, random_state=random_state).T - return X - - def fit_predict(self, X, y=None): - """Fit and then predict labels for data. - - Warning: Due to the final maximization step in the EM algorithm, - with low iterations the prediction may not be 100% accurate. - - .. versionadded:: 0.17 - *fit_predict* method in Gaussian Mixture Model. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - C : array, shape = (n_samples,) component memberships - """ - return self._fit(X, y).argmax(axis=1) - - def _fit(self, X, y=None, do_prediction=False): - """Estimate model parameters with the EM algorithm. - - A initialization step is performed before entering the - expectation-maximization (EM) algorithm. If you want to avoid - this step, set the keyword argument init_params to the empty - string '' when creating the GMM object. Likewise, if you would - like just to do an initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - responsibilities : array, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation. 
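[Reviewer note: the inverse-CDF component selection used in sample() above (weight_cdf.searchsorted) generalizes to any mixture. A standalone sketch, assuming full covariance matrices and a NumPy RandomState `rng`:

import numpy as np

def sample_mixture(weights, means, covars, n_samples, rng):
    # Draw a component index per sample from the CDF of the mixing
    # weights, then draw from the selected Gaussian.
    comps = np.cumsum(weights).searchsorted(rng.rand(n_samples))
    return np.array([rng.multivariate_normal(means[c], covars[c])
                     for c in comps])
]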
- """ - - # initialization step - X = check_array(X, dtype=np.float64, ensure_min_samples=2, - estimator=self) - if X.shape[0] < self.n_components: - raise ValueError( - 'GMM estimation with %s components, but got only %s samples' % - (self.n_components, X.shape[0])) - - max_log_prob = -np.infty - - if self.verbose > 0: - print('Expectation-maximization algorithm started.') - - for init in range(self.n_init): - if self.verbose > 0: - print('Initialization ' + str(init + 1)) - start_init_time = time() - - if 'm' in self.init_params or not hasattr(self, 'means_'): - self.means_ = cluster.KMeans( - n_clusters=self.n_components, - random_state=self.random_state).fit(X).cluster_centers_ - if self.verbose > 1: - print('\tMeans have been initialized.') - - if 'w' in self.init_params or not hasattr(self, 'weights_'): - self.weights_ = np.tile(1.0 / self.n_components, - self.n_components) - if self.verbose > 1: - print('\tWeights have been initialized.') - - if 'c' in self.init_params or not hasattr(self, 'covars_'): - cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1]) - if not cv.shape: - cv.shape = (1, 1) - self.covars_ = \ - distribute_covar_matrix_to_match_covariance_type( - cv, self.covariance_type, self.n_components) - if self.verbose > 1: - print('\tCovariance matrices have been initialized.') - - # EM algorithms - current_log_likelihood = None - # reset self.converged_ to False - self.converged_ = False - - for i in range(self.n_iter): - if self.verbose > 0: - print('\tEM iteration ' + str(i + 1)) - start_iter_time = time() - prev_log_likelihood = current_log_likelihood - # Expectation step - log_likelihoods, responsibilities = self.score_samples(X) - current_log_likelihood = log_likelihoods.mean() - - # Check for convergence. - if prev_log_likelihood is not None: - change = abs(current_log_likelihood - prev_log_likelihood) - if self.verbose > 1: - print('\t\tChange: ' + str(change)) - if change < self.tol: - self.converged_ = True - if self.verbose > 0: - print('\t\tEM algorithm converged.') - break - - # Maximization step - self._do_mstep(X, responsibilities, self.params, - self.min_covar) - if self.verbose > 1: - print('\t\tEM iteration ' + str(i + 1) + ' took {0:.5f}s'.format( - time() - start_iter_time)) - - # if the results are better, keep it - if self.n_iter: - if current_log_likelihood > max_log_prob: - max_log_prob = current_log_likelihood - best_params = {'weights': self.weights_, - 'means': self.means_, - 'covars': self.covars_} - if self.verbose > 1: - print('\tBetter parameters were found.') - - if self.verbose > 1: - print('\tInitialization ' + str(init + 1) + ' took {0:.5f}s'.format( - time() - start_init_time)) - - # check the existence of an init param that was not subject to - # likelihood computation issue. - if np.isneginf(max_log_prob) and self.n_iter: - raise RuntimeError( - "EM algorithm was never able to compute a valid likelihood " + - "given initial parameters. Try different init parameters " + - "(or increasing n_init) or check for degenerate data.") - - if self.n_iter: - self.covars_ = best_params['covars'] - self.means_ = best_params['means'] - self.weights_ = best_params['weights'] - else: # self.n_iter == 0 occurs when using GMM within HMM - # Need to make sure that there are responsibilities to output - # Output zeros because it was just a quick initialization - responsibilities = np.zeros((X.shape[0], self.n_components)) - - return responsibilities - - def fit(self, X, y=None): - """Estimate model parameters with the EM algorithm. 
- - A initialization step is performed before entering the - expectation-maximization (EM) algorithm. If you want to avoid - this step, set the keyword argument init_params to the empty - string '' when creating the GMM object. Likewise, if you would - like just to do an initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - self - """ - self._fit(X, y) - return self - - def _do_mstep(self, X, responsibilities, params, min_covar=0): - """Perform the Mstep of the EM algorithm and return the cluster weights. - """ - weights = responsibilities.sum(axis=0) - weighted_X_sum = np.dot(responsibilities.T, X) - inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS) - - if 'w' in params: - self.weights_ = (weights / (weights.sum() + 10 * EPS) + EPS) - if 'm' in params: - self.means_ = weighted_X_sum * inverse_weights - if 'c' in params: - covar_mstep_func = _covar_mstep_funcs[self.covariance_type] - self.covars_ = covar_mstep_func( - self, X, responsibilities, weighted_X_sum, inverse_weights, - min_covar) - return weights - - def _n_parameters(self): - """Return the number of free parameters in the model.""" - ndim = self.means_.shape[1] - if self.covariance_type == 'full': - cov_params = self.n_components * ndim * (ndim + 1) / 2. - elif self.covariance_type == 'diag': - cov_params = self.n_components * ndim - elif self.covariance_type == 'tied': - cov_params = ndim * (ndim + 1) / 2. - elif self.covariance_type == 'spherical': - cov_params = self.n_components - mean_params = ndim * self.n_components - return int(cov_params + mean_params + self.n_components - 1) - - def bic(self, X): - """Bayesian information criterion for the current model fit - and the proposed data. - - Parameters - ---------- - X : array of shape(n_samples, n_dimensions) - - Returns - ------- - bic : float (the lower the better) - """ - return (-2 * self.score(X).sum() + - self._n_parameters() * np.log(X.shape[0])) - - def aic(self, X): - """Akaike information criterion for the current model fit - and the proposed data. - - Parameters - ---------- - X : array of shape(n_samples, n_dimensions) - - Returns - ------- - aic : float (the lower the better) - """ - return - 2 * self.score(X).sum() + 2 * self._n_parameters() - - -@deprecated("The class GMM is deprecated in 0.18 and will be " - " removed in 0.20. Use class GaussianMixture instead.") -class GMM(_GMMBase): - """ - Legacy Gaussian Mixture Model - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`sklearn.mixture.GaussianMixture` instead. 
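[Reviewer note: mirroring the deprecation note, the GMM doctest earlier in this file translates to GaussianMixture roughly as follows — a sketch, noting that in the new API `n_iter` became `max_iter` and fitted covariances live in `covariances_` rather than `covars_`:

import numpy as np
from sklearn.mixture import GaussianMixture

np.random.seed(1)
obs = np.concatenate((np.random.randn(100, 1),
                      10 + np.random.randn(300, 1)))

g = GaussianMixture(n_components=2, covariance_type='diag', max_iter=100)
g.fit(obs)
print(np.round(g.weights_, 2))           # mixing weights
print(g.predict([[0], [2], [9], [10]]))  # component memberships
]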
- - """ - - def __init__(self, n_components=1, covariance_type='diag', - random_state=None, tol=1e-3, min_covar=1e-3, - n_iter=100, n_init=1, params='wmc', init_params='wmc', - verbose=0): - super(GMM, self).__init__( - n_components=n_components, covariance_type=covariance_type, - random_state=random_state, tol=tol, min_covar=min_covar, - n_iter=n_iter, n_init=n_init, params=params, - init_params=init_params, verbose=verbose) - -######################################################################### -# some helper routines -######################################################################### - - -def _log_multivariate_normal_density_diag(X, means, covars): - """Compute Gaussian log-density at X for a diagonal model.""" - n_samples, n_dim = X.shape - lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.sum(np.log(covars), 1) - + np.sum((means ** 2) / covars, 1) - - 2 * np.dot(X, (means / covars).T) - + np.dot(X ** 2, (1.0 / covars).T)) - return lpr - - -def _log_multivariate_normal_density_spherical(X, means, covars): - """Compute Gaussian log-density at X for a spherical model.""" - cv = covars.copy() - if covars.ndim == 1: - cv = cv[:, np.newaxis] - if cv.shape[1] == 1: - cv = np.tile(cv, (1, X.shape[-1])) - return _log_multivariate_normal_density_diag(X, means, cv) - - -def _log_multivariate_normal_density_tied(X, means, covars): - """Compute Gaussian log-density at X for a tied model.""" - cv = np.tile(covars, (means.shape[0], 1, 1)) - return _log_multivariate_normal_density_full(X, means, cv) - - -def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7): - """Log probability for full covariance matrices.""" - n_samples, n_dim = X.shape - nmix = len(means) - log_prob = np.empty((n_samples, nmix)) - for c, (mu, cv) in enumerate(zip(means, covars)): - try: - cv_chol = linalg.cholesky(cv, lower=True) - except linalg.LinAlgError: - # The model is most probably stuck in a component with too - # few observations, we need to reinitialize this components - try: - cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), - lower=True) - except linalg.LinAlgError: - raise ValueError("'covars' must be symmetric, " - "positive-definite") - - cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) - cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T - log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) + - n_dim * np.log(2 * np.pi) + cv_log_det) - - return log_prob - - -def _validate_covars(covars, covariance_type, n_components): - """Do basic checks on matrix covariance sizes and values.""" - from scipy import linalg - if covariance_type == 'spherical': - if len(covars) != n_components: - raise ValueError("'spherical' covars have length n_components") - elif np.any(covars <= 0): - raise ValueError("'spherical' covars must be non-negative") - elif covariance_type == 'tied': - if covars.shape[0] != covars.shape[1]: - raise ValueError("'tied' covars must have shape (n_dim, n_dim)") - elif (not np.allclose(covars, covars.T) - or np.any(linalg.eigvalsh(covars) <= 0)): - raise ValueError("'tied' covars must be symmetric, " - "positive-definite") - elif covariance_type == 'diag': - if len(covars.shape) != 2: - raise ValueError("'diag' covars must have shape " - "(n_components, n_dim)") - elif np.any(covars <= 0): - raise ValueError("'diag' covars must be non-negative") - elif covariance_type == 'full': - if len(covars.shape) != 3: - raise ValueError("'full' covars must have shape " - "(n_components, n_dim, n_dim)") - elif covars.shape[1] != covars.shape[2]: - raise 
ValueError("'full' covars must have shape " - "(n_components, n_dim, n_dim)") - for n, cv in enumerate(covars): - if (not np.allclose(cv, cv.T) - or np.any(linalg.eigvalsh(cv) <= 0)): - raise ValueError("component %d of 'full' covars must be " - "symmetric, positive-definite" % n) - else: - raise ValueError("covariance_type must be one of " + - "'spherical', 'tied', 'diag', 'full'") - - -@deprecated("The function distribute_covar_matrix_to_match_covariance_type" - "is deprecated in 0.18 and will be removed in 0.20.") -def distribute_covar_matrix_to_match_covariance_type( - tied_cv, covariance_type, n_components): - """Create all the covariance matrices from a given template.""" - if covariance_type == 'spherical': - cv = np.tile(tied_cv.mean() * np.ones(tied_cv.shape[1]), - (n_components, 1)) - elif covariance_type == 'tied': - cv = tied_cv - elif covariance_type == 'diag': - cv = np.tile(np.diag(tied_cv), (n_components, 1)) - elif covariance_type == 'full': - cv = np.tile(tied_cv, (n_components, 1, 1)) - else: - raise ValueError("covariance_type must be one of " + - "'spherical', 'tied', 'diag', 'full'") - return cv - - -def _covar_mstep_diag(gmm, X, responsibilities, weighted_X_sum, norm, - min_covar): - """Perform the covariance M step for diagonal cases.""" - avg_X2 = np.dot(responsibilities.T, X * X) * norm - avg_means2 = gmm.means_ ** 2 - avg_X_means = gmm.means_ * weighted_X_sum * norm - return avg_X2 - 2 * avg_X_means + avg_means2 + min_covar - - -def _covar_mstep_spherical(*args): - """Perform the covariance M step for spherical cases.""" - cv = _covar_mstep_diag(*args) - return np.tile(cv.mean(axis=1)[:, np.newaxis], (1, cv.shape[1])) - - -def _covar_mstep_full(gmm, X, responsibilities, weighted_X_sum, norm, - min_covar): - """Perform the covariance M step for full cases.""" - # Eq. 12 from K. Murphy, "Fitting a Conditional Linear Gaussian - # Distribution" - n_features = X.shape[1] - cv = np.empty((gmm.n_components, n_features, n_features)) - for c in range(gmm.n_components): - post = responsibilities[:, c] - mu = gmm.means_[c] - diff = X - mu - with np.errstate(under='ignore'): - # Underflow Errors in doing post * X.T are not important - avg_cv = np.dot(post * diff.T, diff) / (post.sum() + 10 * EPS) - cv[c] = avg_cv + min_covar * np.eye(n_features) - return cv - - -def _covar_mstep_tied(gmm, X, responsibilities, weighted_X_sum, norm, - min_covar): - """Perform the covariance M step for tied cases.""" - # Eq. 15 from K. Murphy, "Fitting a Conditional Linear Gaussian - # Distribution" - avg_X2 = np.dot(X.T, X) - avg_means2 = np.dot(gmm.means_.T, weighted_X_sum) - out = avg_X2 - avg_means2 - out *= 1. / X.shape[0] - out.flat[::len(out) + 1] += min_covar - return out - -_covar_mstep_funcs = {'spherical': _covar_mstep_spherical, - 'diag': _covar_mstep_diag, - 'tied': _covar_mstep_tied, - 'full': _covar_mstep_full, - } diff --git a/sklearn/mixture/tests/test_dpgmm.py b/sklearn/mixture/tests/test_dpgmm.py deleted file mode 100644 index 8ca38626b4cef..0000000000000 --- a/sklearn/mixture/tests/test_dpgmm.py +++ /dev/null @@ -1,237 +0,0 @@ -# Important note for the deprecation cleaning of 0.20 : -# All the function and classes of this file have been deprecated in 0.18. 
-# When you remove this file please also remove the related files -# - 'sklearn/mixture/dpgmm.py' -# - 'sklearn/mixture/gmm.py' -# - 'sklearn/mixture/test_gmm.py' -import unittest -import sys - -import numpy as np - -from sklearn.mixture import DPGMM, VBGMM -from sklearn.mixture.dpgmm import log_normalize -from sklearn.datasets import make_blobs -from sklearn.utils.testing import assert_array_less, assert_equal -from sklearn.utils.testing import assert_warns_message, ignore_warnings -from sklearn.mixture.tests.test_gmm import GMMTester -from sklearn.externals.six.moves import cStringIO as StringIO -from sklearn.mixture.dpgmm import digamma, gammaln -from sklearn.mixture.dpgmm import wishart_log_det, wishart_logz - - -np.seterr(all='warn') - - -@ignore_warnings(category=DeprecationWarning) -def test_class_weights(): - # check that the class weights are updated - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50) - dpgmm.fit(X) - # get indices of components that are used: - indices = np.unique(dpgmm.predict(X)) - active = np.zeros(10, dtype=np.bool) - active[indices] = True - # used components are important - assert_array_less(.1, dpgmm.weights_[active]) - # others are not - assert_array_less(dpgmm.weights_[~active], .05) - - -@ignore_warnings(category=DeprecationWarning) -def test_verbose_boolean(): - # checks that the output for the verbose output is the same - # for the flag values '1' and 'True' - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm_bool = Model(n_components=10, random_state=1, alpha=20, - n_iter=50, verbose=True) - dpgmm_int = Model(n_components=10, random_state=1, alpha=20, - n_iter=50, verbose=1) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - # generate output with the boolean flag - dpgmm_bool.fit(X) - verbose_output = sys.stdout - verbose_output.seek(0) - bool_output = verbose_output.readline() - # generate output with the int flag - dpgmm_int.fit(X) - verbose_output = sys.stdout - verbose_output.seek(0) - int_output = verbose_output.readline() - assert_equal(bool_output, int_output) - finally: - sys.stdout = old_stdout - - -@ignore_warnings(category=DeprecationWarning) -def test_verbose_first_level(): - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50, - verbose=1) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - dpgmm.fit(X) - finally: - sys.stdout = old_stdout - - -@ignore_warnings(category=DeprecationWarning) -def test_verbose_second_level(): - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50, - verbose=2) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - dpgmm.fit(X) - finally: - sys.stdout = old_stdout - - -@ignore_warnings(category=DeprecationWarning) -def test_digamma(): - assert_warns_message(DeprecationWarning, "The function digamma is" - " deprecated in 0.18 and will be removed in 0.20. " - "Use scipy.special.digamma instead.", digamma, 3) - - -@ignore_warnings(category=DeprecationWarning) -def test_gammaln(): - assert_warns_message(DeprecationWarning, "The function gammaln" - " is deprecated in 0.18 and will be removed" - " in 0.20. 
Use scipy.special.gammaln instead.", - gammaln, 3) - - -@ignore_warnings(category=DeprecationWarning) -def test_log_normalize(): - v = np.array([0.1, 0.8, 0.01, 0.09]) - a = np.log(2 * v) - result = assert_warns_message(DeprecationWarning, "The function " - "log_normalize is deprecated in 0.18 and" - " will be removed in 0.20.", - log_normalize, a) - assert np.allclose(v, result, rtol=0.01) - - -@ignore_warnings(category=DeprecationWarning) -def test_wishart_log_det(): - a = np.array([0.1, 0.8, 0.01, 0.09]) - b = np.array([0.2, 0.7, 0.05, 0.1]) - assert_warns_message(DeprecationWarning, "The function " - "wishart_log_det is deprecated in 0.18 and" - " will be removed in 0.20.", - wishart_log_det, a, b, 2, 4) - - -@ignore_warnings(category=DeprecationWarning) -def test_wishart_logz(): - assert_warns_message(DeprecationWarning, "The function " - "wishart_logz is deprecated in 0.18 and " - "will be removed in 0.20.", wishart_logz, - 3, np.identity(3), 1, 3) - - -@ignore_warnings(category=DeprecationWarning) -def test_DPGMM_deprecation(): - assert_warns_message( - DeprecationWarning, "The `DPGMM` class is not working correctly and " - "it's better to use `sklearn.mixture.BayesianGaussianMixture` class " - "with parameter `weight_concentration_prior_type='dirichlet_process'` " - "instead. DPGMM is deprecated in 0.18 and will be removed in 0.20.", - DPGMM) - - -def do_model(self, **kwds): - return VBGMM(verbose=False, **kwds) - - -class DPGMMTester(GMMTester): - model = DPGMM - do_test_eval = False - - def score(self, g, train_obs): - _, z = g.score_samples(train_obs) - return g.lower_bound(train_obs, z) - - -class TestDPGMMWithSphericalCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'spherical' - setUp = GMMTester._setUp - - -class TestDPGMMWithDiagCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'diag' - setUp = GMMTester._setUp - - -class TestDPGMMWithTiedCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'tied' - setUp = GMMTester._setUp - - -class TestDPGMMWithFullCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'full' - setUp = GMMTester._setUp - - -def test_VBGMM_deprecation(): - assert_warns_message( - DeprecationWarning, "The `VBGMM` class is not working correctly and " - "it's better to use `sklearn.mixture.BayesianGaussianMixture` class " - "with parameter `weight_concentration_prior_type=" - "'dirichlet_distribution'` instead. VBGMM is deprecated " - "in 0.18 and will be removed in 0.20.", VBGMM) - - -class VBGMMTester(GMMTester): - model = do_model - do_test_eval = False - - def score(self, g, train_obs): - _, z = g.score_samples(train_obs) - return g.lower_bound(train_obs, z) - - -class TestVBGMMWithSphericalCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'spherical' - setUp = GMMTester._setUp - - -class TestVBGMMWithDiagCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'diag' - setUp = GMMTester._setUp - - -class TestVBGMMWithTiedCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'tied' - setUp = GMMTester._setUp - - -class TestVBGMMWithFullCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'full' - setUp = GMMTester._setUp - - -def test_vbgmm_no_modify_alpha(): - alpha = 2. 
- n_components = 3 - X, y = make_blobs(random_state=1) - vbgmm = VBGMM(n_components=n_components, alpha=alpha, n_iter=1) - assert_equal(vbgmm.alpha, alpha) - assert_equal(vbgmm.fit(X).alpha_, float(alpha) / n_components) diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py deleted file mode 100644 index 137703adfcad4..0000000000000 --- a/sklearn/mixture/tests/test_gmm.py +++ /dev/null @@ -1,534 +0,0 @@ -# Important note for the deprecation cleaning of 0.20 : -# All the functions and classes of this file have been deprecated in 0.18. -# When you remove this file please remove the related files -# - 'sklearn/mixture/dpgmm.py' -# - 'sklearn/mixture/gmm.py' -# - 'sklearn/mixture/test_dpgmm.py' -import unittest -import copy -import sys - -import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from scipy import stats -from sklearn import mixture -from sklearn.datasets.samples_generator import make_spd_matrix -from sklearn.utils.testing import (assert_true, assert_greater, - assert_raise_message, assert_warns_message, - ignore_warnings, assert_raises) -from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.externals.six.moves import cStringIO as StringIO - - -rng = np.random.RandomState(0) - - -def test_sample_gaussian(): - # Test sample generation from mixture.sample_gaussian where covariance - # is diagonal, spherical and full - - n_features, n_samples = 2, 300 - axis = 1 - mu = rng.randint(10) * rng.rand(n_features) - cv = (rng.rand(n_features) + 1.0) ** 2 - - samples = mixture.gmm._sample_gaussian( - mu, cv, covariance_type='diag', n_samples=n_samples) - - assert_true(np.allclose(samples.mean(axis), mu, atol=1.3)) - assert_true(np.allclose(samples.var(axis), cv, atol=1.5)) - - # the same for spherical covariances - cv = (rng.rand() + 1.0) ** 2 - samples = mixture.gmm._sample_gaussian( - mu, cv, covariance_type='spherical', n_samples=n_samples) - - assert_true(np.allclose(samples.mean(axis), mu, atol=1.5)) - assert_true(np.allclose( - samples.var(axis), np.repeat(cv, n_features), atol=1.5)) - - # and for full covariances - A = rng.randn(n_features, n_features) - cv = np.dot(A.T, A) + np.eye(n_features) - samples = mixture.gmm._sample_gaussian( - mu, cv, covariance_type='full', n_samples=n_samples) - assert_true(np.allclose(samples.mean(axis), mu, atol=1.3)) - assert_true(np.allclose(np.cov(samples), cv, atol=2.5)) - - # Numerical stability check: in SciPy 0.12.0 at least, eigh may return - # tiny negative values in its second return value. 
- x = mixture.gmm._sample_gaussian( - [0, 0], [[4, 3], [1, .1]], covariance_type='full', random_state=42) - assert_true(np.isfinite(x).all()) - - -def _naive_lmvnpdf_diag(X, mu, cv): - # slow and naive implementation of lmvnpdf - ref = np.empty((len(X), len(mu))) - stds = np.sqrt(cv) - for i, (m, std) in enumerate(zip(mu, stds)): - ref[:, i] = np.log(stats.norm.pdf(X, m, std)).sum(axis=1) - return ref - - -def test_lmvnpdf_diag(): - # test a slow and naive implementation of lmvnpdf and - # compare it to the vectorized version (mixture.lmvnpdf) to test - # for correctness - n_features, n_components, n_samples = 2, 3, 10 - mu = rng.randint(10) * rng.rand(n_components, n_features) - cv = (rng.rand(n_components, n_features) + 1.0) ** 2 - X = rng.randint(10) * rng.rand(n_samples, n_features) - - ref = _naive_lmvnpdf_diag(X, mu, cv) - lpr = assert_warns_message(DeprecationWarning, "The function" - " log_multivariate_normal_density is " - "deprecated in 0.18 and will be removed in 0.20.", - mixture.log_multivariate_normal_density, - X, mu, cv, 'diag') - assert_array_almost_equal(lpr, ref) - - -def test_lmvnpdf_spherical(): - n_features, n_components, n_samples = 2, 3, 10 - - mu = rng.randint(10) * rng.rand(n_components, n_features) - spherecv = rng.rand(n_components, 1) ** 2 + 1 - X = rng.randint(10) * rng.rand(n_samples, n_features) - - cv = np.tile(spherecv, (n_features, 1)) - reference = _naive_lmvnpdf_diag(X, mu, cv) - lpr = assert_warns_message(DeprecationWarning, "The function" - " log_multivariate_normal_density is " - "deprecated in 0.18 and will be removed in 0.20.", - mixture.log_multivariate_normal_density, - X, mu, spherecv, 'spherical') - assert_array_almost_equal(lpr, reference) - -def test_lmvnpdf_full(): - n_features, n_components, n_samples = 2, 3, 10 - - mu = rng.randint(10) * rng.rand(n_components, n_features) - cv = (rng.rand(n_components, n_features) + 1.0) ** 2 - X = rng.randint(10) * rng.rand(n_samples, n_features) - - fullcv = np.array([np.diag(x) for x in cv]) - - reference = _naive_lmvnpdf_diag(X, mu, cv) - lpr = assert_warns_message(DeprecationWarning, "The function" - " log_multivariate_normal_density is " - "deprecated in 0.18 and will be removed in 0.20.", - mixture.log_multivariate_normal_density, - X, mu, fullcv, 'full') - assert_array_almost_equal(lpr, reference) - - -def test_lvmpdf_full_cv_non_positive_definite(): - n_features, n_samples = 2, 10 - rng = np.random.RandomState(0) - X = rng.randint(10) * rng.rand(n_samples, n_features) - mu = np.mean(X, 0) - cv = np.array([[[-1, 0], [0, 1]]]) - expected_message = "'covars' must be symmetric, positive-definite" - assert_raise_message(ValueError, expected_message, - mixture.log_multivariate_normal_density, - X, mu, cv, 'full') - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_GMM_attributes(): - n_components, n_features = 10, 4 - covariance_type = 'diag' - g = mixture.GMM(n_components, covariance_type, random_state=rng) - weights = rng.rand(n_components) - weights = weights / weights.sum() - means = rng.randint(-20, 20, (n_components, n_features)) - - assert_true(g.n_components == n_components) - assert_true(g.covariance_type == covariance_type) - - g.weights_ = weights - assert_array_almost_equal(g.weights_, weights) - g.means_ = means - assert_array_almost_equal(g.means_, means) - - covars = (0.1 + 2 * rng.rand(n_components, n_features)) ** 2 - g.covars_ = covars - assert_array_almost_equal(g.covars_, covars) - assert_raises(ValueError, g._set_covars, 
[]) - assert_raises(ValueError, g._set_covars, - np.zeros((n_components - 2, n_features))) - - assert_raises(ValueError, mixture.GMM, n_components=20, - covariance_type='badcovariance_type') - - -class GMMTester(): - do_test_eval = True - - def _setUp(self): - self.n_components = 10 - self.n_features = 4 - self.weights = rng.rand(self.n_components) - self.weights = self.weights / self.weights.sum() - self.means = rng.randint(-20, 20, (self.n_components, self.n_features)) - self.threshold = -0.5 - self.I = np.eye(self.n_features) - self.covars = { - 'spherical': (0.1 + 2 * rng.rand(self.n_components, - self.n_features)) ** 2, - 'tied': (make_spd_matrix(self.n_features, random_state=0) - + 5 * self.I), - 'diag': (0.1 + 2 * rng.rand(self.n_components, - self.n_features)) ** 2, - 'full': np.array([make_spd_matrix(self.n_features, random_state=0) - + 5 * self.I for x in range(self.n_components)])} - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_eval(self): - if not self.do_test_eval: - return # DPGMM does not support setting the means and - # covariances before fitting There is no way of fixing this - # due to the variational parameters being more expressive than - # covariance matrices - g = self.model(n_components=self.n_components, - covariance_type=self.covariance_type, random_state=rng) - # Make sure the means are far apart so responsibilities.argmax() - # picks the actual component used to generate the observations. - g.means_ = 20 * self.means - g.covars_ = self.covars[self.covariance_type] - g.weights_ = self.weights - - gaussidx = np.repeat(np.arange(self.n_components), 5) - n_samples = len(gaussidx) - X = rng.randn(n_samples, self.n_features) + g.means_[gaussidx] - - with ignore_warnings(category=DeprecationWarning): - ll, responsibilities = g.score_samples(X) - - self.assertEqual(len(ll), n_samples) - self.assertEqual(responsibilities.shape, - (n_samples, self.n_components)) - assert_array_almost_equal(responsibilities.sum(axis=1), - np.ones(n_samples)) - assert_array_equal(responsibilities.argmax(axis=1), gaussidx) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_sample(self, n=100): - g = self.model(n_components=self.n_components, - covariance_type=self.covariance_type, - random_state=rng) - # Make sure the means are far apart so responsibilities.argmax() - # picks the actual component used to generate the observations. - g.means_ = 20 * self.means - g.covars_ = np.maximum(self.covars[self.covariance_type], 0.1) - g.weights_ = self.weights - - with ignore_warnings(category=DeprecationWarning): - samples = g.sample(n) - self.assertEqual(samples.shape, (n, self.n_features)) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_train(self, params='wmc'): - g = mixture.GMM(n_components=self.n_components, - covariance_type=self.covariance_type) - with ignore_warnings(category=DeprecationWarning): - g.weights_ = self.weights - g.means_ = self.means - g.covars_ = 20 * self.covars[self.covariance_type] - - # Create a training set by sampling from the predefined distribution. 
- with ignore_warnings(category=DeprecationWarning): - X = g.sample(n_samples=100) - g = self.model(n_components=self.n_components, - covariance_type=self.covariance_type, - random_state=rng, min_covar=1e-1, - n_iter=1, init_params=params) - g.fit(X) - - # Do one training iteration at a time so we can keep track of - # the log likelihood to make sure that it increases after each - # iteration. - trainll = [] - with ignore_warnings(category=DeprecationWarning): - for _ in range(5): - g.params = params - g.init_params = '' - g.fit(X) - trainll.append(self.score(g, X)) - g.n_iter = 10 - g.init_params = '' - g.params = params - g.fit(X) # finish fitting - - # Note that the log likelihood will sometimes decrease by a - # very small amount after it has more or less converged due to - # the addition of min_covar to the covariance (to prevent - # underflow). This is why the threshold is set to -0.5 - # instead of 0. - with ignore_warnings(category=DeprecationWarning): - delta_min = np.diff(trainll).min() - self.assertTrue( - delta_min > self.threshold, - "The min nll increase is %f which is lower than the admissible" - " threshold of %f, for model %s. The likelihoods are %s." - % (delta_min, self.threshold, self.covariance_type, trainll)) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_train_degenerate(self, params='wmc'): - # Train on degenerate data with 0 in some dimensions - # Create a training set by sampling from the predefined - # distribution. - X = rng.randn(100, self.n_features) - X.T[1:] = 0 - g = self.model(n_components=2, - covariance_type=self.covariance_type, - random_state=rng, min_covar=1e-3, n_iter=5, - init_params=params) - with ignore_warnings(category=DeprecationWarning): - g.fit(X) - trainll = g.score(X) - self.assertTrue(np.sum(np.abs(trainll / 100 / X.shape[1])) < 5) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_train_1d(self, params='wmc'): - # Train on 1-D data - # Create a training set by sampling from the predefined - # distribution. 
- X = rng.randn(100, 1) - # X.T[1:] = 0 - g = self.model(n_components=2, - covariance_type=self.covariance_type, - random_state=rng, min_covar=1e-7, n_iter=5, - init_params=params) - with ignore_warnings(category=DeprecationWarning): - g.fit(X) - trainll = g.score(X) - if isinstance(g, mixture.dpgmm._DPGMMBase): - self.assertTrue(np.sum(np.abs(trainll / 100)) < 5) - else: - self.assertTrue(np.sum(np.abs(trainll / 100)) < 2) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def score(self, g, X): - with ignore_warnings(category=DeprecationWarning): - return g.score(X).sum() - - -class TestGMMWithSphericalCovars(unittest.TestCase, GMMTester): - covariance_type = 'spherical' - model = mixture.GMM - setUp = GMMTester._setUp - - -class TestGMMWithDiagonalCovars(unittest.TestCase, GMMTester): - covariance_type = 'diag' - model = mixture.GMM - setUp = GMMTester._setUp - - -class TestGMMWithTiedCovars(unittest.TestCase, GMMTester): - covariance_type = 'tied' - model = mixture.GMM - setUp = GMMTester._setUp - - -class TestGMMWithFullCovars(unittest.TestCase, GMMTester): - covariance_type = 'full' - model = mixture.GMM - setUp = GMMTester._setUp - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_multiple_init(): - # Test that multiple inits does not much worse than a single one - X = rng.randn(30, 5) - X[:10] += 2 - g = mixture.GMM(n_components=2, covariance_type='spherical', - random_state=rng, min_covar=1e-7, n_iter=5) - with ignore_warnings(category=DeprecationWarning): - train1 = g.fit(X).score(X).sum() - g.n_init = 5 - train2 = g.fit(X).score(X).sum() - assert_true(train2 >= train1 - 1.e-2) - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_n_parameters(): - n_samples, n_dim, n_components = 7, 5, 2 - X = rng.randn(n_samples, n_dim) - n_params = {'spherical': 13, 'diag': 21, 'tied': 26, 'full': 41} - for cv_type in ['full', 'tied', 'diag', 'spherical']: - with ignore_warnings(category=DeprecationWarning): - g = mixture.GMM(n_components=n_components, covariance_type=cv_type, - random_state=rng, min_covar=1e-7, n_iter=1) - g.fit(X) - assert_true(g._n_parameters() == n_params[cv_type]) - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_1d_1component(): - # Test all of the covariance_types return the same BIC score for - # 1-dimensional, 1 component fits. 
- n_samples, n_dim, n_components = 100, 1, 1 - X = rng.randn(n_samples, n_dim) - g_full = mixture.GMM(n_components=n_components, covariance_type='full', - random_state=rng, min_covar=1e-7, n_iter=1) - with ignore_warnings(category=DeprecationWarning): - g_full.fit(X) - g_full_bic = g_full.bic(X) - for cv_type in ['tied', 'diag', 'spherical']: - g = mixture.GMM(n_components=n_components, covariance_type=cv_type, - random_state=rng, min_covar=1e-7, n_iter=1) - g.fit(X) - assert_array_almost_equal(g.bic(X), g_full_bic) - - -def assert_fit_predict_correct(model, X): - model2 = copy.deepcopy(model) - - predictions_1 = model.fit(X).predict(X) - predictions_2 = model2.fit_predict(X) - - assert adjusted_rand_score(predictions_1, predictions_2) == 1.0 - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_fit_predict(): - """ - test that gmm.fit_predict is equivalent to gmm.fit + gmm.predict - """ - lrng = np.random.RandomState(101) - - n_samples, n_dim, n_comps = 100, 2, 2 - mu = np.array([[8, 8]]) - component_0 = lrng.randn(n_samples, n_dim) - component_1 = lrng.randn(n_samples, n_dim) + mu - X = np.vstack((component_0, component_1)) - - for m_constructor in (mixture.GMM, mixture.VBGMM, mixture.DPGMM): - model = m_constructor(n_components=n_comps, covariance_type='full', - min_covar=1e-7, n_iter=5, - random_state=np.random.RandomState(0)) - assert_fit_predict_correct(model, X) - - model = mixture.GMM(n_components=n_comps, n_iter=0) - z = model.fit_predict(X) - assert np.all(z == 0), "Quick Initialization Failed!" - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_aic(): - # Test the aic and bic criteria - n_samples, n_dim, n_components = 50, 3, 2 - X = rng.randn(n_samples, n_dim) - SGH = 0.5 * (X.var() + np.log(2 * np.pi)) # standard gaussian entropy - - for cv_type in ['full', 'tied', 'diag', 'spherical']: - g = mixture.GMM(n_components=n_components, covariance_type=cv_type, - random_state=rng, min_covar=1e-7) - g.fit(X) - aic = 2 * n_samples * SGH * n_dim + 2 * g._n_parameters() - bic = (2 * n_samples * SGH * n_dim + - np.log(n_samples) * g._n_parameters()) - bound = n_dim * 3. / np.sqrt(n_samples) - assert_true(np.abs(g.aic(X) - aic) / n_samples < bound) - assert_true(np.abs(g.bic(X) - bic) / n_samples < bound) - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def check_positive_definite_covars(covariance_type): - r"""Test that covariance matrices do not become non positive definite - - Due to the accumulation of round-off errors, the computation of the - covariance matrices during the learning phase could lead to non-positive - definite covariance matrices. Namely the use of the formula: - - .. math:: C = (\sum_i w_i x_i x_i^T) - \mu \mu^T - - instead of: - - .. math:: C = \sum_i w_i (x_i - \mu)(x_i - \mu)^T - - while mathematically equivalent, was observed a ``LinAlgError`` exception, - when computing a ``GMM`` with full covariance matrices and fixed mean. - - This function ensures that some later optimization will not introduce the - problem again. - """ - rng = np.random.RandomState(1) - # we build a dataset with 2 2d component. The components are unbalanced - # (respective weights 0.9 and 0.1) - X = rng.randn(100, 2) - X[-10:] += (3, 3) # Shift the 10 last points - - gmm = mixture.GMM(2, params="wc", covariance_type=covariance_type, - min_covar=1e-3) - - # This is a non-regression test for issue #2640. 
The following call used - # to trigger: - # numpy.linalg.linalg.LinAlgError: 2-th leading minor not positive definite - gmm.fit(X) - - if covariance_type == "diag" or covariance_type == "spherical": - assert_greater(gmm.covars_.min(), 0) - else: - if covariance_type == "tied": - covs = [gmm.covars_] - else: - covs = gmm.covars_ - - for c in covs: - assert_greater(np.linalg.det(c), 0) - - -def test_positive_definite_covars(): - # Check positive definiteness for all covariance types - for covariance_type in ["full", "tied", "diag", "spherical"]: - yield check_positive_definite_covars, covariance_type - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_verbose_first_level(): - # Create sample data - X = rng.randn(30, 5) - X[:10] += 2 - g = mixture.GMM(n_components=2, n_init=2, verbose=1) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - g.fit(X) - finally: - sys.stdout = old_stdout - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_verbose_second_level(): - # Create sample data - X = rng.randn(30, 5) - X[:10] += 2 - g = mixture.GMM(n_components=2, n_init=2, verbose=2) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - g.fit(X) - finally: - sys.stdout = old_stdout From 7d4b2c11583e6bf2594d1cc1c445d66da7335d6f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 8 Sep 2017 12:30:36 -0400 Subject: [PATCH 10/14] more cleanup of deprecated scorers --- sklearn/metrics/scorer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 05231826a8998..ebb6c7ca25ffe 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -225,18 +225,13 @@ def get_scorer(scoring): scorer : callable The scorer. """ - valid = True if isinstance(scoring, six.string_types): try: scorer = SCORERS[scoring] except KeyError: - scorers = [scorer for scorer in SCORERS - if SCORERS[scorer]._deprecation_msg is None] - valid = False # Don't raise here to make the error message elegant - if not valid: raise ValueError('%r is not a valid scoring value. 
' 'Valid options are %s' - % (scoring, sorted(scorers))) + % (scoring, sorted(SCORERS.keys()))) else: scorer = scoring return scorer @@ -513,11 +508,6 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, # Score function for probabilistic classification neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True) -deprecation_msg = ('Scoring method log_loss was renamed to ' - 'neg_log_loss in version 0.18 and will be removed in 0.20.') -log_loss_scorer = make_scorer(log_loss, greater_is_better=False, - needs_proba=True) -log_loss_scorer._deprecation_msg = deprecation_msg brier_score_loss_scorer = make_scorer(brier_score_loss, greater_is_better=False, needs_proba=True) @@ -546,7 +536,6 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, balanced_accuracy=balanced_accuracy_scorer, average_precision=average_precision_scorer, - log_loss=log_loss_scorer, neg_log_loss=neg_log_loss_scorer, brier_score_loss=brier_score_loss_scorer, # Cluster metrics that use supervised evaluation From 2ffa7bdad5b18ea5d516e305fcef57738c215e4a Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Fri, 10 Nov 2017 16:09:28 +0100 Subject: [PATCH 11/14] More in scoring --- sklearn/metrics/tests/test_score_objects.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 6af6418635d59..836cdc0f934f8 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -499,25 +499,6 @@ def test_scorer_memmap_input(): yield check_scorer_memmap, name -def test_deprecated_names(): - X, y = make_blobs(random_state=0, centers=2) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - clf = LogisticRegression(random_state=0) - clf.fit(X_train, y_train) - - for name in ('mean_absolute_error', 'mean_squared_error', - 'median_absolute_error', 'log_loss'): - warning_msg = "Scoring method %s was renamed to" % name - for scorer in (get_scorer(name), SCORERS[name]): - assert_warns_message(DeprecationWarning, - warning_msg, - scorer, clf, X, y) - - assert_warns_message(DeprecationWarning, - warning_msg, - cross_val_score, clf, X, y, scoring=name) - - def test_scoring_is_not_metric(): assert_raises_regexp(ValueError, 'make_scorer', check_scoring, LogisticRegression(), f1_score) From 0bf414616b133155f2cad12c063761eec75e0c5c Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Fri, 10 Nov 2017 16:22:53 +0100 Subject: [PATCH 12/14] Remove `hamming_loss` deprecated parameter `classes` --- sklearn/metrics/classification.py | 14 +------------- sklearn/metrics/tests/test_classification.py | 1 - 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 7d8b887c66624..c14c8ffe855af 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -1528,8 +1528,7 @@ class 2 1.00 0.67 0.80 3 return report -def hamming_loss(y_true, y_pred, labels=None, sample_weight=None, - classes=None): +def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): """Compute the average Hamming loss. The Hamming loss is the fraction of labels that are incorrectly predicted. @@ -1555,13 +1554,6 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None, .. versionadded:: 0.18 - classes : array, shape = [n_labels], optional - Integer array of labels. - - .. 
From b36341e23c0b515773f5027edef6f18ac80c61d9 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 16:41:01 +0100
Subject: [PATCH 13/14] splitter classes (issue:6660)

Fix minor stuff
---
 examples/model_selection/plot_nested_cross_validation_iris.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py
index 917746c359d4b..b40dc91fc4d8f 100644
--- a/examples/model_selection/plot_nested_cross_validation_iris.py
+++ b/examples/model_selection/plot_nested_cross_validation_iris.py
@@ -75,7 +75,7 @@

     # Choose cross-validation techniques for the inner and outer loops,
     # independently of the dataset.
-    # E.g "LabelKFold", "LeaveOneOut", "LeaveOneLabelOut", etc.
+    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
     inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
     outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
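
The renamed splitters behave like their pre-0.18 ``Label*`` counterparts,
with the grouping array passed as ``groups``. A hedged sketch of the
group-aware variants mentioned in the updated comment (the data and group ids
below are made up for illustration)::

    import numpy as np
    from sklearn.model_selection import GroupKFold, LeaveOneGroupOut

    rng = np.random.RandomState(0)
    X = rng.randn(12, 3)
    y = np.array([0, 1] * 6)
    groups = np.repeat([0, 1, 2, 3], 3)  # hypothetical ids, e.g. one per subject

    # GroupKFold (formerly LabelKFold) keeps each group within a single fold.
    for train_idx, test_idx in GroupKFold(n_splits=3).split(X, y, groups=groups):
        assert not set(groups[train_idx]) & set(groups[test_idx])

    # LeaveOneGroupOut (formerly LeaveOneLabelOut) holds out one group per split.
    for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups=groups):
        assert len(set(groups[test_idx])) == 1
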
From 4b7aa69655a3ab24398e9ded0c2daf880b49215f Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 17:45:42 +0100
Subject: [PATCH 14/14] Fix doctest expected output

---
 doc/modules/model_evaluation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 4a19e27e9c11c..a122728e825a6 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -104,7 +104,7 @@ Usage examples:
     >>> model = svm.SVC()
     >>> cross_val_score(model, X, y, scoring='wrong_choice')
     Traceback (most recent call last):
-    ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
+    ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']

 .. note::