From eb4d1796bfd6b7cccd28d134cb66d391e50861d8 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Thu, 9 Nov 2017 13:20:58 +0100
Subject: [PATCH 01/14] Remove deprecated (0.18) cross_validation.py in favor of model_selection

---
 sklearn/__init__.py                    |   18 +-
 sklearn/cross_validation.py            | 2075 ------------------------
 sklearn/tests/test_cross_validation.py | 1252 --------------
 3 files changed, 9 insertions(+), 3336 deletions(-)
 delete mode 100644 sklearn/cross_validation.py
 delete mode 100644 sklearn/tests/test_cross_validation.py

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index c45728106ad53..27879e16be363 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -135,15 +135,15 @@ def config_context(**new_config):
 __check_build  # avoid flakes unused variable error
 
 __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
-           'cross_validation', 'datasets', 'decomposition', 'dummy',
-           'ensemble', 'exceptions', 'externals', 'feature_extraction',
-           'feature_selection', 'gaussian_process', 'grid_search',
-           'isotonic', 'kernel_approximation', 'kernel_ridge',
-           'learning_curve', 'linear_model', 'manifold', 'metrics',
-           'mixture', 'model_selection', 'multiclass', 'multioutput',
-           'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
-           'preprocessing', 'random_projection', 'semi_supervised',
-           'svm', 'tree', 'discriminant_analysis',
+           'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions',
+           'externals', 'feature_extraction', 'feature_selection',
+           'gaussian_process', 'grid_search', 'isotonic',
+           'kernel_approximation', 'kernel_ridge', 'learning_curve',
+           'linear_model', 'manifold', 'metrics', 'mixture',
+           'model_selection', 'multiclass', 'multioutput', 'naive_bayes',
+           'neighbors', 'neural_network', 'pipeline', 'preprocessing',
+           'random_projection', 'semi_supervised', 'svm', 'tree',
+           'discriminant_analysis',
            # Non-modules:
            'clone']
 
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
deleted file mode 100644
index 7646459da3936..0000000000000
--- a/sklearn/cross_validation.py
+++ /dev/null
@@ -1,2075 +0,0 @@
-"""
-The :mod:`sklearn.cross_validation` module includes utilities for cross-
-validation and performance evaluation.
-"""
-
-# Author: Alexandre Gramfort ,
-#         Gael Varoquaux ,
-#         Olivier Grisel
-# License: BSD 3 clause
-
-from __future__ import print_function
-from __future__ import division
-
-import warnings
-from itertools import chain, combinations
-from math import ceil, floor, factorial
-import numbers
-import time
-from abc import ABCMeta, abstractmethod
-
-import numpy as np
-import scipy.sparse as sp
-
-from .base import is_classifier, clone
-from .utils import indexable, check_random_state, safe_indexing
-from .utils.validation import (_is_arraylike, _num_samples,
-                               column_or_1d)
-from .utils.multiclass import type_of_target
-from .externals.joblib import Parallel, delayed, logger
-from .externals.six import with_metaclass
-from .externals.six.moves import zip
-from .metrics.scorer import check_scoring
-from .gaussian_process.kernels import Kernel as GPKernel
-from .exceptions import FitFailedWarning
-
-
-warnings.warn("This module was deprecated in version 0.18 in favor of the "
-              "model_selection module into which all the refactored classes "
-              "and functions are moved. Also note that the interface of the "
-              "new CV iterators are different from that of this module. "
-              "This module will be removed in 0.20.", DeprecationWarning)
-
-
-__all__ = ['KFold',
-           'LabelKFold',
-           'LeaveOneLabelOut',
-           'LeaveOneOut',
-           'LeavePLabelOut',
-           'LeavePOut',
-           'ShuffleSplit',
-           'StratifiedKFold',
-           'StratifiedShuffleSplit',
-           'PredefinedSplit',
-           'LabelShuffleSplit',
-           'check_cv',
-           'cross_val_score',
-           'cross_val_predict',
-           'permutation_test_score',
-           'train_test_split']
-
-
-class _PartitionIterator(with_metaclass(ABCMeta)):
-    """Base class for CV iterators where train_mask = ~test_mask
-
-    Implementations must define `_iter_test_masks` or `_iter_test_indices`.
-
-    Parameters
-    ----------
-    n : int
-        Total number of elements in dataset.
-    """
-
-    def __init__(self, n):
-        if abs(n - int(n)) >= np.finfo('f').eps:
-            raise ValueError("n must be an integer")
-        self.n = int(n)
-
-    def __iter__(self):
-        ind = np.arange(self.n)
-        for test_index in self._iter_test_masks():
-            train_index = np.logical_not(test_index)
-            train_index = ind[train_index]
-            test_index = ind[test_index]
-            yield train_index, test_index
-
-    # Since subclasses must implement either _iter_test_masks or
-    # _iter_test_indices, neither can be abstract.
-    def _iter_test_masks(self):
-        """Generates boolean masks corresponding to test sets.
-
-        By default, delegates to _iter_test_indices()
-        """
-        for test_index in self._iter_test_indices():
-            test_mask = self._empty_mask()
-            test_mask[test_index] = True
-            yield test_mask
-
-    def _iter_test_indices(self):
-        """Generates integer indices corresponding to test sets."""
-        raise NotImplementedError
-
-    def _empty_mask(self):
-        return np.zeros(self.n, dtype=np.bool)
-
-
-class LeaveOneOut(_PartitionIterator):
-    """Leave-One-Out cross validation iterator.
-
-    .. deprecated:: 0.18
-        This module will be removed in 0.20.
-        Use :class:`sklearn.model_selection.LeaveOneOut` instead.
-
-    Provides train/test indices to split data in train test sets. Each
-    sample is used once as a test set (singleton) while the remaining
-    samples form the training set.
-
-    Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and
-    ``LeavePOut(n, p=1)``.
-
-    Due to the high number of test sets (which is the same as the
-    number of samples) this cross validation method can be very costly.
-    For large datasets one should favor KFold, StratifiedKFold or
-    ShuffleSplit.
-
-    Read more in the :ref:`User Guide `.
-
-    Parameters
-    ----------
-    n : int
-        Total number of elements in dataset.
-
-    Examples
-    --------
-    >>> from sklearn import cross_validation
-    >>> X = np.array([[1, 2], [3, 4]])
-    >>> y = np.array([1, 2])
-    >>> loo = cross_validation.LeaveOneOut(2)
-    >>> len(loo)
-    2
-    >>> print(loo)
-    sklearn.cross_validation.LeaveOneOut(n=2)
-    >>> for train_index, test_index in loo:
-    ...    print("TRAIN:", train_index, "TEST:", test_index)
-    ...    X_train, X_test = X[train_index], X[test_index]
-    ...    y_train, y_test = y[train_index], y[test_index]
-    ...    print(X_train, X_test, y_train, y_test)
-    TRAIN: [1] TEST: [0]
-    [[3 4]] [[1 2]] [2] [1]
-    TRAIN: [0] TEST: [1]
-    [[1 2]] [[3 4]] [1] [2]
-
-    See also
-    --------
-    LeaveOneLabelOut for splitting the data according to explicit,
-    domain-specific stratification of the dataset.
-    """
-
-    def _iter_test_indices(self):
-        return range(self.n)
-
-    def __repr__(self):
-        return '%s.%s(n=%i)' % (
-            self.__class__.__module__,
-            self.__class__.__name__,
-            self.n,
-        )
-
-    def __len__(self):
-        return self.n
-
-
-class LeavePOut(_PartitionIterator):
-    """Leave-P-Out cross validation iterator
-
-    .. deprecated:: 0.18
-        This module will be removed in 0.20.
- Use :class:`sklearn.model_selection.LeavePOut` instead. - - Provides train/test indices to split data in train test sets. This results - in testing on all distinct samples of size p, while the remaining n - p - samples form the training set in each iteration. - - Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)`` - which creates non-overlapping test sets. - - Due to the high number of iterations which grows combinatorically with the - number of samples this cross validation method can be very costly. For - large datasets one should favor KFold, StratifiedKFold or ShuffleSplit. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n : int - Total number of elements in dataset. - - p : int - Size of the test sets. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 3, 4]) - >>> lpo = cross_validation.LeavePOut(4, 2) - >>> len(lpo) - 6 - >>> print(lpo) - sklearn.cross_validation.LeavePOut(n=4, p=2) - >>> for train_index, test_index in lpo: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [1 2] TEST: [0 3] - TRAIN: [0 3] TEST: [1 2] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 1] TEST: [2 3] - """ - - def __init__(self, n, p): - super(LeavePOut, self).__init__(n) - self.p = p - - def _iter_test_indices(self): - for comb in combinations(range(self.n), self.p): - yield np.array(comb) - - def __repr__(self): - return '%s.%s(n=%i, p=%i)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.p, - ) - - def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) - / factorial(self.p)) - - -class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): - """Base class to validate KFold approaches""" - - @abstractmethod - def __init__(self, n, n_folds, shuffle, random_state): - super(_BaseKFold, self).__init__(n) - - if abs(n_folds - int(n_folds)) >= np.finfo('f').eps: - raise ValueError("n_folds must be an integer") - self.n_folds = n_folds = int(n_folds) - - if n_folds <= 1: - raise ValueError( - "k-fold cross validation requires at least one" - " train / test split by setting n_folds=2 or more," - " got n_folds={0}.".format(n_folds)) - if n_folds > self.n: - raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of samples: {1}.").format(n_folds, n)) - - if not isinstance(shuffle, bool): - raise TypeError("shuffle must be True or False;" - " got {0}".format(shuffle)) - self.shuffle = shuffle - self.random_state = random_state - - -class KFold(_BaseKFold): - """K-Folds cross validation iterator. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.KFold` instead. - - Provides train/test indices to split data in train test sets. Split - dataset into k consecutive folds (without shuffling by default). - - Each fold is then used as a validation set once while the k - 1 remaining - fold(s) form the training set. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n : int - Total number of elements. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle the data before splitting into batches. 
- - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``shuffle`` == True. - - Examples - -------- - >>> from sklearn.cross_validation import KFold - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([1, 2, 3, 4]) - >>> kf = KFold(4, n_folds=2) - >>> len(kf) - 2 - >>> print(kf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.KFold(n=4, n_folds=2, shuffle=False, - random_state=None) - >>> for train_index, test_index in kf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [0 1] TEST: [2 3] - - Notes - ----- - The first n % n_folds folds have size n // n_folds + 1, other folds have - size n // n_folds. - - See also - -------- - StratifiedKFold take label information into account to avoid building - folds with imbalanced class distributions (for binary or multiclass - classification tasks). - - LabelKFold: K-fold iterator variant with non-overlapping labels. - """ - - def __init__(self, n, n_folds=3, shuffle=False, - random_state=None): - super(KFold, self).__init__(n, n_folds, shuffle, random_state) - self.idxs = np.arange(n) - if shuffle: - rng = check_random_state(self.random_state) - rng.shuffle(self.idxs) - - def _iter_test_indices(self): - n = self.n - n_folds = self.n_folds - fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int) - fold_sizes[:n % n_folds] += 1 - current = 0 - for fold_size in fold_sizes: - start, stop = current, current + fold_size - yield self.idxs[start:stop] - current = stop - - def __repr__(self): - return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class LabelKFold(_BaseKFold): - """K-fold iterator variant with non-overlapping labels. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.GroupKFold` instead. - - The same label will not appear in two different folds (the number of - distinct labels has to be at least equal to the number of folds). - - The folds are approximately balanced in the sense that the number of - distinct labels is approximately the same in each fold. - - .. versionadded:: 0.17 - - Parameters - ---------- - labels : array-like with shape (n_samples, ) - Contains a label for each sample. - The folds are built so that the same label does not appear in two - different folds. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - Examples - -------- - >>> from sklearn.cross_validation import LabelKFold - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 3, 4]) - >>> labels = np.array([0, 0, 2, 2]) - >>> label_kfold = LabelKFold(labels, n_folds=2) - >>> len(label_kfold) - 2 - >>> print(label_kfold) - sklearn.cross_validation.LabelKFold(n_labels=4, n_folds=2) - >>> for train_index, test_index in label_kfold: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - ... 
- TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [3 4] - TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [3 4] [1 2] - - See also - -------- - LeaveOneLabelOut for splitting the data according to explicit, - domain-specific stratification of the dataset. - """ - def __init__(self, labels, n_folds=3): - super(LabelKFold, self).__init__(len(labels), n_folds, - shuffle=False, random_state=None) - - unique_labels, labels = np.unique(labels, return_inverse=True) - n_labels = len(unique_labels) - - if n_folds > n_labels: - raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of labels: {1}.").format(n_folds, - n_labels)) - - # Weight labels by their number of occurrences - n_samples_per_label = np.bincount(labels) - - # Distribute the most frequent labels first - indices = np.argsort(n_samples_per_label)[::-1] - n_samples_per_label = n_samples_per_label[indices] - - # Total weight of each fold - n_samples_per_fold = np.zeros(n_folds) - - # Mapping from label index to fold index - label_to_fold = np.zeros(len(unique_labels)) - - # Distribute samples by adding the largest weight to the lightest fold - for label_index, weight in enumerate(n_samples_per_label): - lightest_fold = np.argmin(n_samples_per_fold) - n_samples_per_fold[lightest_fold] += weight - label_to_fold[indices[label_index]] = lightest_fold - - self.idxs = label_to_fold[labels] - - def _iter_test_indices(self): - for f in range(self.n_folds): - yield np.where(self.idxs == f)[0] - - def __repr__(self): - return '{0}.{1}(n_labels={2}, n_folds={3})'.format( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.n_folds, - ) - - def __len__(self): - return self.n_folds - - -class StratifiedKFold(_BaseKFold): - """Stratified K-Folds cross validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.StratifiedKFold` instead. - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a variation of KFold that - returns stratified folds. The folds are made by preserving - the percentage of samples for each class. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y : array-like, [n_samples] - Samples to split in K folds. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle each stratification of the data before splitting - into batches. - - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. Used when ``shuffle`` == True. - - Examples - -------- - >>> from sklearn.cross_validation import StratifiedKFold - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> skf = StratifiedKFold(y, n_folds=2) - >>> len(skf) - 2 - >>> print(skf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.StratifiedKFold(labels=[0 0 1 1], n_folds=2, - shuffle=False, random_state=None) - >>> for train_index, test_index in skf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [0 2] TEST: [1 3] - - Notes - ----- - All the folds have size trunc(n_samples / n_folds), the last one has the - complementary. - - See also - -------- - LabelKFold: K-fold iterator variant with non-overlapping labels. - """ - - def __init__(self, y, n_folds=3, shuffle=False, - random_state=None): - super(StratifiedKFold, self).__init__( - len(y), n_folds, shuffle, random_state) - y = np.asarray(y) - n_samples = y.shape[0] - unique_labels, y_inversed = np.unique(y, return_inverse=True) - label_counts = np.bincount(y_inversed) - min_labels = np.min(label_counts) - if np.all(self.n_folds > label_counts): - raise ValueError("All the n_labels for individual classes" - " are less than %d folds." - % (self.n_folds)) - if self.n_folds > min_labels: - warnings.warn(("The least populated class in y has only %d" - " members, which is too few. The minimum" - " number of labels for any class cannot" - " be less than n_folds=%d." - % (min_labels, self.n_folds)), Warning) - - # don't want to use the same seed in each label's shuffle - if self.shuffle: - rng = check_random_state(self.random_state) - else: - rng = self.random_state - - # pre-assign each sample to a test fold index using individual KFold - # splitting strategies for each label so as to respect the - # balance of labels - per_label_cvs = [ - KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle, - random_state=rng) for c in label_counts] - test_folds = np.zeros(n_samples, dtype=np.int) - for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)): - for label, (_, test_split) in zip(unique_labels, per_label_splits): - label_test_folds = test_folds[y == label] - # the test split can be too big because we used - # KFold(max(c, self.n_folds), self.n_folds) instead of - # KFold(c, self.n_folds) to make it possible to not crash even - # if the data is not 100% stratifiable for all the labels - # (we use a warning instead of raising an exception) - # If this is the case, let's trim it: - test_split = test_split[test_split < len(label_test_folds)] - label_test_folds[test_split] = test_fold_idx - test_folds[y == label] = label_test_folds - - self.test_folds = test_folds - self.y = y - - def _iter_test_masks(self): - for i in range(self.n_folds): - yield self.test_folds == i - - def __repr__(self): - return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.y, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class LeaveOneLabelOut(_PartitionIterator): - """Leave-One-Label_Out cross-validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.LeaveOneGroupOut` instead. - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. 
- - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 1, 2]) - >>> labels = np.array([1, 1, 2, 2]) - >>> lol = cross_validation.LeaveOneLabelOut(labels) - >>> len(lol) - 2 - >>> print(lol) - sklearn.cross_validation.LeaveOneLabelOut(labels=[1 1 2 2]) - >>> for train_index, test_index in lol: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [1 2] [1 2] - TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [1 2] - - See also - -------- - LabelKFold: K-fold iterator variant with non-overlapping labels. - """ - - def __init__(self, labels): - super(LeaveOneLabelOut, self).__init__(len(labels)) - # We make a copy of labels to avoid side-effects during iteration - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - - def _iter_test_masks(self): - for i in self.unique_labels: - yield self.labels == i - - def __repr__(self): - return '%s.%s(labels=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - ) - - def __len__(self): - return self.n_unique_labels - - -class LeavePLabelOut(_PartitionIterator): - """Leave-P-Label_Out cross-validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.LeavePGroupsOut` instead. - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - The difference between LeavePLabelOut and LeaveOneLabelOut is that - the former builds the test sets with all the samples assigned to - ``p`` different values of the labels while the latter uses samples - all assigned the same labels. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. - - p : int - Number of samples to leave out in the test split. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6]]) - >>> y = np.array([1, 2, 1]) - >>> labels = np.array([1, 2, 3]) - >>> lpl = cross_validation.LeavePLabelOut(labels, p=2) - >>> len(lpl) - 3 - >>> print(lpl) - sklearn.cross_validation.LeavePLabelOut(labels=[1 2 3], p=2) - >>> for train_index, test_index in lpl: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - TRAIN: [2] TEST: [0 1] - [[5 6]] [[1 2] - [3 4]] [1] [1 2] - TRAIN: [1] TEST: [0 2] - [[3 4]] [[1 2] - [5 6]] [2] [1 1] - TRAIN: [0] TEST: [1 2] - [[1 2]] [[3 4] - [5 6]] [1] [2 1] - - See also - -------- - LabelKFold: K-fold iterator variant with non-overlapping labels. 
- """ - - def __init__(self, labels, p): - # We make a copy of labels to avoid side-effects during iteration - super(LeavePLabelOut, self).__init__(len(labels)) - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - self.p = p - - def _iter_test_masks(self): - comb = combinations(range(self.n_unique_labels), self.p) - for idx in comb: - test_index = self._empty_mask() - idx = np.array(idx) - for l in self.unique_labels[idx]: - test_index[self.labels == l] = True - yield test_index - - def __repr__(self): - return '%s.%s(labels=%s, p=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - self.p, - ) - - def __len__(self): - return int(factorial(self.n_unique_labels) / - factorial(self.n_unique_labels - self.p) / - factorial(self.p)) - - -class BaseShuffleSplit(with_metaclass(ABCMeta)): - """Base class for ShuffleSplit and StratifiedShuffleSplit""" - - def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, - random_state=None): - self.n = n - self.n_iter = n_iter - self.test_size = test_size - self.train_size = train_size - self.random_state = random_state - self.n_train, self.n_test = _validate_shuffle_split(n, test_size, - train_size) - - def __iter__(self): - for train, test in self._iter_indices(): - yield train, test - return - - @abstractmethod - def _iter_indices(self): - """Generate (train, test) indices""" - - -class ShuffleSplit(BaseShuffleSplit): - """Random permutation cross-validation iterator. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.ShuffleSplit` instead. - - Yields indices to split data into training and test sets. - - Note: contrary to other cross-validation strategies, random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n : int - Total number of elements in the dataset. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Examples - -------- - >>> from sklearn import cross_validation - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... test_size=.25, random_state=0) - >>> len(rs) - 3 - >>> print(rs) - ... # doctest: +ELLIPSIS - ShuffleSplit(4, n_iter=3, test_size=0.25, ...) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... 
- TRAIN: [3 1 0] TEST: [2] - TRAIN: [2 1 3] TEST: [0] - TRAIN: [0 2 1] TEST: [3] - - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... train_size=0.5, test_size=.25, random_state=0) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... - TRAIN: [3 1] TEST: [2] - TRAIN: [2 1] TEST: [0] - TRAIN: [0 2] TEST: [3] - - """ - - def _iter_indices(self): - rng = check_random_state(self.random_state) - for i in range(self.n_iter): - # random partition - permutation = rng.permutation(self.n) - ind_test = permutation[:self.n_test] - ind_train = permutation[self.n_test:self.n_test + self.n_train] - yield ind_train, ind_test - - def __repr__(self): - return ('%s(%d, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.n, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - -def _validate_shuffle_split(n, test_size, train_size): - if test_size is None and train_size is None: - raise ValueError( - 'test_size and train_size can not both be None') - - if test_size is not None: - if np.asarray(test_size).dtype.kind == 'f': - if test_size >= 1.: - raise ValueError( - 'test_size=%f should be smaller ' - 'than 1.0 or be an integer' % test_size) - elif np.asarray(test_size).dtype.kind == 'i': - if test_size >= n: - raise ValueError( - 'test_size=%d should be smaller ' - 'than the number of samples %d' % (test_size, n)) - else: - raise ValueError("Invalid value for test_size: %r" % test_size) - - if train_size is not None: - if np.asarray(train_size).dtype.kind == 'f': - if train_size >= 1.: - raise ValueError("train_size=%f should be smaller " - "than 1.0 or be an integer" % train_size) - elif np.asarray(test_size).dtype.kind == 'f' and \ - train_size + test_size > 1.: - raise ValueError('The sum of test_size and train_size = %f, ' - 'should be smaller than 1.0. Reduce ' - 'test_size and/or train_size.' % - (train_size + test_size)) - elif np.asarray(train_size).dtype.kind == 'i': - if train_size >= n: - raise ValueError("train_size=%d should be smaller " - "than the number of samples %d" % - (train_size, n)) - else: - raise ValueError("Invalid value for train_size: %r" % train_size) - - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) - - if train_size is None: - n_train = n - n_test - else: - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n) - else: - n_train = float(train_size) - - if test_size is None: - n_test = n - n_train - - if n_train + n_test > n: - raise ValueError('The sum of train_size and test_size = %d, ' - 'should be smaller than the number of ' - 'samples %d. Reduce test_size and/or ' - 'train_size.' % (n_train + n_test, n)) - - return int(n_train), int(n_test) - - -def _approximate_mode(class_counts, n_draws, rng): - """Computes approximate mode of multivariate hypergeometric. - - This is an approximation to the mode of the multivariate - hypergeometric given by class_counts and n_draws. - It shouldn't be off by more than one. - - It is the mostly likely outcome of drawing n_draws many - samples from the population given by class_counts. - - Parameters - ---------- - class_counts : ndarray of int - Population per class. - n_draws : int - Number of draws (samples to draw) from the overall population. - rng : random state - Used to break ties. 
- - Returns - ------- - sampled_classes : ndarray of int - Number of samples drawn from each class. - np.sum(sampled_classes) == n_draws - """ - # this computes a bad approximation to the mode of the - # multivariate hypergeometric given by class_counts and n_draws - continuous = n_draws * class_counts / class_counts.sum() - # floored means we don't overshoot n_samples, but probably undershoot - floored = np.floor(continuous) - # we add samples according to how much "left over" probability - # they had, until we arrive at n_samples - need_to_add = int(n_draws - floored.sum()) - if need_to_add > 0: - remainder = continuous - floored - values = np.sort(np.unique(remainder))[::-1] - # add according to remainder, but break ties - # randomly to avoid biases - for value in values: - inds, = np.where(remainder == value) - # if we need_to_add less than what's in inds - # we draw randomly from them. - # if we need to add more, we add them all and - # go to the next value - add_now = min(len(inds), need_to_add) - inds = rng.choice(inds, size=add_now, replace=False) - floored[inds] += 1 - need_to_add -= add_now - if need_to_add == 0: - break - return floored.astype(np.int) - - -class StratifiedShuffleSplit(BaseShuffleSplit): - """Stratified ShuffleSplit cross validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.StratifiedShuffleSplit` instead. - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a merge of StratifiedKFold and - ShuffleSplit, which returns stratified randomized folds. The folds - are made by preserving the percentage of samples for each class. - - Note: like the ShuffleSplit strategy, stratified random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y : array, [n_samples] - Labels of samples. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Examples - -------- - >>> from sklearn.cross_validation import StratifiedShuffleSplit - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0) - >>> len(sss) - 3 - >>> print(sss) # doctest: +ELLIPSIS - StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...) - >>> for train_index, test_index in sss: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 2] TEST: [3 0] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 2] TEST: [3 1] - """ - - def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, - random_state=None): - - super(StratifiedShuffleSplit, self).__init__( - len(y), n_iter, test_size, train_size, random_state) - - self.y = np.array(y) - self.classes, self.y_indices = np.unique(y, return_inverse=True) - n_cls = self.classes.shape[0] - - if np.min(np.bincount(self.y_indices)) < 2: - raise ValueError("The least populated class in y has only 1" - " member, which is too few. The minimum" - " number of labels for any class cannot" - " be less than 2.") - - if self.n_train < n_cls: - raise ValueError('The train_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_train, n_cls)) - if self.n_test < n_cls: - raise ValueError('The test_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_test, n_cls)) - - def _iter_indices(self): - rng = check_random_state(self.random_state) - cls_count = np.bincount(self.y_indices) - - for n in range(self.n_iter): - # if there are ties in the class-counts, we want - # to make sure to break them anew in each iteration - n_i = _approximate_mode(cls_count, self.n_train, rng) - class_counts_remaining = cls_count - n_i - t_i = _approximate_mode(class_counts_remaining, self.n_test, rng) - - train = [] - test = [] - - for i, _ in enumerate(self.classes): - permutation = rng.permutation(cls_count[i]) - perm_indices_class_i = np.where( - (i == self.y_indices))[0][permutation] - - train.extend(perm_indices_class_i[:n_i[i]]) - test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]) - train = rng.permutation(train) - test = rng.permutation(test) - - yield train, test - - def __repr__(self): - return ('%s(labels=%s, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.y, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - -class PredefinedSplit(_PartitionIterator): - """Predefined split cross validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.PredefinedSplit` instead. - - Splits the data into training/test set folds according to a predefined - scheme. Each sample can be assigned to at most one test set fold, as - specified by the user through the ``test_fold`` parameter. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - test_fold : "array-like, shape (n_samples,) - test_fold[i] gives the test set fold of sample i. A value of -1 - indicates that the corresponding sample is not part of any test set - folds, but will instead always be put into the training fold. - - Examples - -------- - >>> from sklearn.cross_validation import PredefinedSplit - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> ps = PredefinedSplit(test_fold=[0, 1, -1, 1]) - >>> len(ps) - 2 - >>> print(ps) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - sklearn.cross_validation.PredefinedSplit(test_fold=[ 0 1 -1 1]) - >>> for train_index, test_index in ps: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 2 3] TEST: [0] - TRAIN: [0 2] TEST: [1 3] - """ - - def __init__(self, test_fold): - super(PredefinedSplit, self).__init__(len(test_fold)) - self.test_fold = np.array(test_fold, dtype=np.int) - self.test_fold = column_or_1d(self.test_fold) - self.unique_folds = np.unique(self.test_fold) - self.unique_folds = self.unique_folds[self.unique_folds != -1] - - def _iter_test_indices(self): - for f in self.unique_folds: - yield np.where(self.test_fold == f)[0] - - def __repr__(self): - return '%s.%s(test_fold=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.test_fold) - - def __len__(self): - return len(self.unique_folds) - - -class LabelShuffleSplit(ShuffleSplit): - """Shuffle-Labels-Out cross-validation iterator - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.GroupShuffleSplit` instead. - - Provides randomized train/test indices to split data according to a - third-party provided label. This label information can be used to encode - arbitrary domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - The difference between LeavePLabelOut and LabelShuffleSplit is that - the former generates splits using all subsets of size ``p`` unique labels, - whereas LabelShuffleSplit generates a user-determined number of random - test splits, each with a user-determined fraction of unique labels. - - For example, a less computationally intensive alternative to - ``LeavePLabelOut(labels, p=10)`` would be - ``LabelShuffleSplit(labels, test_size=10, n_iter=100)``. - - Note: The parameters ``test_size`` and ``train_size`` refer to labels, and - not to samples, as in ShuffleSplit. - - .. versionadded:: 0.17 - - Parameters - ---------- - labels : array, [n_samples] - Labels of samples - - n_iter : int (default 5) - Number of re-shuffling and splitting iterations. - - test_size : float (default 0.2), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the labels to include in the test split. If - int, represents the absolute number of test labels. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the labels to include in the train split. If - int, represents the absolute number of train labels. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
- - """ - def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None, - random_state=None): - - classes, label_indices = np.unique(labels, return_inverse=True) - - super(LabelShuffleSplit, self).__init__( - len(classes), - n_iter=n_iter, - test_size=test_size, - train_size=train_size, - random_state=random_state) - - self.labels = labels - self.classes = classes - self.label_indices = label_indices - - def __repr__(self): - return ('%s(labels=%s, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.labels, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - def _iter_indices(self): - for label_train, label_test in super(LabelShuffleSplit, - self)._iter_indices(): - # these are the indices of classes in the partition - # invert them into data indices - - train = np.flatnonzero(np.in1d(self.label_indices, label_train)) - test = np.flatnonzero(np.in1d(self.label_indices, label_test)) - - yield train, test - - -############################################################################## -def _index_param_value(X, v, indices): - """Private helper function for parameter value indexing.""" - if not _is_arraylike(v) or _num_samples(v) != _num_samples(X): - # pass through: skip indexing - return v - if sp.issparse(v): - v = v.tocsr() - return safe_indexing(v, indices) - - -def cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1, - verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): - """Generate cross-validated estimates for each input data point - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.cross_val_predict` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' and 'predict' - The object to use to fit the data. - - X : array-like - The data to fit. Can be, for example a list, or an array at least 2d. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - verbose : integer, optional - The verbosity level. - - fit_params : dict, optional - Parameters to pass to the fit method of the estimator. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - Returns - ------- - preds : ndarray - This is the result of calling 'predict' - - Examples - -------- - >>> from sklearn import datasets, linear_model - >>> from sklearn.cross_validation import cross_val_predict - >>> diabetes = datasets.load_diabetes() - >>> X = diabetes.data[:150] - >>> y = diabetes.target[:150] - >>> lasso = linear_model.Lasso() - >>> y_pred = cross_val_predict(lasso, X, y) - """ - X, y = indexable(X, y) - - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - parallel = Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch) - preds_blocks = parallel(delayed(_fit_and_predict)(clone(estimator), X, y, - train, test, verbose, - fit_params) - for train, test in cv) - - preds = [p for p, _ in preds_blocks] - locs = np.concatenate([loc for _, loc in preds_blocks]) - if not _check_is_partition(locs, _num_samples(X)): - raise ValueError('cross_val_predict only works for partitions') - inv_locs = np.empty(len(locs), dtype=int) - inv_locs[locs] = np.arange(len(locs)) - - # Check for sparse predictions - if sp.issparse(preds[0]): - preds = sp.vstack(preds, format=preds[0].format) - else: - preds = np.concatenate(preds) - return preds[inv_locs] - - -def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params): - """Fit estimator and predict values for a given dataset split. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' and 'predict' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - train : array-like, shape (n_train_samples,) - Indices of training samples. - - test : array-like, shape (n_test_samples,) - Indices of test samples. - - verbose : integer - The verbosity level. - - fit_params : dict or None - Parameters that will be passed to ``estimator.fit``. 
- - Returns - ------- - preds : sequence - Result of calling 'estimator.predict' - - test : array-like - This is the value of the test parameter - """ - # Adjust length of sample weights - fit_params = fit_params if fit_params is not None else {} - fit_params = dict([(k, _index_param_value(X, v, train)) - for k, v in fit_params.items()]) - - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, _ = _safe_split(estimator, X, y, test, train) - - if y_train is None: - estimator.fit(X_train, **fit_params) - else: - estimator.fit(X_train, y_train, **fit_params) - preds = estimator.predict(X_test) - return preds, test - - -def _check_is_partition(locs, n): - """Check whether locs is a reordering of the array np.arange(n) - - Parameters - ---------- - locs : ndarray - integer array to test - n : int - number of expected elements - - Returns - ------- - is_partition : bool - True iff sorted(locs) is range(n) - """ - if len(locs) != n: - return False - hit = np.zeros(n, bool) - hit[locs] = True - if not np.all(hit): - return False - return True - - -def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, - verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): - """Evaluate a score by cross-validation - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.cross_val_score` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like - The data to fit. Can be, for example a list, or an array at least 2d. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - verbose : integer, optional - The verbosity level. - - fit_params : dict, optional - Parameters to pass to the fit method of the estimator. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - Returns - ------- - scores : array of float, shape=(len(list(cv)),) - Array of scores of the estimator for each run of the cross validation. - - Examples - -------- - >>> from sklearn import datasets, linear_model - >>> from sklearn.cross_validation import cross_val_score - >>> diabetes = datasets.load_diabetes() - >>> X = diabetes.data[:150] - >>> y = diabetes.target[:150] - >>> lasso = linear_model.Lasso() - >>> print(cross_val_score(lasso, X, y)) # doctest: +ELLIPSIS - [ 0.33150734 0.08022311 0.03531764] - - See Also - --------- - :func:`sklearn.metrics.make_scorer`: - Make a scorer from a performance metric or loss function. - - """ - X, y = indexable(X, y) - - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - parallel = Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch) - scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, - train, test, verbose, None, - fit_params) - for train, test in cv) - return np.array(scores)[:, 0] - - -def _fit_and_score(estimator, X, y, scorer, train, test, verbose, - parameters, fit_params, return_train_score=False, - return_parameters=False, error_score='raise'): - """Fit estimator and compute scores for a given dataset split. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scorer : callable - A scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - train : array-like, shape (n_train_samples,) - Indices of training samples. - - test : array-like, shape (n_test_samples,) - Indices of test samples. - - verbose : integer - The verbosity level. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - parameters : dict or None - Parameters to be set on the estimator. - - fit_params : dict or None - Parameters that will be passed to ``estimator.fit``. - - return_train_score : boolean, optional, default: False - Compute and return score on training set. - - return_parameters : boolean, optional, default: False - Return parameters that has been used for the estimator. - - Returns - ------- - train_score : float, optional - Score on training set, returned only if `return_train_score` is `True`. - - test_score : float - Score on test set. - - n_test_samples : int - Number of test samples. - - scoring_time : float - Time spent for fitting and scoring in seconds. - - parameters : dict or None, optional - The parameters that have been evaluated. 
- """ - if verbose > 1: - if parameters is None: - msg = '' - else: - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) - - # Adjust length of sample weights - fit_params = fit_params if fit_params is not None else {} - fit_params = dict([(k, _index_param_value(X, v, train)) - for k, v in fit_params.items()]) - - if parameters is not None: - estimator.set_params(**parameters) - - start_time = time.time() - - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) - - try: - if y_train is None: - estimator.fit(X_train, **fit_params) - else: - estimator.fit(X_train, y_train, **fit_params) - - except Exception as e: - if error_score == 'raise': - raise - elif isinstance(error_score, numbers.Number): - test_score = error_score - if return_train_score: - train_score = error_score - warnings.warn("Classifier fit failed. The score on this train-test" - " partition for these parameters will be set to %f. " - "Details: \n%r" % (error_score, e), FitFailedWarning) - else: - raise ValueError("error_score must be the string 'raise' or a" - " numeric value. (Hint: if using 'raise', please" - " make sure that it has been spelled correctly.)" - ) - - else: - test_score = _score(estimator, X_test, y_test, scorer) - if return_train_score: - train_score = _score(estimator, X_train, y_train, scorer) - - scoring_time = time.time() - start_time - - if verbose > 2: - msg += ", score=%f" % test_score - if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) - print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - - ret = [train_score] if return_train_score else [] - ret.extend([test_score, _num_samples(X_test), scoring_time]) - if return_parameters: - ret.append(parameters) - return ret - - -def _safe_split(estimator, X, y, indices, train_indices=None): - """Create subset of dataset and properly handle kernels.""" - if hasattr(estimator, 'kernel') and callable(estimator.kernel) \ - and not isinstance(estimator.kernel, GPKernel): - # cannot compute the kernel values with custom function - raise ValueError("Cannot use a custom kernel function. " - "Precompute the kernel matrix instead.") - - if not hasattr(X, "shape"): - if getattr(estimator, "_pairwise", False): - raise ValueError("Precomputed kernels or affinity matrices have " - "to be passed as arrays or sparse matrices.") - X_subset = [X[idx] for idx in indices] - else: - if getattr(estimator, "_pairwise", False): - # X is a precomputed square kernel matrix - if X.shape[0] != X.shape[1]: - raise ValueError("X should be a square kernel matrix") - if train_indices is None: - X_subset = X[np.ix_(indices, indices)] - else: - X_subset = X[np.ix_(indices, train_indices)] - else: - X_subset = safe_indexing(X, indices) - - if y is not None: - y_subset = safe_indexing(y, indices) - else: - y_subset = None - - return X_subset, y_subset - - -def _score(estimator, X_test, y_test, scorer): - """Compute the score of an estimator on a given test set.""" - if y_test is None: - score = scorer(estimator, X_test) - else: - score = scorer(estimator, X_test, y_test) - if hasattr(score, 'item'): - try: - # e.g. unwrap memmapped scalars - score = score.item() - except ValueError: - # non-scalar? - pass - if not isinstance(score, numbers.Number): - raise ValueError("scoring must return a number, got %s (%s) instead." 
- % (str(score), type(score))) - return score - - -def _permutation_test_score(estimator, X, y, cv, scorer): - """Auxiliary function for permutation_test_score""" - avg_score = [] - for train, test in cv: - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) - estimator.fit(X_train, y_train) - avg_score.append(scorer(estimator, X_test, y_test)) - return np.mean(avg_score) - - -def _shuffle(y, labels, random_state): - """Return a shuffled copy of y eventually shuffle among same labels.""" - if labels is None: - ind = random_state.permutation(len(y)) - else: - ind = np.arange(len(labels)) - for label in np.unique(labels): - this_mask = (labels == label) - ind[this_mask] = random_state.permutation(ind[this_mask]) - return safe_indexing(y, ind) - - -def check_cv(cv, X=None, y=None, classifier=False): - """Input checker utility for building a CV in a user friendly way. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.check_cv` instead. - - Parameters - ---------- - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if classifier is True and ``y`` is binary or - multiclass, :class:`StratifiedKFold` is used. In all other cases, - :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - X : array-like - The data the cross-val object will be applied on. - - y : array-like - The target variable for a supervised learning problem. - - classifier : boolean optional - Whether the task is a classification task, in which case - stratified KFold will be used. - - Returns - ------- - checked_cv : a cross-validation generator instance. - The return value is guaranteed to be a cv generator instance, whatever - the input type. - """ - is_sparse = sp.issparse(X) - if cv is None: - cv = 3 - if isinstance(cv, numbers.Integral): - if classifier: - if type_of_target(y) in ['binary', 'multiclass']: - cv = StratifiedKFold(y, cv) - else: - cv = KFold(_num_samples(y), cv) - else: - if not is_sparse: - n_samples = len(X) - else: - n_samples = X.shape[0] - cv = KFold(n_samples, cv) - return cv - - -def permutation_test_score(estimator, X, y, cv=None, - n_permutations=100, n_jobs=1, labels=None, - random_state=0, verbose=0, scoring=None): - """Evaluate the significance of a cross-validated score with permutations - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.permutation_test_score` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. 
- Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - n_permutations : integer, optional - Number of times to permute ``y``. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - labels : array-like of shape [n_samples] (optional) - Labels constrain the permutation among groups of samples with - a same label. - - random_state : int, RandomState instance or None, optional (default=0) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - verbose : integer, optional - The verbosity level. - - Returns - ------- - score : float - The true score without permuting targets. - - permutation_scores : array, shape (n_permutations,) - The scores obtained for each permutations. - - pvalue : float - The p-value, which approximates the probability that the score would - be obtained by chance. This is calculated as: - - `(C + 1) / (n_permutations + 1)` - - Where C is the number of permutations whose score >= the true score. - - The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. - - Notes - ----- - This function implements Test 1 in: - - Ojala and Garriga. Permutation Tests for Studying Classifier - Performance. The Journal of Machine Learning Research (2010) - vol. 11 - - """ - X, y = indexable(X, y) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - random_state = check_random_state(random_state) - - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - score = _permutation_test_score(clone(estimator), X, y, cv, scorer) - permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_permutation_test_score)( - clone(estimator), X, _shuffle(y, labels, random_state), cv, - scorer) - for _ in range(n_permutations)) - permutation_scores = np.array(permutation_scores) - pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) - return score, permutation_scores, pvalue - - -permutation_test_score.__test__ = False # to avoid a pb with nosetests - - -def train_test_split(*arrays, **options): - """Split arrays or matrices into random train and test subsets - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.train_test_split` instead. - - Quick utility that wraps input validation and - ``next(iter(ShuffleSplit(n_samples)))`` and application to input - data into a single call for splitting (and optionally subsampling) - data in a oneliner. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - *arrays : sequence of indexables with same length / shape[0] - Allowed inputs are lists, numpy arrays, scipy-sparse - matrices or pandas dataframes. 
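
(Before the parameters continue: a quick numeric illustration of the p-value
rule implemented in ``permutation_test_score`` above,
``(C + 1) / (n_permutations + 1)``, using made-up scores:)

    import numpy as np

    permutation_scores = np.array([0.31, 0.35, 0.28, 0.40, 0.33])
    score = 0.93                                  # unpermuted score
    C = np.sum(permutation_scores >= score)       # 0 permutations reach it
    pvalue = (C + 1.0) / (len(permutation_scores) + 1.0)
    # pvalue == 1/6 ~ 0.167, the smallest value reachable with 5 permutations
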
- - test_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - If train size is also None, test size is set to 0.25. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - stratify : array-like or None (default is None) - If not None, data is split in a stratified fashion, using this as - the labels array. - - .. versionadded:: 0.17 - *stratify* splitting - - Returns - ------- - splitting : list, length = 2 * len(arrays), - List containing train-test split of inputs. - - .. versionadded:: 0.16 - If the input is sparse, the output will be a - ``scipy.sparse.csr_matrix``. Else, output type is the same as the - input type. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.cross_validation import train_test_split - >>> X, y = np.arange(10).reshape((5, 2)), range(5) - >>> X - array([[0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9]]) - >>> list(y) - [0, 1, 2, 3, 4] - - >>> X_train, X_test, y_train, y_test = train_test_split( - ... X, y, test_size=0.33, random_state=42) - ... 
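
(Migration note, illustrative: under scikit-learn >= 0.18 only the import
path changes; the call shown in this example is otherwise identical:)

    import numpy as np
    from sklearn.model_selection import train_test_split

    X, y = np.arange(10).reshape((5, 2)), list(range(5))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
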
- >>> X_train - array([[4, 5], - [0, 1], - [6, 7]]) - >>> y_train - [2, 0, 3] - >>> X_test - array([[2, 3], - [8, 9]]) - >>> y_test - [1, 4] - - """ - n_arrays = len(arrays) - if n_arrays == 0: - raise ValueError("At least one array required as input") - - test_size = options.pop('test_size', None) - train_size = options.pop('train_size', None) - random_state = options.pop('random_state', None) - stratify = options.pop('stratify', None) - - if options: - raise TypeError("Invalid parameters passed: %s" % str(options)) - - if test_size is None and train_size is None: - test_size = 0.25 - arrays = indexable(*arrays) - if stratify is not None: - cv = StratifiedShuffleSplit(stratify, test_size=test_size, - train_size=train_size, - random_state=random_state) - else: - n_samples = _num_samples(arrays[0]) - cv = ShuffleSplit(n_samples, test_size=test_size, - train_size=train_size, - random_state=random_state) - - train, test = next(iter(cv)) - return list(chain.from_iterable((safe_indexing(a, train), - safe_indexing(a, test)) for a in arrays)) - - -train_test_split.__test__ = False # to avoid a pb with nosetests diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py deleted file mode 100644 index 4d756bdaa0cf8..0000000000000 --- a/sklearn/tests/test_cross_validation.py +++ /dev/null @@ -1,1252 +0,0 @@ -"""Test the cross_validation module""" -from __future__ import division -import warnings - -import numpy as np -from scipy.sparse import coo_matrix -from scipy.sparse import csr_matrix -from scipy import stats - -from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.testing import assert_true -from sklearn.utils.testing import assert_false -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_greater -from sklearn.utils.testing import assert_greater_equal -from sklearn.utils.testing import assert_less -from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import assert_raise_message -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.mocking import CheckingClassifier, MockDataFrame - -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn import cross_validation as cval - -from sklearn.datasets import make_regression -from sklearn.datasets import load_boston -from sklearn.datasets import load_digits -from sklearn.datasets import load_iris -from sklearn.datasets import make_multilabel_classification -from sklearn.metrics import explained_variance_score -from sklearn.metrics import make_scorer -from sklearn.metrics import precision_score -from sklearn.externals import six -from sklearn.externals.six.moves import zip - -from sklearn.linear_model import Ridge -from sklearn.multiclass import OneVsRestClassifier -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC -from sklearn.cluster import KMeans - -from sklearn.preprocessing import Imputer -from sklearn.pipeline import Pipeline - - -class MockClassifier(object): - """Dummy classifier to test the cross-validation""" - - def __init__(self, a=0, allow_nd=False): - self.a = a - self.allow_nd = allow_nd - - def fit(self, X, Y=None, sample_weight=None, class_prior=None, - sparse_sample_weight=None, 
sparse_param=None, dummy_int=None, - dummy_str=None, dummy_obj=None, callback=None): - """The dummy arguments are to test that this fit function can - accept non-array arguments through cross-validation, such as: - - int - - str (this is actually array-like) - - object - - function - """ - self.dummy_int = dummy_int - self.dummy_str = dummy_str - self.dummy_obj = dummy_obj - if callback is not None: - callback(self) - - if self.allow_nd: - X = X.reshape(len(X), -1) - if X.ndim >= 3 and not self.allow_nd: - raise ValueError('X cannot be d') - if sample_weight is not None: - assert_true(sample_weight.shape[0] == X.shape[0], - 'MockClassifier extra fit_param sample_weight.shape[0]' - ' is {0}, should be {1}'.format(sample_weight.shape[0], - X.shape[0])) - if class_prior is not None: - assert_true(class_prior.shape[0] == len(np.unique(y)), - 'MockClassifier extra fit_param class_prior.shape[0]' - ' is {0}, should be {1}'.format(class_prior.shape[0], - len(np.unique(y)))) - if sparse_sample_weight is not None: - fmt = ('MockClassifier extra fit_param sparse_sample_weight' - '.shape[0] is {0}, should be {1}') - assert_true(sparse_sample_weight.shape[0] == X.shape[0], - fmt.format(sparse_sample_weight.shape[0], X.shape[0])) - if sparse_param is not None: - fmt = ('MockClassifier extra fit_param sparse_param.shape ' - 'is ({0}, {1}), should be ({2}, {3})') - assert_true(sparse_param.shape == P_sparse.shape, - fmt.format(sparse_param.shape[0], - sparse_param.shape[1], - P_sparse.shape[0], P_sparse.shape[1])) - return self - - def predict(self, T): - if self.allow_nd: - T = T.reshape(len(T), -1) - return T[:, 0] - - def score(self, X=None, Y=None): - return 1. / (1 + np.abs(self.a)) - - def get_params(self, deep=False): - return {'a': self.a, 'allow_nd': self.allow_nd} - -X = np.ones((10, 2)) -X_sparse = coo_matrix(X) -W_sparse = coo_matrix((np.array([1]), (np.array([1]), np.array([0]))), - shape=(10, 1)) -P_sparse = coo_matrix(np.eye(5)) - -# avoid StratifiedKFold's Warning about least populated class in y -y = np.arange(10) % 3 - -############################################################################## -# Tests - - -def check_valid_split(train, test, n_samples=None): - # Use python sets to get more informative assertion failure messages - train, test = set(train), set(test) - - # Train and test split should not overlap - assert_equal(train.intersection(test), set()) - - if n_samples is not None: - # Check that the union of train an test split cover all the indices - assert_equal(train.union(test), set(range(n_samples))) - - -def check_cv_coverage(cv, expected_n_iter=None, n_samples=None): - # Check that a all the samples appear at least once in a test fold - if expected_n_iter is not None: - assert_equal(len(cv), expected_n_iter) - else: - expected_n_iter = len(cv) - - collected_test_samples = set() - iterations = 0 - for train, test in cv: - check_valid_split(train, test, n_samples=n_samples) - iterations += 1 - collected_test_samples.update(test) - - # Check that the accumulated test samples cover the whole dataset - assert_equal(iterations, expected_n_iter) - if n_samples is not None: - assert_equal(collected_test_samples, set(range(n_samples))) - - -def test_kfold_valueerrors(): - # Check that errors are raised if there is not enough samples - assert_raises(ValueError, cval.KFold, 3, 4) - - # Check that a warning is raised if the least populated class has too few - # members. 
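
    # Aside (new API, not part of the original test): model_selection's
    # StratifiedKFold keeps this behaviour -- it warns when the smallest
    # class has fewer members than n_splits, and raises only when every
    # class is too small. Sketch, assuming scikit-learn >= 0.18:
    import numpy as np
    from sklearn.model_selection import StratifiedKFold
    y_small = np.array([3, 3, -1, -1, 3])        # class -1 has only 2 members
    new_skf = StratifiedKFold(n_splits=3)
    list(new_skf.split(np.zeros((5, 2)), y_small))  # UserWarning, 3 splits
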
- y = [3, 3, -1, -1, 3] - - cv = assert_warns_message(Warning, "The least populated class", - cval.StratifiedKFold, y, 3) - - # Check that despite the warning the folds are still computed even - # though all the classes are not necessarily represented at on each - # side of the split at each split - check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y)) - - # Check that errors are raised if all n_labels for individual - # classes are less than n_folds. - y = [3, 3, -1, -1, 2] - - assert_raises(ValueError, cval.StratifiedKFold, y, 3) - - # Error when number of folds is <= 1 - assert_raises(ValueError, cval.KFold, 2, 0) - assert_raises(ValueError, cval.KFold, 2, 1) - error_string = ("k-fold cross validation requires at least one" - " train / test split") - assert_raise_message(ValueError, error_string, - cval.StratifiedKFold, y, 0) - assert_raise_message(ValueError, error_string, - cval.StratifiedKFold, y, 1) - - # When n is not integer: - assert_raises(ValueError, cval.KFold, 2.5, 2) - - # When n_folds is not integer: - assert_raises(ValueError, cval.KFold, 5, 1.5) - assert_raises(ValueError, cval.StratifiedKFold, y, 1.5) - - -def test_kfold_indices(): - # Check all indices are returned in the test folds - kf = cval.KFold(300, 3) - check_cv_coverage(kf, expected_n_iter=3, n_samples=300) - - # Check all indices are returned in the test folds even when equal-sized - # folds are not possible - kf = cval.KFold(17, 3) - check_cv_coverage(kf, expected_n_iter=3, n_samples=17) - - -def test_kfold_no_shuffle(): - # Manually check that KFold preserves the data ordering on toy datasets - splits = iter(cval.KFold(4, 2)) - train, test = next(splits) - assert_array_equal(test, [0, 1]) - assert_array_equal(train, [2, 3]) - - train, test = next(splits) - assert_array_equal(test, [2, 3]) - assert_array_equal(train, [0, 1]) - - splits = iter(cval.KFold(5, 2)) - train, test = next(splits) - assert_array_equal(test, [0, 1, 2]) - assert_array_equal(train, [3, 4]) - - train, test = next(splits) - assert_array_equal(test, [3, 4]) - assert_array_equal(train, [0, 1, 2]) - - -def test_stratified_kfold_no_shuffle(): - # Manually check that StratifiedKFold preserves the data ordering as much - # as possible on toy datasets in order to avoid hiding sample dependencies - # when possible - splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2)) - train, test = next(splits) - assert_array_equal(test, [0, 2]) - assert_array_equal(train, [1, 3]) - - train, test = next(splits) - assert_array_equal(test, [1, 3]) - assert_array_equal(train, [0, 2]) - - splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2)) - train, test = next(splits) - assert_array_equal(test, [0, 1, 3, 4]) - assert_array_equal(train, [2, 5, 6]) - - train, test = next(splits) - assert_array_equal(test, [2, 5, 6]) - assert_array_equal(train, [0, 1, 3, 4]) - - -def test_stratified_kfold_ratios(): - # Check that stratified kfold preserves label ratios in individual splits - # Repeat with shuffling turned off and on - n_samples = 1000 - labels = np.array([4] * int(0.10 * n_samples) + - [0] * int(0.89 * n_samples) + - [1] * int(0.01 * n_samples)) - for shuffle in [False, True]: - for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle): - assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10, - 2) - assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89, - 2) - assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01, - 2) - assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2) - 
assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2) - assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2) - - -def test_kfold_balance(): - # Check that KFold returns folds with balanced sizes - for kf in [cval.KFold(i, 5) for i in range(11, 17)]: - sizes = [] - for _, test in kf: - sizes.append(len(test)) - - assert_true((np.max(sizes) - np.min(sizes)) <= 1) - assert_equal(np.sum(sizes), kf.n) - - -def test_stratifiedkfold_balance(): - # Check that KFold returns folds with balanced sizes (only when - # stratification is possible) - # Repeat with shuffling turned off and on - labels = [0] * 3 + [1] * 14 - for shuffle in [False, True]: - for skf in [cval.StratifiedKFold(labels[:i], 3, shuffle=shuffle) - for i in range(11, 17)]: - sizes = [] - for _, test in skf: - sizes.append(len(test)) - - assert_true((np.max(sizes) - np.min(sizes)) <= 1) - assert_equal(np.sum(sizes), skf.n) - - -def test_shuffle_kfold(): - # Check the indices are shuffled properly, and that all indices are - # returned in the different test folds - kf = cval.KFold(300, 3, shuffle=True, random_state=0) - ind = np.arange(300) - - all_folds = None - for train, test in kf: - assert_true(np.any(np.arange(100) != ind[test])) - assert_true(np.any(np.arange(100, 200) != ind[test])) - assert_true(np.any(np.arange(200, 300) != ind[test])) - - if all_folds is None: - all_folds = ind[test].copy() - else: - all_folds = np.concatenate((all_folds, ind[test])) - - all_folds.sort() - assert_array_equal(all_folds, ind) - - -def test_shuffle_stratifiedkfold(): - # Check that shuffling is happening when requested, and for proper - # sample coverage - labels = [0] * 20 + [1] * 20 - kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0)) - kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1)) - for (_, test0), (_, test1) in zip(kf0, kf1): - assert_true(set(test0) != set(test1)) - check_cv_coverage(kf0, expected_n_iter=5, n_samples=40) - - -def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 - # The digits samples are dependent: they are apparently grouped by authors - # although we don't have any information on the groups segment locations - # for this data. We can highlight this fact be computing k-fold cross- - # validation with and without shuffling: we observe that the shuffling case - # wrongly makes the IID assumption and is therefore too optimistic: it - # estimates a much higher accuracy (around 0.96) than the non - # shuffling variant (around 0.86). 
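
    # Aside: the same experiment expressed with the model_selection API
    # (a sketch; the magnitudes in the comments are the rough bounds the
    # original test asserts, not exact values).
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.svm import SVC
    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    plain = cross_val_score(model, X, y, cv=KFold(5)).mean()  # ~0.85-0.88
    shuffled = cross_val_score(
        model, X, y, cv=KFold(5, shuffle=True, random_state=0)).mean()  # >0.95
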
- - digits = load_digits() - X, y = digits.data[:800], digits.target[:800] - model = SVC(C=10, gamma=0.005) - n = len(y) - - cv = cval.KFold(n, 5, shuffle=False) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(0.88, mean_score) - assert_greater(mean_score, 0.85) - - # Shuffling the data artificially breaks the dependency and hides the - # overfitting of the model with regards to the writing style of the authors - # by yielding a seriously overestimated score: - - cv = cval.KFold(n, 5, shuffle=True, random_state=0) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(mean_score, 0.95) - - cv = cval.KFold(n, 5, shuffle=True, random_state=1) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(mean_score, 0.95) - - # Similarly, StratifiedKFold should try to shuffle the data as little - # as possible (while respecting the balanced class constraints) - # and thus be able to detect the dependency by not overestimating - # the CV score either. As the digits dataset is approximately balanced - # the estimated mean score is close to the score measured with - # non-shuffled KFold - - cv = cval.StratifiedKFold(y, 5) - mean_score = cval.cross_val_score(model, X, y, cv=cv).mean() - assert_greater(0.88, mean_score) - assert_greater(mean_score, 0.85) - - -def test_label_kfold(): - rng = np.random.RandomState(0) - - # Parameters of the test - n_labels = 15 - n_samples = 1000 - n_folds = 5 - - # Construct the test data - tolerance = 0.05 * n_samples # 5 percent error allowed - labels = rng.randint(0, n_labels, n_samples) - folds = cval.LabelKFold(labels, n_folds=n_folds).idxs - ideal_n_labels_per_fold = n_samples // n_folds - - # Check that folds have approximately the same size - assert_equal(len(folds), len(labels)) - for i in np.unique(folds): - assert_greater_equal(tolerance, - abs(sum(folds == i) - ideal_n_labels_per_fold)) - - # Check that each label appears only in 1 fold - for label in np.unique(labels): - assert_equal(len(np.unique(folds[labels == label])), 1) - - # Check that no label is on both sides of the split - labels = np.asarray(labels, dtype=object) - for train, test in cval.LabelKFold(labels, n_folds=n_folds): - assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) - - # Construct the test data - labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', - 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', - 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', - 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', - 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', - 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', - 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] - labels = np.asarray(labels, dtype=object) - - n_labels = len(np.unique(labels)) - n_samples = len(labels) - n_folds = 5 - tolerance = 0.05 * n_samples # 5 percent error allowed - folds = cval.LabelKFold(labels, n_folds=n_folds).idxs - ideal_n_labels_per_fold = n_samples // n_folds - - # Check that folds have approximately the same size - assert_equal(len(folds), len(labels)) - for i in np.unique(folds): - assert_greater_equal(tolerance, - abs(sum(folds == i) - ideal_n_labels_per_fold)) - - # Check that each label appears only in 1 fold - for label in np.unique(labels): - assert_equal(len(np.unique(folds[labels == label])), 1) - - # Check that no label is on both sides of the split - for train, test in cval.LabelKFold(labels, n_folds=n_folds): - assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) - - # Should 
fail if there are more folds than labels - labels = np.array([1, 1, 1, 2, 2]) - assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3) - - -def test_shuffle_split(): - ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0) - ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0) - ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0) - for typ in six.integer_types: - ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0) - for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): - assert_array_equal(t1[0], t2[0]) - assert_array_equal(t2[0], t3[0]) - assert_array_equal(t3[0], t4[0]) - assert_array_equal(t1[1], t2[1]) - assert_array_equal(t2[1], t3[1]) - assert_array_equal(t3[1], t4[1]) - - -def test_stratified_shuffle_split_init(): - y = np.asarray([0, 1, 1, 1, 2, 2, 2]) - # Check that error is raised if there is a class with only one sample - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2) - - # Check that error is raised if the test set size is smaller than n_classes - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2) - # Check that error is raised if the train set size is smaller than - # n_classes - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2) - - y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) - # Check that errors are raised if there is not enough samples - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6) - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6) - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8) - - # Train size or test size too small - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2) - assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2) - - -def test_stratified_shuffle_split_iter(): - ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), - np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), - np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), - np.array([-1] * 800 + [1] * 50) - ] - - for y in ys: - sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33, - random_state=0) - test_size = np.ceil(0.33 * len(y)) - train_size = len(y) - test_size - for train, test in sss: - assert_array_equal(np.unique(y[train]), np.unique(y[test])) - # Checks if folds keep classes proportions - p_train = (np.bincount(np.unique(y[train], - return_inverse=True)[1]) / - float(len(y[train]))) - p_test = (np.bincount(np.unique(y[test], - return_inverse=True)[1]) / - float(len(y[test]))) - assert_array_almost_equal(p_train, p_test, 1) - assert_equal(len(train) + len(test), y.size) - assert_equal(len(train), train_size) - assert_equal(len(test), test_size) - assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) - - -def test_stratified_shuffle_split_even(): - # Test the StratifiedShuffleSplit, indices are drawn with a - # equal chance - n_folds = 5 - n_iter = 1000 - - def assert_counts_are_ok(idx_counts, p): - # Here we test that the distribution of the counts - # per index is close enough to a binomial - threshold = 0.05 / n_splits - bf = stats.binom(n_splits, p) - for count in idx_counts: - p = bf.pmf(count) - assert_true(p > threshold, - "An index is not drawn with chance corresponding " - "to even draws") - - for n_samples in (6, 22): - labels = np.array((n_samples // 2) * [0, 1]) - splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter, - test_size=1. 
/ n_folds, - random_state=0) - - train_counts = [0] * n_samples - test_counts = [0] * n_samples - n_splits = 0 - for train, test in splits: - n_splits += 1 - for counter, ids in [(train_counts, train), (test_counts, test)]: - for id in ids: - counter[id] += 1 - assert_equal(n_splits, n_iter) - - assert_equal(len(train), splits.n_train) - assert_equal(len(test), splits.n_test) - assert_equal(len(set(train).intersection(test)), 0) - - label_counts = np.unique(labels) - assert_equal(splits.test_size, 1.0 / n_folds) - assert_equal(splits.n_train + splits.n_test, len(labels)) - assert_equal(len(label_counts), 2) - ex_test_p = float(splits.n_test) / n_samples - ex_train_p = float(splits.n_train) / n_samples - - assert_counts_are_ok(train_counts, ex_train_p) - assert_counts_are_ok(test_counts, ex_test_p) - - -def test_stratified_shuffle_split_overlap_train_test_bug(): - # See https://github.com/scikit-learn/scikit-learn/issues/6121 for - # the original bug report - labels = [0, 1, 2, 3] * 3 + [4, 5] * 5 - - splits = cval.StratifiedShuffleSplit(labels, n_iter=1, - test_size=0.5, random_state=0) - train, test = next(iter(splits)) - - assert_array_equal(np.intersect1d(train, test), []) - - -def test_predefinedsplit_with_kfold_split(): - # Check that PredefinedSplit can reproduce a split generated by Kfold. - folds = -1 * np.ones(10) - kf_train = [] - kf_test = [] - for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)): - kf_train.append(train_ind) - kf_test.append(test_ind) - folds[test_ind] = i - ps_train = [] - ps_test = [] - ps = cval.PredefinedSplit(folds) - for train_ind, test_ind in ps: - ps_train.append(train_ind) - ps_test.append(test_ind) - assert_array_equal(ps_train, kf_train) - assert_array_equal(ps_test, kf_test) - - -def test_label_shuffle_split(): - ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), - np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), - np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), - np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), - ] - - for y in ys: - n_iter = 6 - test_size = 1. 
/ 3 - slo = cval.LabelShuffleSplit(y, n_iter, test_size=test_size, - random_state=0) - - # Make sure the repr works - repr(slo) - - # Test that the length is correct - assert_equal(len(slo), n_iter) - - y_unique = np.unique(y) - - for train, test in slo: - # First test: no train label is in the test set and vice versa - y_train_unique = np.unique(y[train]) - y_test_unique = np.unique(y[test]) - assert_false(np.any(np.in1d(y[train], y_test_unique))) - assert_false(np.any(np.in1d(y[test], y_train_unique))) - - # Second test: train and test add up to all the data - assert_equal(y[train].size + y[test].size, y.size) - - # Third test: train and test are disjoint - assert_array_equal(np.intersect1d(train, test), []) - - # Fourth test: # unique train and test labels are correct, - # +- 1 for rounding error - assert_true(abs(len(y_test_unique) - - round(test_size * len(y_unique))) <= 1) - assert_true(abs(len(y_train_unique) - - round((1.0 - test_size) * len(y_unique))) <= 1) - - -def test_leave_label_out_changing_labels(): - # Check that LeaveOneLabelOut and LeavePLabelOut work normally if - # the labels variable is changed before calling __iter__ - labels = np.array([0, 1, 2, 1, 1, 2, 0, 0]) - labels_changing = np.array(labels, copy=True) - lolo = cval.LeaveOneLabelOut(labels) - lolo_changing = cval.LeaveOneLabelOut(labels_changing) - lplo = cval.LeavePLabelOut(labels, p=2) - lplo_changing = cval.LeavePLabelOut(labels_changing, p=2) - labels_changing[:] = 0 - for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: - for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): - assert_array_equal(train, train_chan) - assert_array_equal(test, test_chan) - - -def test_cross_val_score(): - clf = MockClassifier() - for a in range(-10, 10): - clf.a = a - # Smoke test - scores = cval.cross_val_score(clf, X, y) - assert_array_equal(scores, clf.score(X, y)) - - # test with multioutput y - scores = cval.cross_val_score(clf, X_sparse, X) - assert_array_equal(scores, clf.score(X_sparse, X)) - - scores = cval.cross_val_score(clf, X_sparse, y) - assert_array_equal(scores, clf.score(X_sparse, y)) - - # test with multioutput y - scores = cval.cross_val_score(clf, X_sparse, X) - assert_array_equal(scores, clf.score(X_sparse, X)) - - # test with X and y as list - list_check = lambda x: isinstance(x, list) - clf = CheckingClassifier(check_X=list_check) - scores = cval.cross_val_score(clf, X.tolist(), y.tolist()) - - clf = CheckingClassifier(check_y=list_check) - scores = cval.cross_val_score(clf, X, y.tolist()) - - assert_raises(ValueError, cval.cross_val_score, clf, X, y, - scoring="sklearn") - - # test with 3d X and - X_3d = X[:, :, np.newaxis] - clf = MockClassifier(allow_nd=True) - scores = cval.cross_val_score(clf, X_3d, y) - - clf = MockClassifier(allow_nd=False) - assert_raises(ValueError, cval.cross_val_score, clf, X_3d, y) - - -def test_cross_val_score_pandas(): - # check cross_val_score doesn't destroy pandas dataframe - types = [(MockDataFrame, MockDataFrame)] - try: - from pandas import Series, DataFrame - types.append((Series, DataFrame)) - except ImportError: - pass - for TargetType, InputFeatureType in types: - # X dataframe, y series - X_df, y_ser = InputFeatureType(X), TargetType(y) - check_df = lambda x: isinstance(x, InputFeatureType) - check_series = lambda x: isinstance(x, TargetType) - clf = CheckingClassifier(check_X=check_df, check_y=check_series) - cval.cross_val_score(clf, X_df, y_ser) - - -def test_cross_val_score_mask(): - # test that cross_val_score works with 
boolean masks - svm = SVC(kernel="linear") - iris = load_iris() - X, y = iris.data, iris.target - cv_indices = cval.KFold(len(y), 5) - scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices) - cv_indices = cval.KFold(len(y), 5) - cv_masks = [] - for train, test in cv_indices: - mask_train = np.zeros(len(y), dtype=np.bool) - mask_test = np.zeros(len(y), dtype=np.bool) - mask_train[train] = 1 - mask_test[test] = 1 - cv_masks.append((train, test)) - scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks) - assert_array_equal(scores_indices, scores_masks) - - -def test_cross_val_score_precomputed(): - # test for svm with precomputed kernel - svm = SVC(kernel="precomputed") - iris = load_iris() - X, y = iris.data, iris.target - linear_kernel = np.dot(X, X.T) - score_precomputed = cval.cross_val_score(svm, linear_kernel, y) - svm = SVC(kernel="linear") - score_linear = cval.cross_val_score(svm, X, y) - assert_array_equal(score_precomputed, score_linear) - - # Error raised for non-square X - svm = SVC(kernel="precomputed") - assert_raises(ValueError, cval.cross_val_score, svm, X, y) - - # test error is raised when the precomputed kernel is not array-like - # or sparse - assert_raises(ValueError, cval.cross_val_score, svm, - linear_kernel.tolist(), y) - - -def test_cross_val_score_fit_params(): - clf = MockClassifier() - n_samples = X.shape[0] - n_classes = len(np.unique(y)) - - DUMMY_INT = 42 - DUMMY_STR = '42' - DUMMY_OBJ = object() - - def assert_fit_params(clf): - # Function to test that the values are passed correctly to the - # classifier arguments for non-array type - - assert_equal(clf.dummy_int, DUMMY_INT) - assert_equal(clf.dummy_str, DUMMY_STR) - assert_equal(clf.dummy_obj, DUMMY_OBJ) - - fit_params = {'sample_weight': np.ones(n_samples), - 'class_prior': np.ones(n_classes) / n_classes, - 'sparse_sample_weight': W_sparse, - 'sparse_param': P_sparse, - 'dummy_int': DUMMY_INT, - 'dummy_str': DUMMY_STR, - 'dummy_obj': DUMMY_OBJ, - 'callback': assert_fit_params} - cval.cross_val_score(clf, X, y, fit_params=fit_params) - - -def test_cross_val_score_score_func(): - clf = MockClassifier() - _score_func_args = [] - - def score_func(y_test, y_predict): - _score_func_args.append((y_test, y_predict)) - return 1.0 - - with warnings.catch_warnings(record=True): - scoring = make_scorer(score_func) - score = cval.cross_val_score(clf, X, y, scoring=scoring) - assert_array_equal(score, [1.0, 1.0, 1.0]) - assert len(_score_func_args) == 3 - - -def test_cross_val_score_errors(): - class BrokenEstimator: - pass - - assert_raises(TypeError, cval.cross_val_score, BrokenEstimator(), X) - - -def test_train_test_split_errors(): - assert_raises(ValueError, cval.train_test_split) - assert_raises(ValueError, cval.train_test_split, range(3), train_size=1.1) - assert_raises(ValueError, cval.train_test_split, range(3), test_size=0.6, - train_size=0.6) - assert_raises(ValueError, cval.train_test_split, range(3), - test_size=np.float32(0.6), train_size=np.float32(0.6)) - assert_raises(ValueError, cval.train_test_split, range(3), - test_size="wrong_type") - assert_raises(ValueError, cval.train_test_split, range(3), test_size=2, - train_size=4) - assert_raises(TypeError, cval.train_test_split, range(3), - some_argument=1.1) - assert_raises(ValueError, cval.train_test_split, range(3), range(42)) - - -def test_train_test_split(): - X = np.arange(100).reshape((10, 10)) - X_s = coo_matrix(X) - y = np.arange(10) - - # simple test - split = cval.train_test_split(X, y, test_size=None, train_size=.5) - X_train, 
X_test, y_train, y_test = split - assert_equal(len(y_test), len(y_train)) - # test correspondence of X and y - assert_array_equal(X_train[:, 0], y_train * 10) - assert_array_equal(X_test[:, 0], y_test * 10) - - # conversion of lists to arrays (deprecated?) - with warnings.catch_warnings(record=True): - split = cval.train_test_split(X, X_s, y.tolist()) - X_train, X_test, X_s_train, X_s_test, y_train, y_test = split - assert_array_equal(X_train, X_s_train.toarray()) - assert_array_equal(X_test, X_s_test.toarray()) - - # don't convert lists to anything else by default - split = cval.train_test_split(X, X_s, y.tolist()) - X_train, X_test, X_s_train, X_s_test, y_train, y_test = split - assert_true(isinstance(y_train, list)) - assert_true(isinstance(y_test, list)) - - # allow nd-arrays - X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) - y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) - split = cval.train_test_split(X_4d, y_3d) - assert_equal(split[0].shape, (7, 5, 3, 2)) - assert_equal(split[1].shape, (3, 5, 3, 2)) - assert_equal(split[2].shape, (7, 7, 11)) - assert_equal(split[3].shape, (3, 7, 11)) - - # test stratification option - y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) - for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75], - [2, 4, 2, 4, 6]): - train, test = cval.train_test_split(y, - test_size=test_size, - stratify=y, - random_state=0) - assert_equal(len(test), exp_test_size) - assert_equal(len(test) + len(train), len(y)) - # check the 1:1 ratio of ones and twos in the data is preserved - assert_equal(np.sum(train == 1), np.sum(train == 2)) - - -def train_test_split_pandas(): - # check cross_val_score doesn't destroy pandas dataframe - types = [MockDataFrame] - try: - from pandas import DataFrame - types.append(DataFrame) - except ImportError: - pass - for InputFeatureType in types: - # X dataframe - X_df = InputFeatureType(X) - X_train, X_test = cval.train_test_split(X_df) - assert_true(isinstance(X_train, InputFeatureType)) - assert_true(isinstance(X_test, InputFeatureType)) - -def train_test_split_mock_pandas(): - # X mock dataframe - X_df = MockDataFrame(X) - X_train, X_test = cval.train_test_split(X_df) - assert_true(isinstance(X_train, MockDataFrame)) - assert_true(isinstance(X_test, MockDataFrame)) - - -def test_cross_val_score_with_score_func_classification(): - iris = load_iris() - clf = SVC(kernel='linear') - - # Default score (should be the accuracy score) - scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5) - assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2) - - # Correct classification score (aka. zero / one score) - should be the - # same as the default estimator score - zo_scores = cval.cross_val_score(clf, iris.data, iris.target, - scoring="accuracy", cv=5) - assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2) - - # F1 score (class are balanced so f1_score should be equal to zero/one - # score - f1_scores = cval.cross_val_score(clf, iris.data, iris.target, - scoring="f1_weighted", cv=5) - assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2) - - -def test_cross_val_score_with_score_func_regression(): - X, y = make_regression(n_samples=30, n_features=20, n_informative=5, - random_state=0) - reg = Ridge() - - # Default score of the Ridge regression estimator - scores = cval.cross_val_score(reg, X, y, cv=5) - assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) - - # R2 score (aka. 
determination coefficient) - should be the - # same as the default estimator score - r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5) - assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) - - # Mean squared error; this is a loss function, so "scores" are negative - neg_mse_scores = cval.cross_val_score(reg, X, y, cv=5, - scoring="neg_mean_squared_error") - expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]) - assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2) - - # Explained variance - scoring = make_scorer(explained_variance_score) - ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring) - assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) - - -def test_permutation_score(): - iris = load_iris() - X = iris.data - X_sparse = coo_matrix(X) - y = iris.target - svm = SVC(kernel='linear') - cv = cval.StratifiedKFold(y, 2) - - score, scores, pvalue = cval.permutation_test_score( - svm, X, y, n_permutations=30, cv=cv, scoring="accuracy") - assert_greater(score, 0.9) - assert_almost_equal(pvalue, 0.0, 1) - - score_label, _, pvalue_label = cval.permutation_test_score( - svm, X, y, n_permutations=30, cv=cv, scoring="accuracy", - labels=np.ones(y.size), random_state=0) - assert_true(score_label == score) - assert_true(pvalue_label == pvalue) - - # check that we obtain the same results with a sparse representation - svm_sparse = SVC(kernel='linear') - cv_sparse = cval.StratifiedKFold(y, 2) - score_label, _, pvalue_label = cval.permutation_test_score( - svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse, - scoring="accuracy", labels=np.ones(y.size), random_state=0) - - assert_true(score_label == score) - assert_true(pvalue_label == pvalue) - - # test with custom scoring object - def custom_score(y_true, y_pred): - return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) - / y_true.shape[0]) - - scorer = make_scorer(custom_score) - score, _, pvalue = cval.permutation_test_score( - svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0) - assert_almost_equal(score, .93, 2) - assert_almost_equal(pvalue, 0.01, 3) - - # set random y - y = np.mod(np.arange(len(y)), 3) - - score, scores, pvalue = cval.permutation_test_score( - svm, X, y, n_permutations=30, cv=cv, scoring="accuracy") - - assert_less(score, 0.5) - assert_greater(pvalue, 0.2) - - -def test_cross_val_generator_with_indices(): - X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - y = np.array([1, 1, 2, 2]) - labels = np.array([1, 2, 3, 4]) - # explicitly passing indices value is deprecated - loo = cval.LeaveOneOut(4) - lpo = cval.LeavePOut(4, 2) - kf = cval.KFold(4, 2) - skf = cval.StratifiedKFold(y, 2) - lolo = cval.LeaveOneLabelOut(labels) - lopo = cval.LeavePLabelOut(labels, 2) - ps = cval.PredefinedSplit([1, 1, 2, 2]) - ss = cval.ShuffleSplit(2) - for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]: - for train, test in cv: - assert_not_equal(np.asarray(train).dtype.kind, 'b') - assert_not_equal(np.asarray(train).dtype.kind, 'b') - X[train], X[test] - y[train], y[test] - - -@ignore_warnings -def test_cross_val_generator_with_default_indices(): - X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - y = np.array([1, 1, 2, 2]) - labels = np.array([1, 2, 3, 4]) - loo = cval.LeaveOneOut(4) - lpo = cval.LeavePOut(4, 2) - kf = cval.KFold(4, 2) - skf = cval.StratifiedKFold(y, 2) - lolo = cval.LeaveOneLabelOut(labels) - lopo = cval.LeavePLabelOut(labels, 2) - ss = cval.ShuffleSplit(2) - ps = cval.PredefinedSplit([1, 1, 2, 2]) - for cv 
in [loo, lpo, kf, skf, lolo, lopo, ss, ps]: - for train, test in cv: - assert_not_equal(np.asarray(train).dtype.kind, 'b') - assert_not_equal(np.asarray(train).dtype.kind, 'b') - X[train], X[test] - y[train], y[test] - - -def test_shufflesplit_errors(): - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1, - train_size=0.95) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) - assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) - assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, - train_size=None) - - -def test_shufflesplit_reproducible(): - # Check that iterating twice on the ShuffleSplit gives the same - # sequence of train-test when the random_state is given - ss = cval.ShuffleSplit(10, random_state=21) - assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) - - -def test_safe_split_with_precomputed_kernel(): - clf = SVC() - clfp = SVC(kernel="precomputed") - - iris = load_iris() - X, y = iris.data, iris.target - K = np.dot(X, X.T) - - cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0) - tr, te = list(cv)[0] - - X_tr, y_tr = cval._safe_split(clf, X, y, tr) - K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr) - assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T)) - - X_te, y_te = cval._safe_split(clf, X, y, te, tr) - K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr) - assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T)) - - -def test_cross_val_score_allow_nans(): - # Check that cross_val_score allows input data with NaNs - X = np.arange(200, dtype=np.float64).reshape(10, -1) - X[2, :] = np.nan - y = np.repeat([0, 1], X.shape[0] / 2) - p = Pipeline([ - ('imputer', Imputer(strategy='mean', missing_values='NaN')), - ('classifier', MockClassifier()), - ]) - cval.cross_val_score(p, X, y, cv=5) - - -def test_train_test_split_allow_nans(): - # Check that train_test_split allows input data with NaNs - X = np.arange(200, dtype=np.float64).reshape(10, -1) - X[2, :] = np.nan - y = np.repeat([0, 1], X.shape[0] / 2) - cval.train_test_split(X, y, test_size=0.2, random_state=42) - - -def test_permutation_test_score_allow_nans(): - # Check that permutation_test_score allows input data with NaNs - X = np.arange(200, dtype=np.float64).reshape(10, -1) - X[2, :] = np.nan - y = np.repeat([0, 1], X.shape[0] / 2) - p = Pipeline([ - ('imputer', Imputer(strategy='mean', missing_values='NaN')), - ('classifier', MockClassifier()), - ]) - cval.permutation_test_score(p, X, y, cv=5) - - -def test_check_cv_return_types(): - X = np.ones((9, 2)) - cv = cval.check_cv(3, X, classifier=False) - assert_true(isinstance(cv, cval.KFold)) - - y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) - cv = cval.check_cv(3, X, y_binary, classifier=True) - assert_true(isinstance(cv, cval.StratifiedKFold)) - - y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) - cv = cval.check_cv(3, X, y_multiclass, classifier=True) - assert_true(isinstance(cv, cval.StratifiedKFold)) - - X = np.ones((5, 2)) - y_multilabel = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 0]] - cv = cval.check_cv(3, X, y_multilabel, classifier=True) - assert_true(isinstance(cv, cval.KFold)) - - y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) - cv = cval.check_cv(3, X, y_multioutput, classifier=True) 
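
    # Aside: the kernel bookkeeping checked in
    # test_safe_split_with_precomputed_kernel above, sketched with plain
    # NumPy -- for a precomputed K = X.X^T the test block must be indexed
    # as K[test, train] so it matches dot(X_test, X_train.T).
    import numpy as np
    rng = np.random.RandomState(0)
    A = rng.rand(6, 3)
    K = np.dot(A, A.T)
    tr, te = np.array([0, 1, 2, 3]), np.array([4, 5])
    assert np.allclose(K[np.ix_(te, tr)], np.dot(A[te], A[tr].T))
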
- assert_true(isinstance(cv, cval.KFold)) - - -def test_cross_val_score_multilabel(): - X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1], - [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]]) - y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1], - [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]]) - clf = KNeighborsClassifier(n_neighbors=1) - scoring_micro = make_scorer(precision_score, average='micro') - scoring_macro = make_scorer(precision_score, average='macro') - scoring_samples = make_scorer(precision_score, average='samples') - score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5) - score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5) - score_samples = cval.cross_val_score(clf, X, y, - scoring=scoring_samples, cv=5) - assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]) - assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) - assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) - - -def test_cross_val_predict(): - boston = load_boston() - X, y = boston.data, boston.target - cv = cval.KFold(len(boston.target)) - - est = Ridge() - - # Naive loop (should be same as cross_val_predict): - preds2 = np.zeros_like(y) - for train, test in cv: - est.fit(X[train], y[train]) - preds2[test] = est.predict(X[test]) - - preds = cval.cross_val_predict(est, X, y, cv=cv) - assert_array_almost_equal(preds, preds2) - - preds = cval.cross_val_predict(est, X, y) - assert_equal(len(preds), len(y)) - - cv = cval.LeaveOneOut(len(y)) - preds = cval.cross_val_predict(est, X, y, cv=cv) - assert_equal(len(preds), len(y)) - - Xsp = X.copy() - Xsp *= (Xsp > np.median(Xsp)) - Xsp = coo_matrix(Xsp) - preds = cval.cross_val_predict(est, Xsp, y) - assert_array_almost_equal(len(preds), len(y)) - - preds = cval.cross_val_predict(KMeans(), X) - assert_equal(len(preds), len(y)) - - def bad_cv(): - for i in range(4): - yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) - - assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv()) - - -def test_cross_val_predict_input_types(): - clf = Ridge() - # Smoke test - predictions = cval.cross_val_predict(clf, X, y) - assert_equal(predictions.shape, (10,)) - - # test with multioutput y - with ignore_warnings(category=ConvergenceWarning): - predictions = cval.cross_val_predict(clf, X_sparse, X) - assert_equal(predictions.shape, (10, 2)) - - predictions = cval.cross_val_predict(clf, X_sparse, y) - assert_array_equal(predictions.shape, (10,)) - - # test with multioutput y - with ignore_warnings(category=ConvergenceWarning): - predictions = cval.cross_val_predict(clf, X_sparse, X) - assert_array_equal(predictions.shape, (10, 2)) - - # test with X and y as list - list_check = lambda x: isinstance(x, list) - clf = CheckingClassifier(check_X=list_check) - predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist()) - - clf = CheckingClassifier(check_y=list_check) - predictions = cval.cross_val_predict(clf, X, y.tolist()) - - # test with 3d X and - X_3d = X[:, :, np.newaxis] - check_3d = lambda x: x.ndim == 3 - clf = CheckingClassifier(check_X=check_3d) - predictions = cval.cross_val_predict(clf, X_3d, y) - assert_array_equal(predictions.shape, (10,)) - - -def test_cross_val_predict_pandas(): - # check cross_val_score doesn't destroy pandas dataframe - types = [(MockDataFrame, MockDataFrame)] - try: - from pandas import Series, DataFrame - types.append((Series, DataFrame)) - except ImportError: - pass - for TargetType, InputFeatureType in types: - # X dataframe, y series - X_df, y_ser = 
InputFeatureType(X), TargetType(y) - check_df = lambda x: isinstance(x, InputFeatureType) - check_series = lambda x: isinstance(x, TargetType) - clf = CheckingClassifier(check_X=check_df, check_y=check_series) - cval.cross_val_predict(clf, X_df, y_ser) - - -def test_sparse_fit_params(): - iris = load_iris() - X, y = iris.data, iris.target - clf = MockClassifier() - fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))} - a = cval.cross_val_score(clf, X, y, fit_params=fit_params) - assert_array_equal(a, np.ones(3)) - - -def test_check_is_partition(): - p = np.arange(100) - assert_true(cval._check_is_partition(p, 100)) - assert_false(cval._check_is_partition(np.delete(p, 23), 100)) - - p[0] = 23 - assert_false(cval._check_is_partition(p, 100)) - - -def test_cross_val_predict_sparse_prediction(): - # check that cross_val_predict gives same result for sparse and dense input - X, y = make_multilabel_classification(n_classes=2, n_labels=1, - allow_unlabeled=False, - return_indicator=True, - random_state=1) - X_sparse = csr_matrix(X) - y_sparse = csr_matrix(y) - classif = OneVsRestClassifier(SVC(kernel='linear')) - preds = cval.cross_val_predict(classif, X, y, cv=10) - preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10) - preds_sparse = preds_sparse.toarray() - assert_array_almost_equal(preds_sparse, preds) From 6dfe9aa732a6860ea0d24489b62efe98b289cd06 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 15:44:44 +0100 Subject: [PATCH 02/14] Fix imports (from corss_validation module to model_selection module) --- sklearn/feature_selection/rfe.py | 3 ++- sklearn/grid_search.py | 4 ++-- sklearn/learning_curve.py | 5 +++-- sklearn/tests/test_grid_search.py | 2 +- sklearn/tests/test_learning_curve.py | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 5bde9e57c3f9f..576c872982f5a 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -9,6 +9,7 @@ import numpy as np from ..utils import check_X_y, safe_sqr from ..utils.metaestimators import if_delegate_has_method +from ..utils.metaestimators import _safe_split from ..utils.validation import check_is_fitted from ..base import BaseEstimator from ..base import MetaEstimatorMixin @@ -16,7 +17,7 @@ from ..base import is_classifier from ..externals.joblib import Parallel, delayed from ..model_selection import check_cv -from ..model_selection._validation import _safe_split, _score +from ..model_selection._validation import _score from ..metrics.scorer import check_scoring from .base import SelectorMixin diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 76cdaa7cb1de5..e36d22c501621 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -21,8 +21,8 @@ from .base import BaseEstimator, is_classifier, clone from .base import MetaEstimatorMixin -from .cross_validation import check_cv -from .cross_validation import _fit_and_score +from .model_selection import check_cv +from .model_selection._validation import _fit_and_score from .externals.joblib import Parallel, delayed from .externals import six from .utils import check_random_state diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 5571138d68d83..0bb24046680ec 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -9,9 +9,10 @@ import numpy as np from .base import is_classifier, clone -from .cross_validation import check_cv +from .model_selection import check_cv from 
.externals.joblib import Parallel, delayed -from .cross_validation import _safe_split, _score, _fit_and_score +from .utils.metaestimators import _safe_split +from .model_selection._validation import _fit_and_score, _score from .metrics.scorer import check_scoring from .utils import indexable diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index f3c003e8c5be5..3605da1613e13 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -45,12 +45,12 @@ from sklearn.linear_model import Ridge from sklearn.exceptions import FitFailedWarning +from sklearn.model_selection import KFold, StratifiedKFold with warnings.catch_warnings(): warnings.simplefilter('ignore') from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV, ParameterGrid, ParameterSampler) - from sklearn.cross_validation import KFold, StratifiedKFold from sklearn.preprocessing import Imputer from sklearn.pipeline import Pipeline diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index afaae84b92b04..d75e6bc82f6b3 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -14,11 +14,11 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_false from sklearn.datasets import make_classification +from sklearn.model_selection import KFold with warnings.catch_warnings(): warnings.simplefilter('ignore') from sklearn.learning_curve import learning_curve, validation_curve - from sklearn.cross_validation import KFold from sklearn.linear_model import PassiveAggressiveClassifier From af424240be12734ef2a365fb4205892d32acd72d Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 15:45:40 +0100 Subject: [PATCH 03/14] Remove tests checking old implementation --- sklearn/model_selection/tests/test_split.py | 26 --------------------- 1 file changed, 26 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 3f54aaf3c66fc..0071129d8ce73 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1210,36 +1210,10 @@ def test_check_cv(): cv = check_cv(3, y_multioutput, classifier=True) np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) - # Check if the old style classes are wrapped to have a split method - X = np.ones(9) - y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) - cv1 = check_cv(3, y_multiclass, classifier=True) - - with warnings.catch_warnings(record=True): - from sklearn.cross_validation import StratifiedKFold as OldSKF - - cv2 = check_cv(OldSKF(y_multiclass, n_folds=3)) - np.testing.assert_equal(list(cv1.split(X, y_multiclass)), - list(cv2.split())) - assert_raises(ValueError, check_cv, cv="lolo") def test_cv_iterable_wrapper(): - y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) - - with warnings.catch_warnings(record=True): - from sklearn.cross_validation import StratifiedKFold as OldSKF - - cv = OldSKF(y_multiclass, n_folds=3) - wrapped_old_skf = _CVIterableWrapper(cv) - - # Check if split works correctly - np.testing.assert_equal(list(cv), list(wrapped_old_skf.split())) - - # Check if get_n_splits works correctly - assert_equal(len(cv), wrapped_old_skf.get_n_splits()) - kf_iter = KFold(n_splits=5).split(X, y) kf_iter_wrapped = check_cv(kf_iter) # Since the wrapped iterable is enlisted and stored, From 2362011efcbf6651ed6ce4c3cea2cafab67857e2 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: 
Thu, 9 Nov 2017 16:47:37 +0100 Subject: [PATCH 04/14] Remove grid_search and learning_curve also deprecated --- sklearn/__init__.py | 13 +- sklearn/grid_search.py | 1046 -------------------------- sklearn/learning_curve.py | 361 --------- sklearn/tests/test_grid_search.py | 815 -------------------- sklearn/tests/test_learning_curve.py | 312 -------- 5 files changed, 6 insertions(+), 2541 deletions(-) delete mode 100644 sklearn/grid_search.py delete mode 100644 sklearn/learning_curve.py delete mode 100644 sklearn/tests/test_grid_search.py delete mode 100644 sklearn/tests/test_learning_curve.py diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 27879e16be363..4c1f6f8e829e0 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -137,13 +137,12 @@ def config_context(**new_config): __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'externals', 'feature_extraction', 'feature_selection', - 'gaussian_process', 'grid_search', 'isotonic', - 'kernel_approximation', 'kernel_ridge', 'learning_curve', - 'linear_model', 'manifold', 'metrics', 'mixture', - 'model_selection', 'multiclass', 'multioutput', 'naive_bayes', - 'neighbors', 'neural_network', 'pipeline', 'preprocessing', - 'random_projection', 'semi_supervised', 'svm', 'tree', - 'discriminant_analysis', + 'gaussian_process', 'isotonic', 'kernel_approximation', + 'kernel_ridge', 'linear_model', 'manifold', 'metrics', + 'mixture', 'model_selection', 'multiclass', 'multioutput', + 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', + 'preprocessing', 'random_projection', 'semi_supervised', 'svm', + 'tree', 'discriminant_analysis', # Non-modules: 'clone'] diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py deleted file mode 100644 index e36d22c501621..0000000000000 --- a/sklearn/grid_search.py +++ /dev/null @@ -1,1046 +0,0 @@ -""" -The :mod:`sklearn.grid_search` includes utilities to fine-tune the parameters -of an estimator. -""" -from __future__ import print_function - -# Author: Alexandre Gramfort , -# Gael Varoquaux -# Andreas Mueller -# Olivier Grisel -# License: BSD 3 clause - -from abc import ABCMeta, abstractmethod -from collections import Mapping, namedtuple, Sized -from functools import partial, reduce -from itertools import product -import operator -import warnings - -import numpy as np - -from .base import BaseEstimator, is_classifier, clone -from .base import MetaEstimatorMixin -from .model_selection import check_cv -from .model_selection._validation import _fit_and_score -from .externals.joblib import Parallel, delayed -from .externals import six -from .utils import check_random_state -from .utils.random import sample_without_replacement -from .utils.validation import _num_samples, indexable -from .utils.metaestimators import if_delegate_has_method -from .metrics.scorer import check_scoring - - -__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', - 'ParameterSampler', 'RandomizedSearchCV'] - - -warnings.warn("This module was deprecated in version 0.18 in favor of the " - "model_selection module into which all the refactored classes " - "and functions are moved. This module will be removed in 0.20.", - DeprecationWarning) - - -class ParameterGrid(object): - """Grid of parameters with a discrete number of values for each. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.ParameterGrid` instead. 
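
(A minimal sketch of the drop-in replacement named in the deprecation note,
assuming scikit-learn >= 0.18; behaviour matches the examples below:)

    from sklearn.model_selection import ParameterGrid

    grid = ParameterGrid({'a': [1, 2], 'b': [True, False]})
    assert len(grid) == 4
    assert list(grid) == [{'a': 1, 'b': True}, {'a': 1, 'b': False},
                          {'a': 2, 'b': True}, {'a': 2, 'b': False}]
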
- - Can be used to iterate over parameter value combinations with the - Python built-in function iter. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - param_grid : dict of string to sequence, or sequence of such - The parameter grid to explore, as a dictionary mapping estimator - parameters to sequences of allowed values. - - An empty dict signifies default parameters. - - A sequence of dicts signifies a sequence of grids to search, and is - useful to avoid exploring parameter combinations that make no sense - or have no effect. See the examples below. - - Examples - -------- - >>> from sklearn.grid_search import ParameterGrid - >>> param_grid = {'a': [1, 2], 'b': [True, False]} - >>> list(ParameterGrid(param_grid)) == ( - ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, - ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) - True - - >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] - >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, - ... {'kernel': 'rbf', 'gamma': 1}, - ... {'kernel': 'rbf', 'gamma': 10}] - True - >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1} - True - - See also - -------- - :class:`GridSearchCV`: - uses ``ParameterGrid`` to perform a full parallelized parameter search. - """ - - def __init__(self, param_grid): - if isinstance(param_grid, Mapping): - # wrap dictionary in a singleton list to support either dict - # or list of dicts - param_grid = [param_grid] - self.param_grid = param_grid - - def __iter__(self): - """Iterate over the points in the grid. - - Returns - ------- - params : iterator over dict of string to any - Yields dictionaries mapping each estimator parameter to one of its - allowed values. - """ - for p in self.param_grid: - # Always sort the keys of a dictionary, for reproducibility - items = sorted(p.items()) - if not items: - yield {} - else: - keys, values = zip(*items) - for v in product(*values): - params = dict(zip(keys, v)) - yield params - - def __len__(self): - """Number of points on the grid.""" - # Product function that can handle iterables (np.product can't). - product = partial(reduce, operator.mul) - return sum(product(len(v) for v in p.values()) if p else 1 - for p in self.param_grid) - - def __getitem__(self, ind): - """Get the parameters that would be ``ind``th in iteration - - Parameters - ---------- - ind : int - The iteration index - - Returns - ------- - params : dict of string to any - Equal to list(self)[ind] - """ - # This is used to make discrete sampling without replacement memory - # efficient. - for sub_grid in self.param_grid: - # XXX: could memoize information used here - if not sub_grid: - if ind == 0: - return {} - else: - ind -= 1 - continue - - # Reverse so most frequent cycling parameter comes first - keys, values_lists = zip(*sorted(sub_grid.items())[::-1]) - sizes = [len(v_list) for v_list in values_lists] - total = np.product(sizes) - - if ind >= total: - # Try the next grid - ind -= total - else: - out = {} - for key, v_list, n in zip(keys, values_lists, sizes): - ind, offset = divmod(ind, n) - out[key] = v_list[offset] - return out - - raise IndexError('ParameterGrid index out of range') - - -class ParameterSampler(object): - """Generator on parameters sampled from given distributions. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.ParameterSampler` instead. - - Non-deterministic iterable over random candidate combinations for hyper- - parameter search. 
If all parameters are presented as a list, - sampling without replacement is performed. If at least one parameter - is given as a distribution, sampling with replacement is used. - It is highly recommended to use continuous distributions for continuous - parameters. - - Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept - a custom RNG instance and always use the singleton RNG from - ``numpy.random``. Hence setting ``random_state`` will not guarantee a - deterministic iteration whenever ``scipy.stats`` distributions are used to - define the parameter search space. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - param_distributions : dict - Dictionary where the keys are parameters and values - are distributions from which a parameter is to be sampled. - Distributions either have to provide a ``rvs`` function - to sample from them, or can be given as a list of values, - where a uniform distribution is assumed. - - n_iter : integer - Number of parameter settings that are produced. - - random_state : int, RandomState instance or None, optional (default=None) - Pseudo random number generator state used for random uniform sampling - from lists of possible values instead of scipy.stats distributions. - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Returns - ------- - params : dict of string to any - **Yields** dictionaries mapping each estimator parameter to - as sampled value. - - Examples - -------- - >>> from sklearn.grid_search import ParameterSampler - >>> from scipy.stats.distributions import expon - >>> import numpy as np - >>> np.random.seed(0) - >>> param_grid = {'a':[1, 2], 'b': expon()} - >>> param_list = list(ParameterSampler(param_grid, n_iter=4)) - >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) - ... for d in param_list] - >>> rounded_list == [{'b': 0.89856, 'a': 1}, - ... {'b': 0.923223, 'a': 1}, - ... {'b': 1.878964, 'a': 2}, - ... {'b': 1.038159, 'a': 2}] - True - """ - def __init__(self, param_distributions, n_iter, random_state=None): - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - - def __iter__(self): - # check if all distributions are given as lists - # in this case we want to sample without replacement - all_lists = np.all([not hasattr(v, "rvs") - for v in self.param_distributions.values()]) - rnd = check_random_state(self.random_state) - - if all_lists: - # look up sampled parameter settings in parameter grid - param_grid = ParameterGrid(self.param_distributions) - grid_size = len(param_grid) - - if grid_size < self.n_iter: - raise ValueError( - "The total space of parameters %d is smaller " - "than n_iter=%d." 
% (grid_size, self.n_iter) - + " For exhaustive searches, use GridSearchCV.") - for i in sample_without_replacement(grid_size, self.n_iter, - random_state=rnd): - yield param_grid[i] - - else: - # Always sort the keys of a dictionary, for reproducibility - items = sorted(self.param_distributions.items()) - for _ in six.moves.range(self.n_iter): - params = dict() - for k, v in items: - if hasattr(v, "rvs"): - params[k] = v.rvs() - else: - params[k] = v[rnd.randint(len(v))] - yield params - - def __len__(self): - """Number of points that will be sampled.""" - return self.n_iter - - -def fit_grid_point(X, y, estimator, parameters, train, test, scorer, - verbose, error_score='raise', **fit_params): - """Run fit on one set of parameters. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.fit_grid_point` instead. - - Parameters - ---------- - X : array-like, sparse matrix or list - Input data. - - y : array-like or None - Targets for input data. - - estimator : estimator object - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - parameters : dict - Parameters to be set on estimator for this grid point. - - train : ndarray, dtype int or bool - Boolean mask or indices for training set. - - test : ndarray, dtype int or bool - Boolean mask or indices for test set. - - scorer : callable or None. - If provided must be a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - verbose : int - Verbosity level. - - **fit_params : kwargs - Additional parameter passed to the fit function of the estimator. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - Returns - ------- - score : float - Score of this parameter setting on given training / test split. - - parameters : dict - The parameters that have been evaluated. - - n_samples_test : int - Number of test samples in this split. - """ - score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train, - test, verbose, parameters, - fit_params, error_score) - return score, parameters, n_samples_test - - -def _check_param_grid(param_grid): - if hasattr(param_grid, 'items'): - param_grid = [param_grid] - - for p in param_grid: - for name, v in p.items(): - if isinstance(v, np.ndarray) and v.ndim > 1: - raise ValueError("Parameter array should be one-dimensional.") - - check = [isinstance(v, k) for k in (list, tuple, np.ndarray)] - if True not in check: - raise ValueError("Parameter values for parameter ({0}) need " - "to be a sequence.".format(name)) - - if len(v) == 0: - raise ValueError("Parameter values for parameter ({0}) need " - "to be a non-empty sequence.".format(name)) - - -class _CVScoreTuple (namedtuple('_CVScoreTuple', - ('parameters', - 'mean_validation_score', - 'cv_validation_scores'))): - # A raw namedtuple is very memory efficient as it packs the attributes - # in a struct to get rid of the __dict__ of attributes in particular it - # does not copy the string for the keys on each instance. - # By deriving a namedtuple class just to introduce the __repr__ method we - # would also reintroduce the __dict__ on the instance. 
By telling the - # Python interpreter that this subclass uses static __slots__ instead of - # dynamic attributes. Furthermore we don't need any additional slot in the - # subclass so we set __slots__ to the empty tuple. - __slots__ = () - - def __repr__(self): - """Simple custom repr to summarize the main info""" - return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format( - self.mean_validation_score, - np.std(self.cv_validation_scores), - self.parameters) - - -class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, - MetaEstimatorMixin)): - """Base class for hyper parameter search with cross-validation.""" - - @abstractmethod - def __init__(self, estimator, scoring=None, - fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score='raise'): - - self.scoring = scoring - self.estimator = estimator - self.n_jobs = n_jobs - self.fit_params = fit_params if fit_params is not None else {} - self.iid = iid - self.refit = refit - self.cv = cv - self.verbose = verbose - self.pre_dispatch = pre_dispatch - self.error_score = error_score - - @property - def _estimator_type(self): - return self.estimator._estimator_type - - @property - def classes_(self): - return self.best_estimator_.classes_ - - def score(self, X, y=None): - """Returns the score on the given data, if the estimator has been refit. - - This uses the score defined by ``scoring`` where provided, and the - ``best_estimator_.score`` method otherwise. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Input data, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - Returns - ------- - score : float - - Notes - ----- - * The long-standing behavior of this method changed in version 0.16. - * It no longer uses the metric provided by ``estimator.score`` if the - ``scoring`` parameter was set when fitting. - - """ - if self.scorer_ is None: - raise ValueError("No score function explicitly defined, " - "and the estimator doesn't provide one %s" - % self.best_estimator_) - return self.scorer_(self.best_estimator_, X, y) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def predict(self, X): - """Call predict on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``predict``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.predict(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def predict_proba(self, X): - """Call predict_proba on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``predict_proba``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.predict_proba(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def predict_log_proba(self, X): - """Call predict_log_proba on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``predict_log_proba``. 
- - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.predict_log_proba(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def decision_function(self, X): - """Call decision_function on the estimator with the best found parameters. - - Only available if ``refit=True`` and the underlying estimator supports - ``decision_function``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.decision_function(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def transform(self, X): - """Call transform on the estimator with the best found parameters. - - Only available if the underlying estimator supports ``transform`` and - ``refit=True``. - - Parameters - ----------- - X : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.transform(X) - - @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) - def inverse_transform(self, Xt): - """Call inverse_transform on the estimator with the best found parameters. - - Only available if the underlying estimator implements ``inverse_transform`` and - ``refit=True``. - - Parameters - ----------- - Xt : indexable, length n_samples - Must fulfill the input assumptions of the - underlying estimator. - - """ - return self.best_estimator_.inverse_transform(Xt) - - def _fit(self, X, y, parameter_iterable): - """Actual fitting, performing the search over parameters.""" - - estimator = self.estimator - cv = self.cv - self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) - - n_samples = _num_samples(X) - X, y = indexable(X, y) - - if y is not None: - if len(y) != n_samples: - raise ValueError('Target variable (y) has a different number ' - 'of samples (%i) than data (X: %i samples)' - % (len(y), n_samples)) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - - if self.verbose > 0: - if isinstance(parameter_iterable, Sized): - n_candidates = len(parameter_iterable) - print("Fitting {0} folds for each of {1} candidates, totalling" - " {2} fits".format(len(cv), n_candidates, - n_candidates * len(cv))) - - base_estimator = clone(self.estimator) - - pre_dispatch = self.pre_dispatch - - out = Parallel( - n_jobs=self.n_jobs, verbose=self.verbose, - pre_dispatch=pre_dispatch - )( - delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, - train, test, self.verbose, parameters, - self.fit_params, return_parameters=True, - error_score=self.error_score) - for parameters in parameter_iterable - for train, test in cv) - - # Out is a list of triplet: score, estimator, n_test_samples - n_fits = len(out) - n_folds = len(cv) - - scores = list() - grid_scores = list() - for grid_start in range(0, n_fits, n_folds): - n_test_samples = 0 - score = 0 - all_scores = [] - for this_score, this_n_test_samples, _, parameters in \ - out[grid_start:grid_start + n_folds]: - all_scores.append(this_score) - if self.iid: - this_score *= this_n_test_samples - n_test_samples += this_n_test_samples - score += this_score - if self.iid: - score /= float(n_test_samples) - else: - score /= float(n_folds) - scores.append((score, parameters)) - # TODO: shall we also store the test_fold_sizes? 
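(A quick worked example of the ``iid`` weighting just above, with illustrative numbers: for two folds with test sizes 20 and 60 scoring 1.0 and 1/3, ``iid=True`` gives the sample-weighted mean (20 * 1.0 + 60 * 1/3) / 80 = 0.5, while ``iid=False`` gives the plain fold mean (1.0 + 1/3) / 2 ≈ 0.67.)
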
- grid_scores.append(_CVScoreTuple( - parameters, - score, - np.array(all_scores))) - # Store the computed scores - self.grid_scores_ = grid_scores - - # Find the best parameters by comparing on the mean validation score: - # note that `sorted` is deterministic in the way it breaks ties - best = sorted(grid_scores, key=lambda x: x.mean_validation_score, - reverse=True)[0] - self.best_params_ = best.parameters - self.best_score_ = best.mean_validation_score - - if self.refit: - # fit the best estimator using the entire dataset - # clone first to work around broken estimators - best_estimator = clone(base_estimator).set_params( - **best.parameters) - if y is not None: - best_estimator.fit(X, y, **self.fit_params) - else: - best_estimator.fit(X, **self.fit_params) - self.best_estimator_ = best_estimator - return self - - -class GridSearchCV(BaseSearchCV): - """Exhaustive search over specified parameter values for an estimator. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.GridSearchCV` instead. - - Important members are fit, predict. - - GridSearchCV implements a "fit" and a "score" method. - It also implements "predict", "predict_proba", "decision_function", - "transform" and "inverse_transform" if they are implemented in the - estimator used. - - The parameters of the estimator used to apply these methods are optimized - by cross-validated grid-search over a parameter grid. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object. - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - param_grid : dict or list of dictionaries - Dictionary with parameters names (string) as keys and lists of - parameter settings to try as values, or a list of such - dictionaries, in which case the grids spanned by each dictionary - in the list are explored. This enables searching over any sequence - of parameter settings. - - scoring : string, callable or None, default=None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - If ``None``, the ``score`` method of the estimator is used. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs: int, default: 1 : - The maximum number of estimators fit in parallel. - - - If -1 all CPUs are used. - - - If 1 is given, no parallel computing code is used at all, - which is useful for debugging. - - - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used. - For example, with ``n_jobs = -2`` all CPUs but one are used. - - .. versionchanged:: 0.17 - Upgraded to joblib 0.9.3. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, default=True - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - refit : boolean, default=True - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this GridSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - - Examples - -------- - >>> from sklearn import svm, grid_search, datasets - >>> iris = datasets.load_iris() - >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} - >>> svr = svm.SVC() - >>> clf = grid_search.GridSearchCV(svr, parameters) - >>> clf.fit(iris.data, iris.target) - ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - GridSearchCV(cv=None, error_score=..., - estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., - decision_function_shape='ovr', degree=..., gamma=..., - kernel='rbf', max_iter=-1, probability=False, - random_state=None, shrinking=True, tol=..., - verbose=False), - fit_params={}, iid=..., n_jobs=1, - param_grid=..., pre_dispatch=..., refit=..., - scoring=..., verbose=...) - - - Attributes - ---------- - grid_scores_ : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. - Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - best_estimator_ : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. Not available if refit=False. - - best_score_ : float - Score of best_estimator on the left out data. - - best_params_ : dict - Parameter setting that gave the best results on the hold out data. - - scorer_ : function - Scorer function used on the held out data to choose the best - parameters for the model. - - Notes - ------ - The parameters selected are those that maximize the score of the left out - data, unless an explicit score is passed in which case it is used instead. 
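The replacement class is a near drop-in; a rough sketch mirroring the doctest above, with the caveat that ``grid_scores_`` does not exist on the ``model_selection`` version, which reports per-candidate statistics through the ``cv_results_`` dict instead:

    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV

    iris = datasets.load_iris()
    clf = GridSearchCV(svm.SVC(), {'kernel': ('linear', 'rbf'), 'C': [1, 10]})
    clf.fit(iris.data, iris.target)
    print(clf.best_params_, clf.best_score_)
    print(clf.cv_results_['mean_test_score'])  # one mean score per candidate
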
- - If `n_jobs` was set to a value higher than one, the data is copied for each - point in the grid (and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - --------- - :class:`ParameterGrid`: - generates all the combinations of a hyperparameter grid. - - :func:`sklearn.cross_validation.train_test_split`: - utility function to split the data into a development set usable - for fitting a GridSearchCV instance and an evaluation set for - its final evaluation. - - :func:`sklearn.metrics.make_scorer`: - Make a scorer from a performance metric or loss function. - - """ - - def __init__(self, estimator, param_grid, scoring=None, fit_params=None, - n_jobs=1, iid=True, refit=True, cv=None, verbose=0, - pre_dispatch='2*n_jobs', error_score='raise'): - - super(GridSearchCV, self).__init__( - estimator, scoring, fit_params, n_jobs, iid, - refit, cv, verbose, pre_dispatch, error_score) - self.param_grid = param_grid - _check_param_grid(param_grid) - - def fit(self, X, y=None): - """Run fit with all sets of parameters. - - Parameters - ---------- - - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - return self._fit(X, y, ParameterGrid(self.param_grid)) - - -class RandomizedSearchCV(BaseSearchCV): - """Randomized search on hyper parameters. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :class:`sklearn.model_selection.RandomizedSearchCV` instead. - - RandomizedSearchCV implements a "fit" and a "score" method. - It also implements "predict", "predict_proba", "decision_function", - "transform" and "inverse_transform" if they are implemented in the - estimator used. - - The parameters of the estimator used to apply these methods are optimized - by cross-validated search over parameter settings. - - In contrast to GridSearchCV, not all parameter values are tried out, but - rather a fixed number of parameter settings is sampled from the specified - distributions. The number of parameter settings that are tried is - given by n_iter. - - If all parameters are presented as a list, - sampling without replacement is performed. If at least one parameter - is given as a distribution, sampling with replacement is used. - It is highly recommended to use continuous distributions for continuous - parameters. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object. - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - param_distributions : dict - Dictionary with parameters names (string) as keys and distributions - or lists of parameters to try. Distributions must provide a ``rvs`` - method for sampling (such as those from scipy.stats.distributions). - If a list is given, it is sampled uniformly. - - n_iter : int, default=10 - Number of parameter settings that are sampled. 
n_iter trades - off runtime vs quality of the solution. - - scoring : string, callable or None, default=None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - If ``None``, the ``score`` method of the estimator is used. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs: int, default: 1 : - The maximum number of estimators fit in parallel. - - - If -1 all CPUs are used. - - - If 1 is given, no parallel computing code is used at all, - which is useful for debugging. - - - For ``n_jobs`` below -1, ``(n_cpus + n_jobs + 1)`` are used. - For example, with ``n_jobs = -2`` all CPUs but one are used. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, default=True - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - refit : boolean, default=True - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this RandomizedSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - random_state : int, RandomState instance or None, optional, default=None - Pseudo random number generator state used for random uniform sampling - from lists of possible values instead of scipy.stats distributions. - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - - Attributes - ---------- - grid_scores_ : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. 
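As with the grid search, migration is an import change; a hedged sketch of the ``model_selection`` replacement (estimator, dataset and distribution are illustrative), exercising the sampling-with-replacement path described above:

    from scipy.stats import expon
    from sklearn.datasets import load_iris
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    # a continuous distribution for C, so settings are sampled with replacement
    search = RandomizedSearchCV(SVC(), {'C': expon(scale=10)}, n_iter=5,
                                random_state=0)
    search.fit(X, y)
    print(search.best_params_)
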
- Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - best_estimator_ : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. Not available if refit=False. - - best_score_ : float - Score of best_estimator on the left out data. - - best_params_ : dict - Parameter setting that gave the best results on the hold out data. - - Notes - ----- - The parameters selected are those that maximize the score of the held-out - data, according to the scoring parameter. - - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - -------- - :class:`GridSearchCV`: - Does exhaustive search over a grid of parameters. - - :class:`ParameterSampler`: - A generator over parameter settings, constructed from - param_distributions. - - """ - - def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, - fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, - verbose=0, pre_dispatch='2*n_jobs', random_state=None, - error_score='raise'): - - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - super(RandomizedSearchCV, self).__init__( - estimator=estimator, scoring=scoring, fit_params=fit_params, - n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, - pre_dispatch=pre_dispatch, error_score=error_score) - - def fit(self, X, y=None): - """Run fit on the estimator with randomly drawn parameters. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples in the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - sampled_params = ParameterSampler(self.param_distributions, - self.n_iter, - random_state=self.random_state) - return self._fit(X, y, sampled_params) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py deleted file mode 100644 index 0bb24046680ec..0000000000000 --- a/sklearn/learning_curve.py +++ /dev/null @@ -1,361 +0,0 @@ -"""Utilities to evaluate models with respect to a variable -""" -# Author: Alexander Fabisch -# -# License: BSD 3 clause - -import warnings - -import numpy as np - -from .base import is_classifier, clone -from .model_selection import check_cv -from .externals.joblib import Parallel, delayed -from .utils.metaestimators import _safe_split -from .model_selection._validation import _fit_and_score, _score -from .metrics.scorer import check_scoring -from .utils import indexable - - -warnings.warn("This module was deprecated in version 0.18 in favor of the " - "model_selection module into which all the functions are moved." 
- " This module will be removed in 0.20", - DeprecationWarning) - - -__all__ = ['learning_curve', 'validation_curve'] - - -def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), - cv=None, scoring=None, exploit_incremental_learning=False, - n_jobs=1, pre_dispatch="all", verbose=0, - error_score='raise'): - """Learning curve. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.learning_curve` instead. - - Determines cross-validated training and test scores for different training - set sizes. - - A cross-validation generator splits the whole dataset k times in training - and test data. Subsets of the training set with varying sizes will be used - to train the estimator and a score for each training subset size and the - test set will be computed. Afterwards, the scores will be averaged over - all k runs for each training subset size. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - train_sizes : array-like, shape (n_ticks,), dtype float or int - Relative or absolute numbers of training examples that will be used to - generate the learning curve. If the dtype is float, it is regarded as a - fraction of the maximum size of the training set (that is determined - by the selected validation method), i.e. it has to be within (0, 1]. - Otherwise it is interpreted as absolute sizes of the training sets. - Note that for classification the number of samples usually have to - be big enough to contain at least one sample from each class. - (default: np.linspace(0.1, 1.0, 5)) - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - exploit_incremental_learning : boolean, optional, default: False - If the estimator supports incremental learning, this will be - used to speed up fitting for different training set sizes. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. 
- - error_score : 'raise' (default) or numeric - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - Returns - ------- - train_sizes_abs : array, shape = (n_unique_ticks,), dtype int - Numbers of training examples that has been used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. - - Notes - ----- - See :ref:`examples/model_selection/plot_learning_curve.py - ` - """ - if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): - raise ValueError("An estimator must support the partial_fit interface " - "to exploit incremental learning") - - X, y = indexable(X, y) - # Make a list since we will be iterating multiple times over the folds - cv = list(check_cv(cv, X, y, classifier=is_classifier(estimator))) - scorer = check_scoring(estimator, scoring=scoring) - - # HACK as long as boolean indices are allowed in cv generators - if cv[0][0].dtype == bool: - new_cv = [] - for i in range(len(cv)): - new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0])) - cv = new_cv - - n_max_training_samples = len(cv[0][0]) - # Because the lengths of folds can be significantly different, it is - # not guaranteed that we use all of the available training data when we - # use the first 'n_max_training_samples' samples. - train_sizes_abs = _translate_train_sizes(train_sizes, - n_max_training_samples) - n_unique_ticks = train_sizes_abs.shape[0] - if verbose > 0: - print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - if exploit_incremental_learning: - classes = np.unique(y) if is_classifier(estimator) else None - out = parallel(delayed(_incremental_fit_estimator)( - clone(estimator), X, y, classes, train, test, train_sizes_abs, - scorer, verbose) for train, test in cv) - else: - out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, scorer, train[:n_train_samples], test, - verbose, parameters=None, fit_params=None, return_train_score=True, - error_score=error_score) - for train, test in cv for n_train_samples in train_sizes_abs) - out = np.array(out)[:, :2] - n_cv_folds = out.shape[0] // n_unique_ticks - out = out.reshape(n_cv_folds, n_unique_ticks, 2) - - out = np.asarray(out).transpose((2, 1, 0)) - - return train_sizes_abs, out[0], out[1] - - -def _translate_train_sizes(train_sizes, n_max_training_samples): - """Determine absolute sizes of training subsets and validate 'train_sizes'. - - Examples: - _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] - _translate_train_sizes([5, 10], 10) -> [5, 10] - - Parameters - ---------- - train_sizes : array-like, shape (n_ticks,), dtype float or int - Numbers of training examples that will be used to generate the - learning curve. If the dtype is float, it is regarded as a - fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. - - n_max_training_samples : int - Maximum number of training samples (upper bound of 'train_sizes'). 
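The same fraction-versus-absolute translation is observable through the public replacement; a small sketch, assuming iris under 3-fold stratified CV so that the largest training split holds 100 samples:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import learning_curve
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    # floats in (0, 1] are scaled by the largest training-split size (100 here);
    # integers would be used as absolute sizes, then deduplicated
    sizes, train_scores, test_scores = learning_curve(
        DecisionTreeClassifier(random_state=0), X, y,
        train_sizes=np.linspace(0.1, 1.0, 5), cv=3)
    print(sizes)  # e.g. [ 10  32  55  77 100]
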
- - Returns - ------- - train_sizes_abs : array, shape (n_unique_ticks,), dtype int - Numbers of training examples that will be used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - """ - train_sizes_abs = np.asarray(train_sizes) - n_ticks = train_sizes_abs.shape[0] - n_min_required_samples = np.min(train_sizes_abs) - n_max_required_samples = np.max(train_sizes_abs) - if np.issubdtype(train_sizes_abs.dtype, np.floating): - if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("train_sizes has been interpreted as fractions " - "of the maximum number of training samples and " - "must be within (0, 1], but is within [%f, %f]." - % (n_min_required_samples, - n_max_required_samples)) - train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( - dtype=np.int, copy=False) - train_sizes_abs = np.clip(train_sizes_abs, 1, - n_max_training_samples) - else: - if (n_min_required_samples <= 0 or - n_max_required_samples > n_max_training_samples): - raise ValueError("train_sizes has been interpreted as absolute " - "numbers of training samples and must be within " - "(0, %d], but is within [%d, %d]." - % (n_max_training_samples, - n_min_required_samples, - n_max_required_samples)) - - train_sizes_abs = np.unique(train_sizes_abs) - if n_ticks > train_sizes_abs.shape[0]: - warnings.warn("Removed duplicate entries from 'train_sizes'. Number " - "of ticks will be less than the size of " - "'train_sizes' %d instead of %d)." - % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) - - return train_sizes_abs - - -def _incremental_fit_estimator(estimator, X, y, classes, train, test, - train_sizes, scorer, verbose): - """Train estimator on training subsets incrementally and compute scores.""" - train_scores, test_scores = [], [] - partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) - for n_train_samples, partial_train in partitions: - train_subset = train[:n_train_samples] - X_train, y_train = _safe_split(estimator, X, y, train_subset) - X_partial_train, y_partial_train = _safe_split(estimator, X, y, - partial_train) - X_test, y_test = _safe_split(estimator, X, y, test, train_subset) - if y_partial_train is None: - estimator.partial_fit(X_partial_train, classes=classes) - else: - estimator.partial_fit(X_partial_train, y_partial_train, - classes=classes) - train_scores.append(_score(estimator, X_train, y_train, scorer)) - test_scores.append(_score(estimator, X_test, y_test, scorer)) - return np.array((train_scores, test_scores)).T - - -def validation_curve(estimator, X, y, param_name, param_range, cv=None, - scoring=None, n_jobs=1, pre_dispatch="all", verbose=0): - """Validation curve. - - .. deprecated:: 0.18 - This module will be removed in 0.20. - Use :func:`sklearn.model_selection.validation_curve` instead. - - Determine training and test scores for varying parameter values. - - Compute scores for an estimator with different values of a specified - parameter. This is similar to grid search with one parameter. However, this - will also compute training scores and is merely a utility for plotting the - results. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. 
- - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - param_name : string - Name of the parameter that will be varied. - - param_range : array-like, shape (n_values,) - The values of the parameter that will be evaluated. - - cv : int, cross-validation generator or an iterable, optional - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. - - For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. In all - other cases, :class:`sklearn.model_selection.KFold` is used. - - Refer :ref:`User Guide ` for the various - cross-validation strategies that can be used here. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. - - Returns - ------- - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. 
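The ``model_selection`` replacement keeps the same core signature and the (n_ticks, n_cv_folds) return shapes; a brief sketch with an illustrative estimator and parameter range:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import validation_curve
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    train_scores, test_scores = validation_curve(
        SVC(), X, y, param_name='gamma',
        param_range=np.logspace(-6, -1, 5), cv=3)
    print(test_scores.mean(axis=1))  # one mean validation score per gamma
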
- - Notes - ----- - See - :ref:`examples/model_selection/plot_validation_curve.py - ` - """ - X, y = indexable(X, y) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, scorer, train, test, verbose, - parameters={param_name: v}, fit_params=None, return_train_score=True) - for train, test in cv for v in param_range) - - out = np.asarray(out)[:, :2] - n_params = len(param_range) - n_cv_folds = out.shape[0] // n_params - out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) - - return out[0], out[1] diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py deleted file mode 100644 index 3605da1613e13..0000000000000 --- a/sklearn/tests/test_grid_search.py +++ /dev/null @@ -1,815 +0,0 @@ -""" -Testing for grid search module (sklearn.grid_search) - -""" - -from collections import Iterable, Sized -from sklearn.externals.six.moves import cStringIO as StringIO -from sklearn.externals.six.moves import xrange -from itertools import chain, product -import pickle -import warnings -import sys - -import numpy as np -import scipy.sparse as sp - -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_raise_message -from sklearn.utils.testing import assert_false, assert_true -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_no_warnings -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.mocking import CheckingClassifier, MockDataFrame - -from scipy.stats import bernoulli, expon, uniform - -from sklearn.externals.six.moves import zip -from sklearn.base import BaseEstimator -from sklearn.datasets import make_classification -from sklearn.datasets import make_blobs -from sklearn.datasets import make_multilabel_classification -from sklearn.svm import LinearSVC, SVC -from sklearn.tree import DecisionTreeRegressor -from sklearn.tree import DecisionTreeClassifier -from sklearn.cluster import KMeans -from sklearn.neighbors import KernelDensity -from sklearn.metrics import f1_score -from sklearn.metrics import make_scorer -from sklearn.metrics import roc_auc_score -from sklearn.linear_model import Ridge - -from sklearn.exceptions import FitFailedWarning -from sklearn.model_selection import KFold, StratifiedKFold - -with warnings.catch_warnings(): - warnings.simplefilter('ignore') - from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV, - ParameterGrid, ParameterSampler) - -from sklearn.preprocessing import Imputer -from sklearn.pipeline import Pipeline - - -# Neither of the following two estimators inherit from BaseEstimator, -# to test hyperparameter search on user-defined classifiers. 
-class MockClassifier(object): - """Dummy classifier to test the cross-validation""" - def __init__(self, foo_param=0): - self.foo_param = foo_param - - def fit(self, X, Y): - assert_true(len(X) == len(Y)) - return self - - def predict(self, T): - return T.shape[0] - - def transform(self, X): - return X - self.foo_param - - def inverse_transform(self, X): - return X + self.foo_param - - predict_proba = predict - decision_function = predict - - def score(self, X=None, Y=None): - if self.foo_param > 1: - score = 1. - else: - score = 0. - return score - - def get_params(self, deep=False): - return {'foo_param': self.foo_param} - - def set_params(self, **params): - self.foo_param = params['foo_param'] - return self - - -class LinearSVCNoScore(LinearSVC): - """An LinearSVC classifier that has no score method.""" - @property - def score(self): - raise AttributeError - -X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) -y = np.array([1, 1, 2, 2]) - - -def assert_grid_iter_equals_getitem(grid): - assert_equal(list(grid), [grid[i] for i in range(len(grid))]) - - -def test_parameter_grid(): - # Test basic properties of ParameterGrid. - params1 = {"foo": [1, 2, 3]} - grid1 = ParameterGrid(params1) - assert_true(isinstance(grid1, Iterable)) - assert_true(isinstance(grid1, Sized)) - assert_equal(len(grid1), 3) - assert_grid_iter_equals_getitem(grid1) - - params2 = {"foo": [4, 2], - "bar": ["ham", "spam", "eggs"]} - grid2 = ParameterGrid(params2) - assert_equal(len(grid2), 6) - - # loop to assert we can iterate over the grid multiple times - for i in xrange(2): - # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2) - points = set(tuple(chain(*(sorted(p.items())))) for p in grid2) - assert_equal(points, - set(("bar", x, "foo", y) - for x, y in product(params2["bar"], params2["foo"]))) - - assert_grid_iter_equals_getitem(grid2) - - # Special case: empty grid (useful to get default estimator settings) - empty = ParameterGrid({}) - assert_equal(len(empty), 1) - assert_equal(list(empty), [{}]) - assert_grid_iter_equals_getitem(empty) - assert_raises(IndexError, lambda: empty[1]) - - has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}]) - assert_equal(len(has_empty), 4) - assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}, {'C': .5}]) - assert_grid_iter_equals_getitem(has_empty) - - -def test_grid_search(): - # Test that the best estimator contains the right value for foo_param - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) - # make sure it selects the smallest parameter in case of ties - old_stdout = sys.stdout - sys.stdout = StringIO() - grid_search.fit(X, y) - sys.stdout = old_stdout - assert_equal(grid_search.best_estimator_.foo_param, 2) - - for i, foo_i in enumerate([1, 2, 3]): - assert_true(grid_search.grid_scores_[i][0] - == {'foo_param': foo_i}) - # Smoke test the score etc: - grid_search.score(X, y) - grid_search.predict_proba(X) - grid_search.decision_function(X) - grid_search.transform(X) - - # Test exception handling on scoring - grid_search.scoring = 'sklearn' - assert_raises(ValueError, grid_search.fit, X, y) - - -def test_transform_inverse_transform_round_trip(): - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3) - grid_search.fit(X, y) - X_round_trip = grid_search.inverse_transform(grid_search.transform(X)) - assert_array_equal(X, X_round_trip) - - -@ignore_warnings -def test_grid_search_no_score(): - # Test grid-search on classifier that has no score function. 
- clf = LinearSVC(random_state=0) - X, y = make_blobs(random_state=0, centers=2) - Cs = [.1, 1, 10] - clf_no_score = LinearSVCNoScore(random_state=0) - grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy') - grid_search.fit(X, y) - - grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs}, - scoring='accuracy') - # smoketest grid search - grid_search_no_score.fit(X, y) - - # check that best params are equal - assert_equal(grid_search_no_score.best_params_, grid_search.best_params_) - # check that we can call score and that it gives the correct result - assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y)) - - # giving no scoring function raises an error - grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs}) - assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit, - [[1]]) - - -def test_grid_search_score_method(): - X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, - random_state=0) - clf = LinearSVC(random_state=0) - grid = {'C': [.1]} - - search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) - search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y) - search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid, - scoring='roc_auc').fit(X, y) - search_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y) - - # ChangedBehaviourWarning occurred previously (prior to #9005) - score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y) - score_accuracy = assert_no_warnings(search_accuracy.score, X, y) - score_no_score_auc = assert_no_warnings(search_no_score_method_auc.score, - X, y) - score_auc = assert_no_warnings(search_auc.score, X, y) - - # ensure the test is sane - assert_true(score_auc < 1.0) - assert_true(score_accuracy < 1.0) - assert_not_equal(score_auc, score_accuracy) - - assert_almost_equal(score_accuracy, score_no_scoring) - assert_almost_equal(score_auc, score_no_score_auc) - - -def test_trivial_grid_scores(): - # Test search over a "grid" with only one point. - # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV. - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1]}) - grid_search.fit(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1) - random_search.fit(X, y) - assert_true(hasattr(random_search, "grid_scores_")) - - -def test_no_refit(): - # Test that grid search can be used for model selection only - clf = MockClassifier() - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False) - grid_search.fit(X, y) - assert_true(hasattr(grid_search, "best_params_")) - - -def test_grid_search_error(): - # Test that grid search will capture errors on data with different - # length - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - assert_raises(ValueError, cv.fit, X_[:180], y_) - - -def test_grid_search_iid(): - # test the iid parameter - # noise-free simple 2d-data - X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, - cluster_std=0.1, shuffle=False, n_samples=80) - # split dataset into two folds that are not iid - # first one contains data of all 4 blobs, second only from two. 
- mask = np.ones(X.shape[0], dtype=np.bool) - mask[np.where(y == 1)[0][::2]] = 0 - mask[np.where(y == 2)[0][::2]] = 0 - # this leads to perfect classification on one fold and a score of 1/3 on - # the other - svm = SVC(kernel='linear') - # create "cv" for splits - cv = [[mask, ~mask], [~mask, mask]] - # once with iid=True (default) - grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv) - grid_search.fit(X, y) - first = grid_search.grid_scores_[0] - assert_equal(first.parameters['C'], 1) - assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.]) - # for first split, 1/4 of dataset is in test, for second 3/4. - # take weighted average - assert_almost_equal(first.mean_validation_score, - 1 * 1. / 4. + 1. / 3. * 3. / 4.) - - # once with iid=False - grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv, - iid=False) - grid_search.fit(X, y) - first = grid_search.grid_scores_[0] - assert_equal(first.parameters['C'], 1) - # scores are the same as above - assert_array_almost_equal(first.cv_validation_scores, [1, 1. / 3.]) - # averaged score is just mean of scores - assert_almost_equal(first.mean_validation_score, - np.mean(first.cv_validation_scores)) - - -def test_grid_search_one_grid_point(): - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} - - clf = SVC() - cv = GridSearchCV(clf, param_dict) - cv.fit(X_, y_) - - clf = SVC(C=1.0, kernel="rbf", gamma=0.1) - clf.fit(X_, y_) - - assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_) - - -def test_grid_search_bad_param_grid(): - param_dict = {"C": 1.0} - clf = SVC() - assert_raises(ValueError, GridSearchCV, clf, param_dict) - - param_dict = {"C": []} - clf = SVC() - assert_raises(ValueError, GridSearchCV, clf, param_dict) - - param_dict = {"C": np.ones(6).reshape(3, 2)} - clf = SVC() - assert_raises(ValueError, GridSearchCV, clf, param_dict) - - -def test_grid_search_sparse(): - # Test that grid search works with both dense and sparse matrices - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - cv.fit(X_[:180], y_[:180]) - y_pred = cv.predict(X_[180:]) - C = cv.best_estimator_.C - - X_ = sp.csr_matrix(X_) - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - cv.fit(X_[:180].tocoo(), y_[:180]) - y_pred2 = cv.predict(X_[180:]) - C2 = cv.best_estimator_.C - - assert_true(np.mean(y_pred == y_pred2) >= .9) - assert_equal(C, C2) - - -def test_grid_search_sparse_scoring(): - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") - cv.fit(X_[:180], y_[:180]) - y_pred = cv.predict(X_[180:]) - C = cv.best_estimator_.C - - X_ = sp.csr_matrix(X_) - clf = LinearSVC() - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") - cv.fit(X_[:180], y_[:180]) - y_pred2 = cv.predict(X_[180:]) - C2 = cv.best_estimator_.C - - assert_array_equal(y_pred, y_pred2) - assert_equal(C, C2) - # Smoke test the score - # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), - # cv.score(X_[:180], y[:180])) - - # test loss where greater is worse - def f1_loss(y_true_, y_pred_): - return -f1_score(y_true_, y_pred_) - F1Loss = make_scorer(f1_loss, greater_is_better=False) - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss) - cv.fit(X_[:180], y_[:180]) - y_pred3 = cv.predict(X_[180:]) - C3 = cv.best_estimator_.C - - 
assert_equal(C, C3) - assert_array_equal(y_pred, y_pred3) - - -def test_grid_search_precomputed_kernel(): - # Test that grid search works when the input features are given in the - # form of a precomputed kernel matrix - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - - # compute the training kernel matrix corresponding to the linear kernel - K_train = np.dot(X_[:180], X_[:180].T) - y_train = y_[:180] - - clf = SVC(kernel='precomputed') - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - cv.fit(K_train, y_train) - - assert_true(cv.best_score_ >= 0) - - # compute the test kernel matrix - K_test = np.dot(X_[180:], X_[:180].T) - y_test = y_[180:] - - y_pred = cv.predict(K_test) - - assert_true(np.mean(y_pred == y_test) >= 0) - - # test error is raised when the precomputed kernel is not array-like - # or sparse - assert_raises(ValueError, cv.fit, K_train.tolist(), y_train) - - -def test_grid_search_precomputed_kernel_error_nonsquare(): - # Test that grid search returns an error with a non-square precomputed - # training kernel matrix - K_train = np.zeros((10, 20)) - y_train = np.ones((10, )) - clf = SVC(kernel='precomputed') - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - assert_raises(ValueError, cv.fit, K_train, y_train) - - -def test_grid_search_precomputed_kernel_error_kernel_function(): - # Test that grid search returns an error when using a kernel_function - X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) - kernel_function = lambda x1, x2: np.dot(x1, x2.T) - clf = SVC(kernel=kernel_function) - cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) - assert_raises(ValueError, cv.fit, X_, y_) - - -class BrokenClassifier(BaseEstimator): - """Broken classifier that cannot be fit twice""" - - def __init__(self, parameter=None): - self.parameter = parameter - - def fit(self, X, y): - assert_true(not hasattr(self, 'has_been_fit_')) - self.has_been_fit_ = True - - def predict(self, X): - return np.zeros(X.shape[0]) - - -@ignore_warnings -def test_refit(): - # Regression test for bug in refitting - # Simulates re-fitting a broken estimator; this used to break with - # sparse SVMs. 
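
An estimator whose ``fit`` may only run once survives a grid search because the search never reuses a single instance: every candidate/fold fit happens on a fresh ``sklearn.base.clone`` of the estimator, and the final refit uses one more clone. A minimal sketch of that contract::

    from sklearn.base import clone
    from sklearn.svm import LinearSVC

    est = LinearSVC(C=10)
    fresh = clone(est)  # unfitted copy carrying the same parameters

    assert fresh is not est
    assert fresh.get_params() == est.get_params()
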
- X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - clf = GridSearchCV(BrokenClassifier(), [{'parameter': [0, 1]}], - scoring="precision", refit=True) - clf.fit(X, y) - - -def test_gridsearch_nd(): - # Pass X as list in GridSearchCV - X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) - y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) - check_X = lambda x: x.shape[1:] == (5, 3, 2) - check_y = lambda x: x.shape[1:] == (7, 11) - clf = CheckingClassifier(check_X=check_X, check_y=check_y) - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}) - grid_search.fit(X_4d, y_3d).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_X_as_list(): - # Pass X as list in GridSearchCV - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - clf = CheckingClassifier(check_X=lambda x: isinstance(x, list)) - cv = KFold(n=len(X), n_folds=3) - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) - grid_search.fit(X.tolist(), y).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_y_as_list(): - # Pass y as list in GridSearchCV - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - clf = CheckingClassifier(check_y=lambda x: isinstance(x, list)) - cv = KFold(n=len(X), n_folds=3) - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv) - grid_search.fit(X, y.tolist()).score(X, y) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_pandas_input(): - # check cross_val_score doesn't destroy pandas dataframe - types = [(MockDataFrame, MockDataFrame)] - try: - from pandas import Series, DataFrame - types.append((DataFrame, Series)) - except ImportError: - pass - - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - - for InputFeatureType, TargetType in types: - # X dataframe, y series - X_df, y_ser = InputFeatureType(X), TargetType(y) - check_df = lambda x: isinstance(x, InputFeatureType) - check_series = lambda x: isinstance(x, TargetType) - clf = CheckingClassifier(check_X=check_df, check_y=check_series) - - grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}) - grid_search.fit(X_df, y_ser).score(X_df, y_ser) - grid_search.predict(X_df) - assert_true(hasattr(grid_search, "grid_scores_")) - - -def test_unsupervised_grid_search(): - # test grid-search with unsupervised estimator - X, y = make_blobs(random_state=0) - km = KMeans(random_state=0) - grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]), - scoring='adjusted_rand_score') - grid_search.fit(X, y) - # ARI can find the right number :) - assert_equal(grid_search.best_params_["n_clusters"], 3) - - # Now without a score, and without y - grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4])) - grid_search.fit(X) - assert_equal(grid_search.best_params_["n_clusters"], 4) - - -def test_gridsearch_no_predict(): - # test grid-search with an estimator without predict. 
- # slight duplication of a test from KDE - def custom_scoring(estimator, X): - return 42 if estimator.bandwidth == .1 else 0 - X, _ = make_blobs(cluster_std=.1, random_state=1, - centers=[[0, 1], [1, 0], [0, 0]]) - search = GridSearchCV(KernelDensity(), - param_grid=dict(bandwidth=[.01, .1, 1]), - scoring=custom_scoring) - search.fit(X) - assert_equal(search.best_params_['bandwidth'], .1) - assert_equal(search.best_score_, 42) - - -def test_param_sampler(): - # test basic properties of param sampler - param_distributions = {"kernel": ["rbf", "linear"], - "C": uniform(0, 1)} - sampler = ParameterSampler(param_distributions=param_distributions, - n_iter=10, random_state=0) - samples = [x for x in sampler] - assert_equal(len(samples), 10) - for sample in samples: - assert_true(sample["kernel"] in ["rbf", "linear"]) - assert_true(0 <= sample["C"] <= 1) - - -def test_randomized_search_grid_scores(): - # Make a dataset with a lot of noise to get various kind of prediction - # errors across CV folds and parameter settings - X, y = make_classification(n_samples=200, n_features=100, n_informative=3, - random_state=0) - - # XXX: as of today (scipy 0.12) it's not possible to set the random seed - # of scipy.stats distributions: the assertions in this test should thus - # not depend on the randomization - params = dict(C=expon(scale=10), - gamma=expon(scale=0.1)) - n_cv_iter = 3 - n_search_iter = 30 - search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter, - param_distributions=params, iid=False) - search.fit(X, y) - assert_equal(len(search.grid_scores_), n_search_iter) - - # Check consistency of the structure of each cv_score item - for cv_score in search.grid_scores_: - assert_equal(len(cv_score.cv_validation_scores), n_cv_iter) - # Because we set iid to False, the mean_validation score is the - # mean of the fold mean scores instead of the aggregate sample-wise - # mean score - assert_almost_equal(np.mean(cv_score.cv_validation_scores), - cv_score.mean_validation_score) - assert_equal(list(sorted(cv_score.parameters.keys())), - list(sorted(params.keys()))) - - # Check the consistency with the best_score_ and best_params_ attributes - sorted_grid_scores = list(sorted(search.grid_scores_, - key=lambda x: x.mean_validation_score)) - best_score = sorted_grid_scores[-1].mean_validation_score - assert_equal(search.best_score_, best_score) - - tied_best_params = [s.parameters for s in sorted_grid_scores - if s.mean_validation_score == best_score] - assert_true(search.best_params_ in tied_best_params, - "best_params_={0} is not part of the" - " tied best models: {1}".format( - search.best_params_, tied_best_params)) - - -def test_grid_search_score_consistency(): - # test that correct scores are used - clf = LinearSVC(random_state=0) - X, y = make_blobs(random_state=0, centers=2) - Cs = [.1, 1, 10] - for score in ['f1', 'roc_auc']: - grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score) - grid_search.fit(X, y) - cv = StratifiedKFold(n_folds=3, y=y) - for C, scores in zip(Cs, grid_search.grid_scores_): - clf.set_params(C=C) - scores = scores[2] # get the separate runs from grid scores - i = 0 - for train, test in cv: - clf.fit(X[train], y[train]) - if score == "f1": - correct_score = f1_score(y[test], clf.predict(X[test])) - elif score == "roc_auc": - dec = clf.decision_function(X[test]) - correct_score = roc_auc_score(y[test], dec) - assert_almost_equal(correct_score, scores[i]) - i += 1 - - -def test_pickle(): - # Test that a fit search can be pickled - clf = MockClassifier() - 
grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True) - grid_search.fit(X, y) - pickle.dumps(grid_search) # smoke test - - random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]}, - refit=True, n_iter=3) - random_search.fit(X, y) - pickle.dumps(random_search) # smoke test - - -def test_grid_search_with_multioutput_data(): - # Test search with multi-output estimator - - X, y = make_multilabel_classification(random_state=0) - - est_parameters = {"max_depth": [1, 2, 3, 4]} - cv = KFold(y.shape[0], random_state=0) - - estimators = [DecisionTreeRegressor(random_state=0), - DecisionTreeClassifier(random_state=0)] - - # Test with grid search cv - for est in estimators: - grid_search = GridSearchCV(est, est_parameters, cv=cv) - grid_search.fit(X, y) - for parameters, _, cv_validation_scores in grid_search.grid_scores_: - est.set_params(**parameters) - - for i, (train, test) in enumerate(cv): - est.fit(X[train], y[train]) - correct_score = est.score(X[test], y[test]) - assert_almost_equal(correct_score, - cv_validation_scores[i]) - - # Test with a randomized search - for est in estimators: - random_search = RandomizedSearchCV(est, est_parameters, - cv=cv, n_iter=3) - random_search.fit(X, y) - for parameters, _, cv_validation_scores in random_search.grid_scores_: - est.set_params(**parameters) - - for i, (train, test) in enumerate(cv): - est.fit(X[train], y[train]) - correct_score = est.score(X[test], y[test]) - assert_almost_equal(correct_score, - cv_validation_scores[i]) - - -def test_predict_proba_disabled(): - # Test predict_proba when disabled on estimator. - X = np.arange(20).reshape(5, -1) - y = [0, 0, 1, 1, 1] - clf = SVC(probability=False) - gs = GridSearchCV(clf, {}, cv=2).fit(X, y) - assert_false(hasattr(gs, "predict_proba")) - - -def test_grid_search_allows_nans(): - # Test GridSearchCV with Imputer - X = np.arange(20, dtype=np.float64).reshape(5, -1) - X[2, :] = np.nan - y = [0, 0, 1, 1, 1] - p = Pipeline([ - ('imputer', Imputer(strategy='mean', missing_values='NaN')), - ('classifier', MockClassifier()), - ]) - GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y) - - -class FailingClassifier(BaseEstimator): - """Classifier that raises a ValueError on fit()""" - - FAILING_PARAMETER = 2 - - def __init__(self, parameter=None): - self.parameter = parameter - - def fit(self, X, y=None): - if self.parameter == FailingClassifier.FAILING_PARAMETER: - raise ValueError("Failing classifier failed as required") - - def predict(self, X): - return np.zeros(X.shape[0]) - - -def test_grid_search_failing_classifier(): - # GridSearchCV with on_error != 'raise' - # Ensures that a warning is raised and score reset where appropriate. - - X, y = make_classification(n_samples=20, n_features=10, random_state=0) - - clf = FailingClassifier() - - # refit=False because we only want to check that errors caused by fits - # to individual folds will be caught and warnings raised instead. If - # refit was done, then an exception would be raised on refit and not - # caught by grid_search (expected behavior), and this would cause an - # error in this test. - gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', - refit=False, error_score=0.0) - - assert_warns(FitFailedWarning, gs.fit, X, y) - - # Ensure that grid scores were set to zero as required for those fits - # that are expected to fail. 
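
Concretely, ``error_score`` substitutes the given value for the score of every failed fit while a ``FitFailedWarning`` is emitted, and the same contract carries over to the replacement module. A minimal sketch against ``sklearn.model_selection.GridSearchCV``, with a hypothetical ``Failing`` estimator mirroring the one above::

    import warnings

    import numpy as np
    from sklearn.base import BaseEstimator
    from sklearn.model_selection import GridSearchCV

    class Failing(BaseEstimator):
        # hypothetical estimator that fails for one parameter value
        def __init__(self, parameter=None):
            self.parameter = parameter

        def fit(self, X, y=None):
            if self.parameter == 2:
                raise ValueError("failing as required")
            return self

        def score(self, X, y=None):
            return 1.0

    X = np.arange(20).reshape(10, 2)
    y = np.array([0] * 5 + [1] * 5)

    gs = GridSearchCV(Failing(), {'parameter': [0, 1, 2]}, cv=2,
                      refit=False, error_score=0.0)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')  # one FitFailedWarning per failed fit
        gs.fit(X, y)

    # the failing candidate carries the substituted score
    assert gs.cv_results_['mean_test_score'][2] == 0.0
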
- assert all(np.all(this_point.cv_validation_scores == 0.0) - for this_point in gs.grid_scores_ - if this_point.parameters['parameter'] == - FailingClassifier.FAILING_PARAMETER) - - gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', - refit=False, error_score=float('nan')) - assert_warns(FitFailedWarning, gs.fit, X, y) - assert all(np.all(np.isnan(this_point.cv_validation_scores)) - for this_point in gs.grid_scores_ - if this_point.parameters['parameter'] == - FailingClassifier.FAILING_PARAMETER) - - -def test_grid_search_failing_classifier_raise(): - # GridSearchCV with on_error == 'raise' raises the error - - X, y = make_classification(n_samples=20, n_features=10, random_state=0) - - clf = FailingClassifier() - - # refit=False because we want to test the behaviour of the grid search part - gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', - refit=False, error_score='raise') - - # FailingClassifier issues a ValueError so this is what we look for. - assert_raises(ValueError, gs.fit, X, y) - - -def test_parameters_sampler_replacement(): - # raise error if n_iter too large - params = {'first': [0, 1], 'second': ['a', 'b', 'c']} - sampler = ParameterSampler(params, n_iter=7) - assert_raises(ValueError, list, sampler) - # degenerates to GridSearchCV if n_iter the same as grid_size - sampler = ParameterSampler(params, n_iter=6) - samples = list(sampler) - assert_equal(len(samples), 6) - for values in ParameterGrid(params): - assert_true(values in samples) - - # test sampling without replacement in a large grid - params = {'a': range(10), 'b': range(10), 'c': range(10)} - sampler = ParameterSampler(params, n_iter=99, random_state=42) - samples = list(sampler) - assert_equal(len(samples), 99) - hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c']) - for p in samples] - assert_equal(len(set(hashable_samples)), 99) - - # doesn't go into infinite loops - params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']} - sampler = ParameterSampler(params_distribution, n_iter=7) - samples = list(sampler) - assert_equal(len(samples), 7) - - -def test_classes__property(): - # Test that classes_ property matches best_esimator_.classes_ - X = np.arange(100).reshape(10, 10) - y = np.array([0] * 5 + [1] * 5) - Cs = [.1, 1, 10] - - grid_search = GridSearchCV(LinearSVC(random_state=0), {'C': Cs}) - grid_search.fit(X, y) - assert_array_equal(grid_search.best_estimator_.classes_, - grid_search.classes_) - - # Test that regressors do not have a classes_ attribute - grid_search = GridSearchCV(Ridge(), {'alpha': [1.0, 2.0]}) - grid_search.fit(X, y) - assert_false(hasattr(grid_search, 'classes_')) diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py deleted file mode 100644 index d75e6bc82f6b3..0000000000000 --- a/sklearn/tests/test_learning_curve.py +++ /dev/null @@ -1,312 +0,0 @@ -# Author: Alexander Fabisch -# -# License: BSD 3 clause - -import sys -from sklearn.externals.six.moves import cStringIO as StringIO -import numpy as np -import warnings -from sklearn.base import BaseEstimator -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_false -from sklearn.datasets import make_classification -from sklearn.model_selection import KFold - -with warnings.catch_warnings(): - 
warnings.simplefilter('ignore') - from sklearn.learning_curve import learning_curve, validation_curve - -from sklearn.linear_model import PassiveAggressiveClassifier - - -class MockImprovingEstimator(BaseEstimator): - """Dummy classifier to test the learning curve""" - def __init__(self, n_max_train_sizes): - self.n_max_train_sizes = n_max_train_sizes - self.train_sizes = 0 - self.X_subset = None - - def fit(self, X_subset, y_subset=None): - self.X_subset = X_subset - self.train_sizes = X_subset.shape[0] - return self - - def predict(self, X): - raise NotImplementedError - - def score(self, X=None, Y=None): - # training score becomes worse (2 -> 1), test error better (0 -> 1) - if self._is_training_data(X): - return 2. - float(self.train_sizes) / self.n_max_train_sizes - else: - return float(self.train_sizes) / self.n_max_train_sizes - - def _is_training_data(self, X): - return X is self.X_subset - - -class MockIncrementalImprovingEstimator(MockImprovingEstimator): - """Dummy classifier that provides partial_fit""" - def __init__(self, n_max_train_sizes): - super(MockIncrementalImprovingEstimator, - self).__init__(n_max_train_sizes) - self.x = None - - def _is_training_data(self, X): - return self.x in X - - def partial_fit(self, X, y=None, **params): - self.train_sizes += X.shape[0] - self.x = X[0] - - -class MockEstimatorWithParameter(BaseEstimator): - """Dummy classifier to test the validation curve""" - def __init__(self, param=0.5): - self.X_subset = None - self.param = param - - def fit(self, X_subset, y_subset): - self.X_subset = X_subset - self.train_sizes = X_subset.shape[0] - return self - - def predict(self, X): - raise NotImplementedError - - def score(self, X=None, y=None): - return self.param if self._is_training_data(X) else 1 - self.param - - def _is_training_data(self, X): - return X is self.X_subset - - -class MockEstimatorFailing(BaseEstimator): - """Dummy classifier to test error_score in learning curve""" - def fit(self, X_subset, y_subset): - raise ValueError() - - def score(self, X=None, y=None): - return None - - -class MockEstimatorWithSingleFitCallAllowed(MockEstimatorWithParameter): - """Dummy classifier that disallows repeated calls of fit method""" - - def fit(self, X_subset, y_subset): - assert_false( - hasattr(self, 'fit_called_'), - 'fit is called the second time' - ) - self.fit_called_ = True - return super(type(self), self).fit(X_subset, y_subset) - - -def test_learning_curve(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - with warnings.catch_warnings(record=True) as w: - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10)) - if len(w) > 0: - raise RuntimeError("Unexpected warning: %r" % w[0].message) - assert_equal(train_scores.shape, (10, 3)) - assert_equal(test_scores.shape, (10, 3)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_unsupervised(): - X, _ = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y=None, cv=3, 
train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_verbose(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - train_sizes, train_scores, test_scores = \ - learning_curve(estimator, X, y, cv=3, verbose=1) - finally: - out = sys.stdout.getvalue() - sys.stdout.close() - sys.stdout = old_stdout - - assert("[learning_curve]" in out) - - -def test_learning_curve_error_score(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockEstimatorFailing() - _, _, test_scores = learning_curve(estimator, X, y, cv=3, error_score=0) - all_zeros = not np.any(test_scores) - assert(all_zeros) - - -def test_learning_curve_error_score_default_raise(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockEstimatorFailing() - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3) - - -def test_learning_curve_incremental_learning_not_possible(): - X, y = make_classification(n_samples=2, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - # The mockup does not have partial_fit() - estimator = MockImprovingEstimator(1) - assert_raises(ValueError, learning_curve, estimator, X, y, - exploit_incremental_learning=True) - - -def test_learning_curve_incremental_learning(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockIncrementalImprovingEstimator(20) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y, cv=3, exploit_incremental_learning=True, - train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_incremental_learning_unsupervised(): - X, _ = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockIncrementalImprovingEstimator(20) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y=None, cv=3, exploit_incremental_learning=True, - train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_learning_curve_batch_and_incremental_learning_are_equal(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - train_sizes = np.linspace(0.2, 1.0, 5) - estimator = PassiveAggressiveClassifier(max_iter=1, tol=None, - shuffle=False) - - train_sizes_inc, train_scores_inc, test_scores_inc = \ - learning_curve( - estimator, 
X, y, train_sizes=train_sizes, - cv=3, exploit_incremental_learning=True) - train_sizes_batch, train_scores_batch, test_scores_batch = \ - learning_curve( - estimator, X, y, cv=3, train_sizes=train_sizes, - exploit_incremental_learning=False) - - assert_array_equal(train_sizes_inc, train_sizes_batch) - assert_array_almost_equal(train_scores_inc.mean(axis=1), - train_scores_batch.mean(axis=1)) - assert_array_almost_equal(test_scores_inc.mean(axis=1), - test_scores_batch.mean(axis=1)) - - -def test_learning_curve_n_sample_range_out_of_bounds(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.0, 1.0]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.1, 1.1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 20]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[1, 21]) - - -def test_learning_curve_remove_duplicate_sample_sizes(): - X, y = make_classification(n_samples=3, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(2) - train_sizes, _, _ = assert_warns( - RuntimeWarning, learning_curve, estimator, X, y, cv=3, - train_sizes=np.linspace(0.33, 1.0, 3)) - assert_array_equal(train_sizes, [1, 2]) - - -def test_learning_curve_with_boolean_indices(): - X, y = make_classification(n_samples=30, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - estimator = MockImprovingEstimator(20) - cv = KFold(n=30, n_folds=3) - train_sizes, train_scores, test_scores = learning_curve( - estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10)) - assert_array_equal(train_sizes, np.linspace(2, 20, 10)) - assert_array_almost_equal(train_scores.mean(axis=1), - np.linspace(1.9, 1.0, 10)) - assert_array_almost_equal(test_scores.mean(axis=1), - np.linspace(0.1, 1.0, 10)) - - -def test_validation_curve(): - X, y = make_classification(n_samples=2, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - param_range = np.linspace(0, 1, 10) - with warnings.catch_warnings(record=True) as w: - train_scores, test_scores = validation_curve( - MockEstimatorWithParameter(), X, y, param_name="param", - param_range=param_range, cv=2 - ) - if len(w) > 0: - raise RuntimeError("Unexpected warning: %r" % w[0].message) - - assert_array_almost_equal(train_scores.mean(axis=1), param_range) - assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range) - - -def test_validation_curve_clone_estimator(): - X, y = make_classification(n_samples=2, n_features=1, n_informative=1, - n_redundant=0, n_classes=2, - n_clusters_per_class=1, random_state=0) - - param_range = np.linspace(1, 0, 10) - _, _ = validation_curve( - MockEstimatorWithSingleFitCallAllowed(), X, y, - param_name="param", param_range=param_range, cv=2 - ) From 776bba1248ebfa98edcf5eebbb35b5e6fa79ecd1 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 17:30:14 +0100 Subject: [PATCH 05/14] Remove gaussian_process --- sklearn/gaussian_process/__init__.py | 3 +- sklearn/gaussian_process/gaussian_process.py | 882 ------------------ 
.../tests/test_gaussian_process.py | 175 ---- sklearn/utils/estimator_checks.py | 4 +- 4 files changed, 2 insertions(+), 1062 deletions(-) delete mode 100644 sklearn/gaussian_process/gaussian_process.py delete mode 100644 sklearn/gaussian_process/tests/test_gaussian_process.py diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py index 48d9aa05aaf84..377f15795ee58 100644 --- a/sklearn/gaussian_process/__init__.py +++ b/sklearn/gaussian_process/__init__.py @@ -14,10 +14,9 @@ from .gpc import GaussianProcessClassifier from . import kernels -from .gaussian_process import GaussianProcess from . import correlation_models from . import regression_models -__all__ = ['GaussianProcess', 'correlation_models', 'regression_models', +__all__ = ['correlation_models', 'regression_models', 'GaussianProcessRegressor', 'GaussianProcessClassifier', 'kernels'] diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py deleted file mode 100644 index 8c7491e648d31..0000000000000 --- a/sklearn/gaussian_process/gaussian_process.py +++ /dev/null @@ -1,882 +0,0 @@ -# -*- coding: utf-8 -*- - -# Author: Vincent Dubourg -# (mostly translation, see implementation details) -# License: BSD 3 clause - -from __future__ import print_function - -import numpy as np -from scipy import linalg, optimize - -from ..base import BaseEstimator, RegressorMixin -from ..metrics.pairwise import manhattan_distances -from ..utils import check_random_state, check_array, check_X_y -from ..utils.validation import check_is_fitted -from . import regression_models as regression -from . import correlation_models as correlation -from ..utils import deprecated - -MACHINE_EPSILON = np.finfo(np.double).eps - - -@deprecated("l1_cross_distances was deprecated in version 0.18 " - "and will be removed in 0.20.") -def l1_cross_distances(X): - """ - Computes the nonzero componentwise L1 cross-distances between the vectors - in X. - - Parameters - ---------- - - X : array_like - An array with shape (n_samples, n_features) - - Returns - ------- - - D : array with shape (n_samples * (n_samples - 1) / 2, n_features) - The array of componentwise L1 cross-distances. - - ij : arrays with shape (n_samples * (n_samples - 1) / 2, 2) - The indices i and j of the vectors in X associated to the cross- - distances in D: D[k] = np.abs(X[ij[k, 0]] - Y[ij[k, 1]]). - """ - X = check_array(X) - n_samples, n_features = X.shape - n_nonzero_cross_dist = n_samples * (n_samples - 1) // 2 - ij = np.zeros((n_nonzero_cross_dist, 2), dtype=np.int) - D = np.zeros((n_nonzero_cross_dist, n_features)) - ll_1 = 0 - for k in range(n_samples - 1): - ll_0 = ll_1 - ll_1 = ll_0 + n_samples - k - 1 - ij[ll_0:ll_1, 0] = k - ij[ll_0:ll_1, 1] = np.arange(k + 1, n_samples) - D[ll_0:ll_1] = np.abs(X[k] - X[(k + 1):n_samples]) - - return D, ij - - -@deprecated("GaussianProcess was deprecated in version 0.18 and will be " - "removed in 0.20. Use the GaussianProcessRegressor instead.") -class GaussianProcess(BaseEstimator, RegressorMixin): - """The legacy Gaussian Process model class. - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use the :class:`GaussianProcessRegressor` instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - regr : string or callable, optional - A regression function returning an array of outputs of the linear - regression functional basis. The number of observations n_samples - should be greater than the size p of this basis. 
- Default assumes a simple constant regression trend. - Available built-in regression models are:: - - 'constant', 'linear', 'quadratic' - - corr : string or callable, optional - A stationary autocorrelation function returning the autocorrelation - between two points x and x'. - Default assumes a squared-exponential autocorrelation model. - Built-in correlation models are:: - - 'absolute_exponential', 'squared_exponential', - 'generalized_exponential', 'cubic', 'linear' - - beta0 : double array_like, optional - The regression weight vector to perform Ordinary Kriging (OK). - Default assumes Universal Kriging (UK) so that the vector beta of - regression weights is estimated using the maximum likelihood - principle. - - storage_mode : string, optional - A string specifying whether the Cholesky decomposition of the - correlation matrix should be stored in the class (storage_mode = - 'full') or not (storage_mode = 'light'). - Default assumes storage_mode = 'full', so that the - Cholesky decomposition of the correlation matrix is stored. - This might be a useful parameter when one is not interested in the - MSE and only plan to estimate the BLUP, for which the correlation - matrix is not required. - - verbose : boolean, optional - A boolean specifying the verbose level. - Default is verbose = False. - - theta0 : double array_like, optional - An array with shape (n_features, ) or (1, ). - The parameters in the autocorrelation model. - If thetaL and thetaU are also specified, theta0 is considered as - the starting point for the maximum likelihood estimation of the - best set of parameters. - Default assumes isotropic autocorrelation model with theta0 = 1e-1. - - thetaL : double array_like, optional - An array with shape matching theta0's. - Lower bound on the autocorrelation parameters for maximum - likelihood estimation. - Default is None, so that it skips maximum likelihood estimation and - it uses theta0. - - thetaU : double array_like, optional - An array with shape matching theta0's. - Upper bound on the autocorrelation parameters for maximum - likelihood estimation. - Default is None, so that it skips maximum likelihood estimation and - it uses theta0. - - normalize : boolean, optional - Input X and observations y are centered and reduced wrt - means and standard deviations estimated from the n_samples - observations provided. - Default is normalize = True so that data is normalized to ease - maximum likelihood estimation. - - nugget : double or ndarray, optional - Introduce a nugget effect to allow smooth predictions from noisy - data. If nugget is an ndarray, it must be the same length as the - number of data points used for the fit. - The nugget is added to the diagonal of the assumed training covariance; - in this way it acts as a Tikhonov regularization in the problem. In - the special case of the squared exponential correlation function, the - nugget mathematically represents the variance of the input values. - Default assumes a nugget close to machine precision for the sake of - robustness (nugget = 10. * MACHINE_EPSILON). - - optimizer : string, optional - A string specifying the optimization algorithm to be used. - Default uses 'fmin_cobyla' algorithm from scipy.optimize. - Available optimizers are:: - - 'fmin_cobyla', 'Welch' - - 'Welch' optimizer is dued to Welch et al., see reference [WBSWM1992]_. - It consists in iterating over several one-dimensional optimizations - instead of running one single multi-dimensional optimization. 
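
Schematically, that coordinate-wise scheme amounts to the following toy sketch (plain numpy, a grid search standing in for COBYLA, purely illustrative)::

    import numpy as np

    def coordinate_maximize(f, theta0, candidates, n_sweeps=2):
        # optimize one dimension at a time, holding the others fixed
        theta = np.asarray(theta0, dtype=float).copy()
        for _ in range(n_sweeps):
            for i in range(theta.size):
                trials = []
                for c in candidates:
                    t = theta.copy()
                    t[i] = c
                    trials.append((f(t), c))
                theta[i] = max(trials)[1]  # keep the best value found
        return theta

    f = lambda t: -(t[0] - 0.3) ** 2 - (t[1] - 0.7) ** 2
    print(coordinate_maximize(f, [1., 0.], np.linspace(0., 1., 11)))
    # -> [ 0.3  0.7], the maximizer of f
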
- - random_start : int, optional - The number of times the Maximum Likelihood Estimation should be - performed from a random starting point. - The first MLE always uses the specified starting point (theta0), - the next starting points are picked at random according to an - exponential distribution (log-uniform on [thetaL, thetaU]). - Default does not use random starting point (random_start = 1). - - random_state : int, RandomState instance or None, optional (default=None) - The generator used to shuffle the sequence of coordinates of theta in - the Welch optimizer. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is the - random number generator; If None, the random number generator is the - RandomState instance used by `np.random`. - - Attributes - ---------- - theta_ : array - Specified theta OR the best set of autocorrelation parameters (the \ - sought maximizer of the reduced likelihood function). - - reduced_likelihood_function_value_ : array - The optimal reduced likelihood function value. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.gaussian_process import GaussianProcess - >>> X = np.array([[1., 3., 5., 6., 7., 8.]]).T - >>> y = (X * np.sin(X)).ravel() - >>> gp = GaussianProcess(theta0=0.1, thetaL=.001, thetaU=1.) - >>> gp.fit(X, y) # doctest: +ELLIPSIS - GaussianProcess(beta0=None... - ... - - Notes - ----- - The presentation implementation is based on a translation of the DACE - Matlab toolbox, see reference [NLNS2002]_. - - References - ---------- - - .. [NLNS2002] `H.B. Nielsen, S.N. Lophaven, H. B. Nielsen and J. - Sondergaard. DACE - A MATLAB Kriging Toolbox.` (2002) - http://imedea.uib-csic.es/master/cambioglobal/Modulo_V_cod101615/Lab/lab_maps/krigging/DACE-krigingsoft/dace/dace.pdf - - .. [WBSWM1992] `W.J. Welch, R.J. Buck, J. Sacks, H.P. Wynn, T.J. Mitchell, - and M.D. Morris (1992). Screening, predicting, and computer - experiments. Technometrics, 34(1) 15--25.` - http://www.jstor.org/stable/1269548 - """ - - _regression_types = { - 'constant': regression.constant, - 'linear': regression.linear, - 'quadratic': regression.quadratic} - - _correlation_types = { - 'absolute_exponential': correlation.absolute_exponential, - 'squared_exponential': correlation.squared_exponential, - 'generalized_exponential': correlation.generalized_exponential, - 'cubic': correlation.cubic, - 'linear': correlation.linear} - - _optimizer_types = [ - 'fmin_cobyla', - 'Welch'] - - def __init__(self, regr='constant', corr='squared_exponential', beta0=None, - storage_mode='full', verbose=False, theta0=1e-1, - thetaL=None, thetaU=None, optimizer='fmin_cobyla', - random_start=1, normalize=True, - nugget=10. * MACHINE_EPSILON, random_state=None): - - self.regr = regr - self.corr = corr - self.beta0 = beta0 - self.storage_mode = storage_mode - self.verbose = verbose - self.theta0 = theta0 - self.thetaL = thetaL - self.thetaU = thetaU - self.normalize = normalize - self.nugget = nugget - self.optimizer = optimizer - self.random_start = random_start - self.random_state = random_state - - def fit(self, X, y): - """ - The Gaussian Process model fitting method. - - Parameters - ---------- - X : double array_like - An array with shape (n_samples, n_features) with the input at which - observations were made. - - y : double array_like - An array with shape (n_samples, ) or shape (n_samples, n_targets) - with the observations of the output to be predicted. 
- - Returns - ------- - gp : self - A fitted Gaussian Process model object awaiting data to perform - predictions. - """ - # Run input checks - self._check_params() - - self.random_state = check_random_state(self.random_state) - - # Force data to 2D numpy.array - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) - self.y_ndim_ = y.ndim - if y.ndim == 1: - y = y[:, np.newaxis] - - # Check shapes of DOE & observations - n_samples, n_features = X.shape - _, n_targets = y.shape - - # Run input checks - self._check_params(n_samples) - - # Normalize data or don't - if self.normalize: - X_mean = np.mean(X, axis=0) - X_std = np.std(X, axis=0) - y_mean = np.mean(y, axis=0) - y_std = np.std(y, axis=0) - X_std[X_std == 0.] = 1. - y_std[y_std == 0.] = 1. - # center and scale X if necessary - X = (X - X_mean) / X_std - y = (y - y_mean) / y_std - else: - X_mean = np.zeros(1) - X_std = np.ones(1) - y_mean = np.zeros(1) - y_std = np.ones(1) - - # Calculate matrix of distances D between samples - D, ij = l1_cross_distances(X) - if (np.min(np.sum(D, axis=1)) == 0. - and self.corr != correlation.pure_nugget): - raise Exception("Multiple input features cannot have the same" - " target value.") - - # Regression matrix and parameters - F = self.regr(X) - n_samples_F = F.shape[0] - if F.ndim > 1: - p = F.shape[1] - else: - p = 1 - if n_samples_F != n_samples: - raise Exception("Number of rows in F and X do not match. Most " - "likely something is going wrong with the " - "regression model.") - if p > n_samples_F: - raise Exception(("Ordinary least squares problem is undetermined " - "n_samples=%d must be greater than the " - "regression model size p=%d.") % (n_samples, p)) - if self.beta0 is not None: - if self.beta0.shape[0] != p: - raise Exception("Shapes of beta0 and F do not match.") - - # Set attributes - self.X = X - self.y = y - self.D = D - self.ij = ij - self.F = F - self.X_mean, self.X_std = X_mean, X_std - self.y_mean, self.y_std = y_mean, y_std - - # Determine Gaussian Process model parameters - if self.thetaL is not None and self.thetaU is not None: - # Maximum Likelihood Estimation of the parameters - if self.verbose: - print("Performing Maximum Likelihood Estimation of the " - "autocorrelation parameters...") - self.theta_, self.reduced_likelihood_function_value_, par = \ - self._arg_max_reduced_likelihood_function() - if np.isinf(self.reduced_likelihood_function_value_): - raise Exception("Bad parameter region. " - "Try increasing upper bound") - - else: - # Given parameters - if self.verbose: - print("Given autocorrelation parameters. " - "Computing Gaussian Process model parameters...") - self.theta_ = self.theta0 - self.reduced_likelihood_function_value_, par = \ - self.reduced_likelihood_function() - if np.isinf(self.reduced_likelihood_function_value_): - raise Exception("Bad point. Try increasing theta0.") - - self.beta = par['beta'] - self.gamma = par['gamma'] - self.sigma2 = par['sigma2'] - self.C = par['C'] - self.Ft = par['Ft'] - self.G = par['G'] - - if self.storage_mode == 'light': - # Delete heavy data (it will be computed again if required) - # (it is required only when MSE is wanted in self.predict) - if self.verbose: - print("Light storage mode specified. " - "Flushing autocorrelation matrix...") - self.D = None - self.ij = None - self.F = None - self.C = None - self.Ft = None - self.G = None - - return self - - def predict(self, X, eval_MSE=False, batch_size=None): - """ - This function evaluates the Gaussian Process model at x. 
- - Parameters - ---------- - X : array_like - An array with shape (n_eval, n_features) giving the point(s) at - which the prediction(s) should be made. - - eval_MSE : boolean, optional - A boolean specifying whether the Mean Squared Error should be - evaluated or not. - Default assumes evalMSE = False and evaluates only the BLUP (mean - prediction). - - batch_size : integer, optional - An integer giving the maximum number of points that can be - evaluated simultaneously (depending on the available memory). - Default is None so that all given points are evaluated at the same - time. - - Returns - ------- - y : array_like, shape (n_samples, ) or (n_samples, n_targets) - An array with shape (n_eval, ) if the Gaussian Process was trained - on an array of shape (n_samples, ) or an array with shape - (n_eval, n_targets) if the Gaussian Process was trained on an array - of shape (n_samples, n_targets) with the Best Linear Unbiased - Prediction at x. - - MSE : array_like, optional (if eval_MSE == True) - An array with shape (n_eval, ) or (n_eval, n_targets) as with y, - with the Mean Squared Error at x. - """ - check_is_fitted(self, "X") - - # Check input shapes - X = check_array(X) - n_eval, _ = X.shape - n_samples, n_features = self.X.shape - n_samples_y, n_targets = self.y.shape - - # Run input checks - self._check_params(n_samples) - - if X.shape[1] != n_features: - raise ValueError(("The number of features in X (X.shape[1] = %d) " - "should match the number of features used " - "for fit() " - "which is %d.") % (X.shape[1], n_features)) - - if batch_size is None: - # No memory management - # (evaluates all given points in a single batch run) - - # Normalize input - X = (X - self.X_mean) / self.X_std - - # Get pairwise componentwise L1-distances to the input training set - dx = manhattan_distances(X, Y=self.X, sum_over_features=False) - # Get regression function and correlation - f = self.regr(X) - r = self.corr(self.theta_, dx).reshape(n_eval, n_samples) - - # Scaled predictor - y_ = np.dot(f, self.beta) + np.dot(r, self.gamma) - - # Predictor - y = (self.y_mean + self.y_std * y_).reshape(n_eval, n_targets) - - if self.y_ndim_ == 1: - y = y.ravel() - - # Mean Squared Error - if eval_MSE: - C = self.C - if C is None: - # Light storage mode (need to recompute C, F, Ft and G) - if self.verbose: - print("This GaussianProcess used 'light' storage mode " - "at instantiation. Need to recompute " - "autocorrelation matrix...") - reduced_likelihood_function_value, par = \ - self.reduced_likelihood_function() - self.C = par['C'] - self.Ft = par['Ft'] - self.G = par['G'] - - rt = linalg.solve_triangular(self.C, r.T, lower=True) - - if self.beta0 is None: - # Universal Kriging - u = linalg.solve_triangular(self.G.T, - np.dot(self.Ft.T, rt) - f.T, - lower=True) - else: - # Ordinary Kriging - u = np.zeros((n_targets, n_eval)) - - MSE = np.dot(self.sigma2.reshape(n_targets, 1), - (1. - (rt ** 2.).sum(axis=0) - + (u ** 2.).sum(axis=0))[np.newaxis, :]) - MSE = np.sqrt((MSE ** 2.).sum(axis=0) / n_targets) - - # Mean Squared Error might be slightly negative depending on - # machine precision: force to zero! - MSE[MSE < 0.] = 0. 
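
For reference, the predictor and error estimate computed above are the standard BLUP formulas, written in the notation of this code::

    y_hat(x) = f(x) . beta + r(x) . gamma
    MSE(x)   = sigma2 * (1 - ||rt||^2 + ||u||^2),  with  rt = C^{-1} r(x)
    u        = G^{-T} (Ft^T rt - f(x))   (Universal Kriging; u = 0 otherwise)

The final clipping only guards against small negative values caused by floating-point round-off.
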
- - if self.y_ndim_ == 1: - MSE = MSE.ravel() - - return y, MSE - - else: - - return y - - else: - # Memory management - - if type(batch_size) is not int or batch_size <= 0: - raise Exception("batch_size must be a positive integer") - - if eval_MSE: - - y, MSE = np.zeros(n_eval), np.zeros(n_eval) - for k in range(max(1, int(n_eval / batch_size))): - batch_from = k * batch_size - batch_to = min([(k + 1) * batch_size + 1, n_eval + 1]) - y[batch_from:batch_to], MSE[batch_from:batch_to] = \ - self.predict(X[batch_from:batch_to], - eval_MSE=eval_MSE, batch_size=None) - - return y, MSE - - else: - - y = np.zeros(n_eval) - for k in range(max(1, int(n_eval / batch_size))): - batch_from = k * batch_size - batch_to = min([(k + 1) * batch_size + 1, n_eval + 1]) - y[batch_from:batch_to] = \ - self.predict(X[batch_from:batch_to], - eval_MSE=eval_MSE, batch_size=None) - - return y - - def reduced_likelihood_function(self, theta=None): - """ - This function determines the BLUP parameters and evaluates the reduced - likelihood function for the given autocorrelation parameters theta. - - Maximizing this function wrt the autocorrelation parameters theta is - equivalent to maximizing the likelihood of the assumed joint Gaussian - distribution of the observations y evaluated onto the design of - experiments X. - - Parameters - ---------- - theta : array_like, optional - An array containing the autocorrelation parameters at which the - Gaussian Process model parameters should be determined. - Default uses the built-in autocorrelation parameters - (ie ``theta = self.theta_``). - - Returns - ------- - reduced_likelihood_function_value : double - The value of the reduced likelihood function associated to the - given autocorrelation parameters theta. - - par : dict - A dictionary containing the requested Gaussian Process model - parameters: - - - ``sigma2`` is the Gaussian Process variance. - - ``beta`` is the generalized least-squares regression weights for - Universal Kriging or given beta0 for Ordinary Kriging. - - ``gamma`` is the Gaussian Process weights. - - ``C`` is the Cholesky decomposition of the correlation - matrix [R]. - - ``Ft`` is the solution of the linear equation system - [R] x Ft = F - - ``G`` is the QR decomposition of the matrix Ft. - """ - check_is_fitted(self, "X") - - if theta is None: - # Use built-in autocorrelation parameters - theta = self.theta_ - - # Initialize output - reduced_likelihood_function_value = - np.inf - par = {} - - # Retrieve data - n_samples = self.X.shape[0] - D = self.D - ij = self.ij - F = self.F - - if D is None: - # Light storage mode (need to recompute D, ij and F) - D, ij = l1_cross_distances(self.X) - if (np.min(np.sum(D, axis=1)) == 0. - and self.corr != correlation.pure_nugget): - raise Exception("Multiple X are not allowed") - F = self.regr(self.X) - - # Set up R - r = self.corr(theta, D) - R = np.eye(n_samples) * (1. + self.nugget) - R[ij[:, 0], ij[:, 1]] = r - R[ij[:, 1], ij[:, 0]] = r - - # Cholesky decomposition of R - try: - C = linalg.cholesky(R, lower=True) - except linalg.LinAlgError: - return reduced_likelihood_function_value, par - - # Get generalized least squares solution - Ft = linalg.solve_triangular(C, F, lower=True) - Q, G = linalg.qr(Ft, mode='economic') - - sv = linalg.svd(G, compute_uv=False) - rcondG = sv[-1] / sv[0] - if rcondG < 1e-10: - # Check F - sv = linalg.svd(F, compute_uv=False) - condF = sv[0] / sv[-1] - if condF > 1e15: - raise Exception("F is too ill conditioned. 
Poor combination " - "of regression model and observations.") - else: - # Ft is too ill conditioned, get out (try different theta) - return reduced_likelihood_function_value, par - - Yt = linalg.solve_triangular(C, self.y, lower=True) - if self.beta0 is None: - # Universal Kriging - beta = linalg.solve_triangular(G, np.dot(Q.T, Yt)) - else: - # Ordinary Kriging - beta = np.array(self.beta0) - - rho = Yt - np.dot(Ft, beta) - sigma2 = (rho ** 2.).sum(axis=0) / n_samples - # The determinant of R is equal to the squared product of the diagonal - # elements of its Cholesky decomposition C - detR = (np.diag(C) ** (2. / n_samples)).prod() - - # Compute/Organize output - reduced_likelihood_function_value = - sigma2.sum() * detR - par['sigma2'] = sigma2 * self.y_std ** 2. - par['beta'] = beta - par['gamma'] = linalg.solve_triangular(C.T, rho) - par['C'] = C - par['Ft'] = Ft - par['G'] = G - - return reduced_likelihood_function_value, par - - def _arg_max_reduced_likelihood_function(self): - """ - This function estimates the autocorrelation parameters theta as the - maximizer of the reduced likelihood function. - (Minimization of the opposite reduced likelihood function is used for - convenience) - - Parameters - ---------- - self : All parameters are stored in the Gaussian Process model object. - - Returns - ------- - optimal_theta : array_like - The best set of autocorrelation parameters (the sought maximizer of - the reduced likelihood function). - - optimal_reduced_likelihood_function_value : double - The optimal reduced likelihood function value. - - optimal_par : dict - The BLUP parameters associated to thetaOpt. - """ - - # Initialize output - best_optimal_theta = [] - best_optimal_rlf_value = [] - best_optimal_par = [] - - if self.verbose: - print("The chosen optimizer is: " + str(self.optimizer)) - if self.random_start > 1: - print(str(self.random_start) + " random starts are required.") - - percent_completed = 0. - - # Force optimizer to fmin_cobyla if the model is meant to be isotropic - if self.optimizer == 'Welch' and self.theta0.size == 1: - self.optimizer = 'fmin_cobyla' - - if self.optimizer == 'fmin_cobyla': - - def minus_reduced_likelihood_function(log10t): - return - self.reduced_likelihood_function( - theta=10. ** log10t)[0] - - constraints = [] - for i in range(self.theta0.size): - constraints.append(lambda log10t, i=i: - log10t[i] - np.log10(self.thetaL[0, i])) - constraints.append(lambda log10t, i=i: - np.log10(self.thetaU[0, i]) - log10t[i]) - - for k in range(self.random_start): - - if k == 0: - # Use specified starting point as first guess - theta0 = self.theta0 - else: - # Generate a random starting point log10-uniformly - # distributed between bounds - log10theta0 = (np.log10(self.thetaL) - + self.random_state.rand(*self.theta0.shape) - * np.log10(self.thetaU / self.thetaL)) - theta0 = 10. ** log10theta0 - - # Run Cobyla - try: - log10_optimal_theta = \ - optimize.fmin_cobyla(minus_reduced_likelihood_function, - np.log10(theta0).ravel(), - constraints, disp=0) - except ValueError as ve: - print("Optimization failed. Try increasing the ``nugget``") - raise ve - - optimal_theta = 10. 
** log10_optimal_theta - optimal_rlf_value, optimal_par = \ - self.reduced_likelihood_function(theta=optimal_theta) - - # Compare the new optimizer to the best previous one - if k > 0: - if optimal_rlf_value > best_optimal_rlf_value: - best_optimal_rlf_value = optimal_rlf_value - best_optimal_par = optimal_par - best_optimal_theta = optimal_theta - else: - best_optimal_rlf_value = optimal_rlf_value - best_optimal_par = optimal_par - best_optimal_theta = optimal_theta - if self.verbose and self.random_start > 1: - if (20 * k) / self.random_start > percent_completed: - percent_completed = (20 * k) / self.random_start - print("%s completed" % (5 * percent_completed)) - - optimal_rlf_value = best_optimal_rlf_value - optimal_par = best_optimal_par - optimal_theta = best_optimal_theta - - elif self.optimizer == 'Welch': - - # Backup of the given attributes - theta0, thetaL, thetaU = self.theta0, self.thetaL, self.thetaU - corr = self.corr - verbose = self.verbose - - # This will iterate over fmin_cobyla optimizer - self.optimizer = 'fmin_cobyla' - self.verbose = False - - # Initialize under isotropy assumption - if verbose: - print("Initialize under isotropy assumption...") - self.theta0 = check_array(self.theta0.min()) - self.thetaL = check_array(self.thetaL.min()) - self.thetaU = check_array(self.thetaU.max()) - theta_iso, optimal_rlf_value_iso, par_iso = \ - self._arg_max_reduced_likelihood_function() - optimal_theta = theta_iso + np.zeros(theta0.shape) - - # Iterate over all dimensions of theta allowing for anisotropy - if verbose: - print("Now improving allowing for anisotropy...") - for i in self.random_state.permutation(theta0.size): - if verbose: - print("Proceeding along dimension %d..." % (i + 1)) - self.theta0 = check_array(theta_iso) - self.thetaL = check_array(thetaL[0, i]) - self.thetaU = check_array(thetaU[0, i]) - - def corr_cut(t, d): - return corr(check_array(np.hstack([optimal_theta[0][0:i], - t[0], - optimal_theta[0][(i + - 1)::]])), - d) - - self.corr = corr_cut - optimal_theta[0, i], optimal_rlf_value, optimal_par = \ - self._arg_max_reduced_likelihood_function() - - # Restore the given attributes - self.theta0, self.thetaL, self.thetaU = theta0, thetaL, thetaU - self.corr = corr - self.optimizer = 'Welch' - self.verbose = verbose - - else: - - raise NotImplementedError("This optimizer ('%s') is not " - "implemented yet. Please contribute!" - % self.optimizer) - - return optimal_theta, optimal_rlf_value, optimal_par - - def _check_params(self, n_samples=None): - - # Check regression model - if not callable(self.regr): - if self.regr in self._regression_types: - self.regr = self._regression_types[self.regr] - else: - raise ValueError("regr should be one of %s or callable, " - "%s was given." - % (self._regression_types.keys(), self.regr)) - - # Check regression weights if given (Ordinary Kriging) - if self.beta0 is not None: - self.beta0 = np.atleast_2d(self.beta0) - if self.beta0.shape[1] != 1: - # Force to column vector - self.beta0 = self.beta0.T - - # Check correlation model - if not callable(self.corr): - if self.corr in self._correlation_types: - self.corr = self._correlation_types[self.corr] - else: - raise ValueError("corr should be one of %s or callable, " - "%s was given." - % (self._correlation_types.keys(), self.corr)) - - # Check storage mode - if self.storage_mode != 'full' and self.storage_mode != 'light': - raise ValueError("Storage mode should either be 'full' or " - "'light', %s was given." 
% self.storage_mode) - - # Check correlation parameters - self.theta0 = np.atleast_2d(self.theta0) - lth = self.theta0.size - - if self.thetaL is not None and self.thetaU is not None: - self.thetaL = np.atleast_2d(self.thetaL) - self.thetaU = np.atleast_2d(self.thetaU) - if self.thetaL.size != lth or self.thetaU.size != lth: - raise ValueError("theta0, thetaL and thetaU must have the " - "same length.") - if np.any(self.thetaL <= 0) or np.any(self.thetaU < self.thetaL): - raise ValueError("The bounds must satisfy O < thetaL <= " - "thetaU.") - - elif self.thetaL is None and self.thetaU is None: - if np.any(self.theta0 <= 0): - raise ValueError("theta0 must be strictly positive.") - - elif self.thetaL is None or self.thetaU is None: - raise ValueError("thetaL and thetaU should either be both or " - "neither specified.") - - # Force verbose type to bool - self.verbose = bool(self.verbose) - - # Force normalize type to bool - self.normalize = bool(self.normalize) - - # Check nugget value - self.nugget = np.asarray(self.nugget) - if np.any(self.nugget) < 0.: - raise ValueError("nugget must be positive or zero.") - if (n_samples is not None - and self.nugget.shape not in [(), (n_samples,)]): - raise ValueError("nugget must be either a scalar " - "or array of length n_samples.") - - # Check optimizer - if self.optimizer not in self._optimizer_types: - raise ValueError("optimizer should be one of %s" - % self._optimizer_types) - - # Force random_start type to int - self.random_start = int(self.random_start) diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py deleted file mode 100644 index 37d872fc99fb5..0000000000000 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ /dev/null @@ -1,175 +0,0 @@ -""" -Testing for Gaussian Process module (sklearn.gaussian_process) -""" - -# Author: Vincent Dubourg -# License: BSD 3 clause - -import numpy as np - -from sklearn.gaussian_process import GaussianProcess -from sklearn.gaussian_process import regression_models as regression -from sklearn.gaussian_process import correlation_models as correlation -from sklearn.datasets import make_regression -from sklearn.utils.testing import assert_greater, assert_true, assert_raises - - -f = lambda x: x * np.sin(x) -X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T -X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T -y = f(X).ravel() - - -def test_1d(regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): - # MLE estimation of a one-dimensional Gaussian Process model. - # Check random start optimization. - # Test the interpolating property. - gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=1e-2, thetaL=1e-4, thetaU=1e-1, - random_start=random_start, verbose=False).fit(X, y) - y_pred, MSE = gp.predict(X, eval_MSE=True) - y2_pred, MSE2 = gp.predict(X2, eval_MSE=True) - - assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.) - and np.allclose(MSE2, 0., atol=10)) - - -def test_2d(regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): - # MLE estimation of a two-dimensional Gaussian Process model accounting for - # anisotropy. Check random start optimization. - # Test the interpolating property. - b, kappa, e = 5., .5, .1 - g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. 
- X = np.array([[-4.61611719, -6.00099547], - [4.10469096, 5.32782448], - [0.00000000, -0.50000000], - [-6.17289014, -4.6984743], - [1.3109306, -6.93271427], - [-5.03823144, 3.10584743], - [-2.87600388, 6.74310541], - [5.21301203, 4.26386883]]) - y = g(X).ravel() - - thetaL = [1e-4] * 2 - thetaU = [1e-1] * 2 - gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=[1e-2] * 2, thetaL=thetaL, - thetaU=thetaU, - random_start=random_start, verbose=False) - gp.fit(X, y) - y_pred, MSE = gp.predict(X, eval_MSE=True) - - assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)) - - eps = np.finfo(gp.theta_.dtype).eps - assert_true(np.all(gp.theta_ >= thetaL - eps)) # Lower bounds of hyperparameters - assert_true(np.all(gp.theta_ <= thetaU + eps)) # Upper bounds of hyperparameters - - -def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): - # MLE estimation of a two-dimensional Gaussian Process model accounting for - # anisotropy. Check random start optimization. - # Test the GP interpolation for 2D output - b, kappa, e = 5., .5, .1 - g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. - f = lambda x: np.vstack((g(x), g(x))).T - X = np.array([[-4.61611719, -6.00099547], - [4.10469096, 5.32782448], - [0.00000000, -0.50000000], - [-6.17289014, -4.6984743], - [1.3109306, -6.93271427], - [-5.03823144, 3.10584743], - [-2.87600388, 6.74310541], - [5.21301203, 4.26386883]]) - y = f(X) - gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=[1e-2] * 2, thetaL=[1e-4] * 2, - thetaU=[1e-1] * 2, - random_start=random_start, verbose=False) - gp.fit(X, y) - y_pred, MSE = gp.predict(X, eval_MSE=True) - - assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)) - - -def test_wrong_number_of_outputs(): - gp = GaussianProcess() - assert_raises(ValueError, gp.fit, [[1, 2, 3], [4, 5, 6]], [1, 2, 3]) - - -def test_more_builtin_correlation_models(random_start=1): - # Repeat test_1d and test_2d for several built-in correlation - # models specified as strings. - all_corr = ['absolute_exponential', 'squared_exponential', 'cubic', - 'linear'] - - for corr in all_corr: - test_1d(regr='constant', corr=corr, random_start=random_start) - test_2d(regr='constant', corr=corr, random_start=random_start) - test_2d_2d(regr='constant', corr=corr, random_start=random_start) - - -def test_ordinary_kriging(): - # Repeat test_1d and test_2d with given regression weights (beta0) for - # different regression models (Ordinary Kriging). - test_1d(regr='linear', beta0=[0., 0.5]) - test_1d(regr='quadratic', beta0=[0., 0.5, 0.5]) - test_2d(regr='linear', beta0=[0., 0.5, 0.5]) - test_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) - test_2d_2d(regr='linear', beta0=[0., 0.5, 0.5]) - test_2d_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) - - -def test_no_normalize(): - gp = GaussianProcess(normalize=False).fit(X, y) - y_pred = gp.predict(X) - assert_true(np.allclose(y_pred, y)) - - -def test_batch_size(): - # TypeError when using batch_size on Python 3, see - # https://github.com/scikit-learn/scikit-learn/issues/7329 for more - # details - gp = GaussianProcess() - gp.fit(X, y) - gp.predict(X, batch_size=1) - gp.predict(X, batch_size=1, eval_MSE=True) - - -def test_random_starts(): - # Test that an increasing number of random-starts of GP fitting only - # increases the reduced likelihood function of the optimal theta. 
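
The additional starting points are drawn log-uniformly between ``thetaL`` and ``thetaU`` (see the deleted ``fit`` code above), so with a fixed seed each extra restart can only improve the best reduced likelihood found. Drawing such starts is simply::

    import numpy as np

    rng = np.random.RandomState(0)
    thetaL, thetaU = 1e-4, 1e+1
    theta0 = 10. ** (np.log10(thetaL)
                     + rng.rand(5) * np.log10(thetaU / thetaL))
    assert np.all((theta0 >= thetaL) & (theta0 <= thetaU))
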
- n_samples, n_features = 50, 3 - rng = np.random.RandomState(0) - X = rng.randn(n_samples, n_features) * 2 - 1 - y = np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1) - best_likelihood = -np.inf - for random_start in range(1, 5): - gp = GaussianProcess(regr="constant", corr="squared_exponential", - theta0=[1e-0] * n_features, - thetaL=[1e-4] * n_features, - thetaU=[1e+1] * n_features, - random_start=random_start, random_state=0, - verbose=False).fit(X, y) - rlf = gp.reduced_likelihood_function()[0] - assert_greater(rlf, best_likelihood - np.finfo(np.float32).eps) - best_likelihood = rlf - - -def test_mse_solving(): - # test the MSE estimate to be sane. - # non-regression test for ignoring off-diagonals of feature covariance, - # testing with nugget that renders covariance useless, only - # using the mean function, with low effective rank of data - gp = GaussianProcess(corr='absolute_exponential', theta0=1e-4, - thetaL=1e-12, thetaU=1e-2, nugget=1e-2, - optimizer='Welch', regr="linear", random_state=0) - - X, y = make_regression(n_informative=3, n_features=60, noise=50, - random_state=0, effective_rank=1) - - gp.fit(X, y) - assert_greater(1000, gp.predict(X, eval_MSE=True)[1].mean()) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fdbecc358be35..708fb8030de38 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -223,9 +223,7 @@ def _yield_all_checks(name, estimator): for check in _yield_clustering_checks(name, estimator): yield check yield check_fit2d_predict1d - if name != 'GaussianProcess': # FIXME - # XXX GaussianProcess deprecated in 0.20 - yield check_fit2d_1sample + yield check_fit2d_1sample yield check_fit2d_1feature yield check_fit1d yield check_get_params_invariance From 59e3f7d61852e255efdeaea16f088bffd293f7a2 Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 17:59:44 +0100 Subject: [PATCH 06/14] remove code to be removed in 0.19 --- sklearn/multioutput.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 5b4389fd0f31b..1e0285db2f737 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -247,43 +247,6 @@ def partial_fit(self, X, y, sample_weight=None): super(MultiOutputRegressor, self).partial_fit( X, y, sample_weight=sample_weight) - def score(self, X, y, sample_weight=None): - """Returns the coefficient of determination R^2 of the prediction. - - The coefficient R^2 is defined as (1 - u/v), where u is the residual - sum of squares ((y_true - y_pred) ** 2).sum() and v is the regression - sum of squares ((y_true - y_true.mean()) ** 2).sum(). - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). A constant model that always - predicts the expected value of y, disregarding the input features, - would get a R^2 score of 0.0. - - Notes - ----- - R^2 is calculated by weighting all the targets equally using - `multioutput='uniform_average'`. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Test samples. - - y : array-like, shape (n_samples) or (n_samples, n_outputs) - True values for X. - - sample_weight : array-like, shape [n_samples], optional - Sample weights. - - Returns - ------- - score : float - R^2 of self.predict(X) wrt. y. 
- """ - # XXX remove in 0.19 when r2_score default for multioutput changes - from .metrics import r2_score - return r2_score(y, self.predict(X), sample_weight=sample_weight, - multioutput='uniform_average') - class MultiOutputClassifier(MultiOutputEstimator, ClassifierMixin): """Multi target classification From 2ec39c0b8ff31441d575055fc28095b0f11698bd Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Thu, 9 Nov 2017 18:04:39 +0100 Subject: [PATCH 07/14] remove ransac's residual_metric --- sklearn/linear_model/ransac.py | 32 ++------------------ sklearn/linear_model/tests/test_ransac.py | 33 -------------------- sklearn/multioutput.py | 37 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 62 deletions(-) diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index fa3923dbebb14..322f9923b4925 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -135,17 +135,6 @@ class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin): as 0.99 (the default) and e is the current fraction of inliers w.r.t. the total number of samples. - residual_metric : callable, optional - Metric to reduce the dimensionality of the residuals to 1 for - multi-dimensional target values ``y.shape[1] > 1``. By default the sum - of absolute differences is used:: - - lambda dy: np.sum(np.abs(dy), axis=1) - - .. deprecated:: 0.18 - ``residual_metric`` is deprecated from 0.18 and will be removed in - 0.20. Use ``loss`` instead. - loss : string, callable, optional, default "absolute_loss" String inputs, "absolute_loss" and "squared_loss" are supported which find the absolute loss and squared loss per sample @@ -205,8 +194,8 @@ def __init__(self, base_estimator=None, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, - stop_probability=0.99, residual_metric=None, - loss='absolute_loss', random_state=None): + stop_probability=0.99, loss='absolute_loss', + random_state=None): self.base_estimator = base_estimator self.min_samples = min_samples @@ -218,7 +207,6 @@ def __init__(self, base_estimator=None, min_samples=None, self.stop_n_inliers = stop_n_inliers self.stop_score = stop_score self.stop_probability = stop_probability - self.residual_metric = residual_metric self.random_state = random_state self.loss = loss @@ -281,12 +269,6 @@ def fit(self, X, y, sample_weight=None): else: residual_threshold = self.residual_threshold - if self.residual_metric is not None: - warnings.warn( - "'residual_metric' was deprecated in version 0.18 and " - "will be removed in version 0.20. 
Use 'loss' instead.", - DeprecationWarning) - if self.loss == "absolute_loss": if y.ndim == 1: loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred) @@ -379,15 +361,7 @@ def fit(self, X, y, sample_weight=None): # residuals of all data for current random sample model y_pred = base_estimator.predict(X) - - # XXX: Deprecation: Remove this if block in 0.20 - if self.residual_metric is not None: - diff = y_pred - y - if diff.ndim == 1: - diff = diff.reshape(-1, 1) - residuals_subset = self.residual_metric(diff) - else: - residuals_subset = loss_function(y, y_pred) + residuals_subset = loss_function(y, y_pred) # classify data into inliers and outliers inlier_mask_subset = residuals_subset < residual_threshold diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 6f8e716f9ad19..176d3348246be 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -352,39 +352,6 @@ def test_ransac_multi_dimensional_targets(): assert_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) -# XXX: Remove in 0.20 -def test_ransac_residual_metric(): - residual_metric1 = lambda dy: np.sum(np.abs(dy), axis=1) - residual_metric2 = lambda dy: np.sum(dy ** 2, axis=1) - - yyy = np.column_stack([y, y, y]) - - base_estimator = LinearRegression() - ransac_estimator0 = RANSACRegressor(base_estimator, min_samples=2, - residual_threshold=5, random_state=0) - ransac_estimator1 = RANSACRegressor(base_estimator, min_samples=2, - residual_threshold=5, random_state=0, - residual_metric=residual_metric1) - ransac_estimator2 = RANSACRegressor(base_estimator, min_samples=2, - residual_threshold=5, random_state=0, - residual_metric=residual_metric2) - - # multi-dimensional - ransac_estimator0.fit(X, yyy) - assert_warns(DeprecationWarning, ransac_estimator1.fit, X, yyy) - assert_warns(DeprecationWarning, ransac_estimator2.fit, X, yyy) - assert_array_almost_equal(ransac_estimator0.predict(X), - ransac_estimator1.predict(X)) - assert_array_almost_equal(ransac_estimator0.predict(X), - ransac_estimator2.predict(X)) - - # one-dimensional - ransac_estimator0.fit(X, y) - assert_warns(DeprecationWarning, ransac_estimator2.fit, X, y) - assert_array_almost_equal(ransac_estimator0.predict(X), - ransac_estimator2.predict(X)) - - def test_ransac_residual_loss(): loss_multi1 = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1) loss_multi2 = lambda y_true, y_pred: np.sum((y_true - y_pred) ** 2, axis=1) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 1e0285db2f737..5b4389fd0f31b 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -247,6 +247,43 @@ def partial_fit(self, X, y, sample_weight=None): super(MultiOutputRegressor, self).partial_fit( X, y, sample_weight=sample_weight) + def score(self, X, y, sample_weight=None): + """Returns the coefficient of determination R^2 of the prediction. + + The coefficient R^2 is defined as (1 - u/v), where u is the residual + sum of squares ((y_true - y_pred) ** 2).sum() and v is the regression + sum of squares ((y_true - y_true.mean()) ** 2).sum(). + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a R^2 score of 0.0. + + Notes + ----- + R^2 is calculated by weighting all the targets equally using + `multioutput='uniform_average'`. 
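A migration sketch for the ``residual_metric`` keyword removed above. The lambdas mirror the defaults documented by the old code, and ``LinearRegression`` stands in for any base estimator, as in the deleted test::

    import numpy as np
    from sklearn.linear_model import LinearRegression, RANSACRegressor

    base_estimator = LinearRegression()

    # Before (deprecated since 0.18):
    #   RANSACRegressor(base_estimator, min_samples=2,
    #                   residual_metric=lambda dy: np.sum(np.abs(dy), axis=1))

    # After: either a built-in loss string ...
    ransac = RANSACRegressor(base_estimator, min_samples=2,
                             loss='absolute_loss')
    # ... or a callable taking (y_true, y_pred), as in
    # test_ransac_residual_loss below
    ransac = RANSACRegressor(
        base_estimator, min_samples=2,
        loss=lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1))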
+ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples) or (n_samples, n_outputs) + True values for X. + + sample_weight : array-like, shape [n_samples], optional + Sample weights. + + Returns + ------- + score : float + R^2 of self.predict(X) wrt. y. + """ + # XXX remove in 0.19 when r2_score default for multioutput changes + from .metrics import r2_score + return r2_score(y, self.predict(X), sample_weight=sample_weight, + multioutput='uniform_average') + class MultiOutputClassifier(MultiOutputEstimator, ClassifierMixin): """Multi target classification From c444763e9139b3f4cb2ca976dd8e474ec3a22c4f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 8 Sep 2017 12:10:59 -0400 Subject: [PATCH 08/14] remove RandomizedPCA (also from docs references etc) fixup! remove RandomizedPCA from docs references etc --- benchmarks/bench_plot_incremental_pca.py | 15 +- doc/modules/preprocessing.rst | 5 +- sklearn/decomposition/__init__.py | 3 +- sklearn/decomposition/incremental_pca.py | 1 - sklearn/decomposition/pca.py | 245 ----------------------- sklearn/decomposition/tests/test_pca.py | 21 -- sklearn/decomposition/truncated_svd.py | 1 - 7 files changed, 6 insertions(+), 285 deletions(-) diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py index 495d58f0f43ee..43b6ff9452c78 100644 --- a/benchmarks/bench_plot_incremental_pca.py +++ b/benchmarks/bench_plot_incremental_pca.py @@ -13,7 +13,7 @@ from collections import defaultdict import matplotlib.pyplot as plt from sklearn.datasets import fetch_lfw_people -from sklearn.decomposition import IncrementalPCA, RandomizedPCA, PCA +from sklearn.decomposition import IncrementalPCA, PCA def plot_results(X, y, label): @@ -37,7 +37,6 @@ def plot_feature_times(all_times, batch_size, all_components, data): plot_results(all_components, all_times['pca'], label="PCA") plot_results(all_components, all_times['ipca'], label="IncrementalPCA, bsize=%i" % batch_size) - plot_results(all_components, all_times['rpca'], label="RandomizedPCA") plt.legend(loc="upper left") plt.suptitle("Algorithm runtime vs. n_components\n \ LFW, size %i x %i" % data.shape) @@ -50,7 +49,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data): plot_results(all_components, all_errors['pca'], label="PCA") plot_results(all_components, all_errors['ipca'], label="IncrementalPCA, bsize=%i" % batch_size) - plot_results(all_components, all_errors['rpca'], label="RandomizedPCA") plt.legend(loc="lower left") plt.suptitle("Algorithm error vs. n_components\n" "LFW, size %i x %i" % data.shape) @@ -61,7 +59,6 @@ def plot_feature_errors(all_errors, batch_size, all_components, data): def plot_batch_times(all_times, n_features, all_batch_sizes, data): plt.figure() plot_results(all_batch_sizes, all_times['pca'], label="PCA") - plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA") plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA") plt.legend(loc="lower left") plt.suptitle("Algorithm runtime vs. 
batch_size for n_components %i\n \ @@ -92,11 +89,9 @@ def fixed_batch_size_comparison(data): all_errors = defaultdict(list) for n_components in all_features: pca = PCA(n_components=n_components) - rpca = RandomizedPCA(n_components=n_components, random_state=1999) ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size) results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), - ('ipca', ipca), - ('rpca', rpca)]} + ('ipca', ipca)]} for k in sorted(results_dict.keys()): all_times[k].append(results_dict[k]['time']) @@ -116,9 +111,7 @@ def variable_batch_size_comparison(data): all_times = defaultdict(list) all_errors = defaultdict(list) pca = PCA(n_components=n_components) - rpca = RandomizedPCA(n_components=n_components, random_state=1999) - results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), - ('rpca', rpca)]} + results_dict = {k: benchmark(est, data) for k, est in [('pca', pca)]} # Create flat baselines to compare the variation over batch size all_times['pca'].extend([results_dict['pca']['time']] * @@ -138,8 +131,6 @@ def variable_batch_size_comparison(data): all_errors['ipca'].append(results_dict['ipca']['error']) plot_batch_times(all_times, n_components, batch_sizes, data) - # RandomizedPCA error is always worse (approx 100x) than other PCA - # tests plot_batch_errors(all_errors, n_components, batch_sizes, data) faces = fetch_lfw_people(resize=.2, min_faces_per_person=5) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 8bcb14363d69c..29c77f5c32851 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -235,9 +235,8 @@ data. independently, since a downstream model can further make some assumption on the linear independence of the features. - To address this issue you can use :class:`sklearn.decomposition.PCA` - or :class:`sklearn.decomposition.RandomizedPCA` with ``whiten=True`` - to further remove the linear correlation across features. + To address this issue you can use :class:`sklearn.decomposition.PCA` with + ``whiten=True`` to further remove the linear correlation across features. .. topic:: Scaling a 1D array diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index faca56b91b1d8..34ad76ca46074 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -5,7 +5,7 @@ """ from .nmf import NMF, non_negative_factorization -from .pca import PCA, RandomizedPCA +from .pca import PCA from .incremental_pca import IncrementalPCA from .kernel_pca import KernelPCA from .sparse_pca import SparsePCA, MiniBatchSparsePCA @@ -26,7 +26,6 @@ 'MiniBatchSparsePCA', 'NMF', 'PCA', - 'RandomizedPCA', 'SparseCoder', 'SparsePCA', 'dict_learning', diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index 13e51090dd82e..9ed75928cf90c 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -136,7 +136,6 @@ class IncrementalPCA(_BasePCA): See also -------- PCA - RandomizedPCA KernelPCA SparsePCA TruncatedSVD diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 2b715b7e06824..4d528e5994a58 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -591,248 +591,3 @@ def score(self, X, y=None): Average log-likelihood of the samples under the current model """ return np.mean(self.score_samples(X)) - - -@deprecated("RandomizedPCA was deprecated in 0.18 and will be removed in " - "0.20. 
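The preprocessing note above now points only at ``PCA(whiten=True)``; a minimal usage sketch, assuming ``X`` is any ``(n_samples, n_features)`` array::

    from sklearn.decomposition import PCA

    # Removes linear correlation across features and scales the
    # projected components to unit variance.
    X_white = PCA(whiten=True).fit_transform(X)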
" - "Use PCA(svd_solver='randomized') instead. The new implementation " - "DOES NOT store whiten ``components_``. Apply transform to get " - "them.") -class RandomizedPCA(BaseEstimator, TransformerMixin): - """Principal component analysis (PCA) using randomized SVD - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`PCA` with parameter svd_solver 'randomized' instead. - The new implementation DOES NOT store whiten ``components_``. - Apply transform to get them. - - Linear dimensionality reduction using approximated Singular Value - Decomposition of the data and keeping only the most significant - singular vectors to project the data to a lower dimensional space. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, optional - Maximum number of components to keep. When not given or None, this - is set to n_features (the second dimension of the training data). - - copy : bool - If False, data passed to fit are overwritten and running - fit(X).transform(X) will not yield the expected results, - use fit_transform(X) instead. - - iterated_power : int, default=2 - Number of iterations for the power method. - - .. versionchanged:: 0.18 - - whiten : bool, optional - When True (False by default) the `components_` vectors are multiplied - by the square root of (n_samples) and divided by the singular values to - ensure uncorrelated outputs with unit component-wise variances. - - Whitening will remove some information from the transformed signal - (the relative variance scales of the components) but can sometime - improve the predictive accuracy of the downstream estimators by - making their data respect some hard-wired assumptions. - - random_state : int, RandomState instance or None, optional, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - Attributes - ---------- - components_ : array, shape (n_components, n_features) - Components with maximum variance. - - explained_variance_ratio_ : array, shape (n_components,) - Percentage of variance explained by each of the selected components. - If k is not set then all components are stored and the sum of explained - variances is equal to 1.0. - - singular_values_ : array, shape (n_components,) - The singular values corresponding to each of the selected components. - The singular values are equal to the 2-norms of the ``n_components`` - variables in the lower-dimensional space. - - mean_ : array, shape (n_features,) - Per-feature empirical mean, estimated from the training set. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.decomposition import RandomizedPCA - >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - >>> pca = RandomizedPCA(n_components=2) - >>> pca.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - RandomizedPCA(copy=True, iterated_power=2, n_components=2, - random_state=None, whiten=False) - >>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS - [ 0.99244... 0.00755...] - >>> print(pca.singular_values_) # doctest: +ELLIPSIS - [ 6.30061... 0.54980...] - - See also - -------- - PCA - TruncatedSVD - - References - ---------- - - .. [Halko2009] `Finding structure with randomness: Stochastic algorithms - for constructing approximate matrix decompositions Halko, et al., 2009 - (arXiv:909)` - - .. 
[MRT] `A randomized algorithm for the decomposition of matrices - Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert` - - """ - - def __init__(self, n_components=None, copy=True, iterated_power=2, - whiten=False, random_state=None): - self.n_components = n_components - self.copy = copy - self.iterated_power = iterated_power - self.whiten = whiten - self.random_state = random_state - - def fit(self, X, y=None): - """Fit the model with X by extracting the first principal components. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Training data, where n_samples in the number of samples - and n_features is the number of features. - - y : Ignored - - Returns - ------- - self : object - Returns the instance itself. - """ - self._fit(check_array(X)) - return self - - def _fit(self, X): - """Fit the model to the data X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples in the number of samples and - n_features is the number of features. - - Returns - ------- - X : ndarray, shape (n_samples, n_features) - The input data, copied, centered and whitened when requested. - """ - random_state = check_random_state(self.random_state) - X = np.atleast_2d(as_float_array(X, copy=self.copy)) - - n_samples = X.shape[0] - - # Center data - self.mean_ = np.mean(X, axis=0) - X -= self.mean_ - if self.n_components is None: - n_components = X.shape[1] - else: - n_components = self.n_components - - U, S, V = randomized_svd(X, n_components, - n_iter=self.iterated_power, - random_state=random_state) - - self.explained_variance_ = exp_var = (S ** 2) / (n_samples - 1) - full_var = np.var(X, ddof=1, axis=0).sum() - self.explained_variance_ratio_ = exp_var / full_var - self.singular_values_ = S # Store the singular values. - - if self.whiten: - self.components_ = V / S[:, np.newaxis] * sqrt(n_samples) - else: - self.components_ = V - - return X - - def transform(self, X): - """Apply dimensionality reduction on X. - - X is projected on the first principal components previous extracted - from a training set. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - - """ - check_is_fitted(self, 'mean_') - - X = check_array(X) - if self.mean_ is not None: - X = X - self.mean_ - - X = np.dot(X, self.components_.T) - return X - - def fit_transform(self, X, y=None): - """Fit the model with X and apply the dimensionality reduction on X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. - - y : Ignored - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - - """ - X = check_array(X) - X = self._fit(X) - return np.dot(X, self.components_.T) - - def inverse_transform(self, X): - """Transform data back to its original space. - - Returns an array X_original whose transform would be X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_components) - New data, where n_samples in the number of samples - and n_components is the number of components. - - Returns - ------- - X_original array-like, shape (n_samples, n_features) - - Notes - ----- - If whitening is enabled, inverse_transform does not compute the - exact inverse operation of transform. 
- """ - check_is_fitted(self, 'mean_') - - X_original = np.dot(X, self.components_) - if self.mean_ is not None: - X_original = X_original + self.mean_ - return X_original diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index f1889d1462d2b..b3cf33a4b2176 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -17,7 +17,6 @@ from sklearn import datasets from sklearn.decomposition import PCA -from sklearn.decomposition import RandomizedPCA from sklearn.decomposition.pca import _assess_dimension_ from sklearn.decomposition.pca import _infer_dimension_ @@ -684,26 +683,6 @@ def test_svd_solver_auto(): assert_array_almost_equal(pca.components_, pca_test.components_) -def test_deprecation_randomized_pca(): - rng = np.random.RandomState(0) - X = rng.random_sample((5, 4)) - - depr_message = ("Class RandomizedPCA is deprecated; RandomizedPCA was " - "deprecated in 0.18 and will be " - "removed in 0.20. Use PCA(svd_solver='randomized') " - "instead. The new implementation DOES NOT store " - "whiten ``components_``. Apply transform to get them.") - - def fit_deprecated(X): - global Y - rpca = RandomizedPCA(random_state=0) - Y = rpca.fit_transform(X) - - assert_warns_message(DeprecationWarning, depr_message, fit_deprecated, X) - Y_pca = PCA(svd_solver='randomized', random_state=0).fit_transform(X) - assert_array_almost_equal(Y, Y_pca) - - def test_pca_sparse_input(): X = np.random.RandomState(0).rand(5, 4) X = sp.sparse.csr_matrix(X) diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index 726f9162eb925..268f8479f7a92 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -100,7 +100,6 @@ class TruncatedSVD(BaseEstimator, TransformerMixin): See also -------- PCA - RandomizedPCA References ---------- From a2e40d78eb088f3deff4ce099d0f45f781ce9665 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 8 Sep 2017 12:17:25 -0400 Subject: [PATCH 09/14] remove references to old GP, GMM and sparse_center_data Remove mixture/gmm --- sklearn/linear_model/tests/test_base.py | 74 -- sklearn/mixture/__init__.py | 14 +- sklearn/mixture/dpgmm.py | 869 ------------------------ sklearn/mixture/gmm.py | 853 ----------------------- sklearn/mixture/tests/test_dpgmm.py | 237 ------- sklearn/mixture/tests/test_gmm.py | 534 --------------- 6 files changed, 1 insertion(+), 2580 deletions(-) delete mode 100644 sklearn/mixture/dpgmm.py delete mode 100644 sklearn/mixture/gmm.py delete mode 100644 sklearn/mixture/tests/test_dpgmm.py delete mode 100644 sklearn/mixture/tests/test_gmm.py diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index ed53e1fbb4aa5..30e4cfdcced42 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -6,17 +6,14 @@ import numpy as np from scipy import sparse from scipy import linalg -from itertools import product from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import ignore_warnings from sklearn.linear_model.base import LinearRegression from sklearn.linear_model.base import _preprocess_data -from sklearn.linear_model.base import sparse_center_data, center_data from sklearn.linear_model.base import _rescale_data from sklearn.utils import check_random_state from sklearn.utils.testing import 
assert_greater @@ -402,74 +399,3 @@ def test_rescale_data(): rescaled_y2 = y * np.sqrt(sample_weight) assert_array_almost_equal(rescaled_X, rescaled_X2) assert_array_almost_equal(rescaled_y, rescaled_y2) - - -@ignore_warnings # all deprecation warnings -def test_deprecation_center_data(): - n_samples = 200 - n_features = 2 - - w = 1.0 + rng.rand(n_samples) - X = rng.rand(n_samples, n_features) - y = rng.rand(n_samples) - - param_grid = product([True, False], [True, False], [True, False], - [None, w]) - - for (fit_intercept, normalize, copy, sample_weight) in param_grid: - - XX = X.copy() # such that we can try copy=False as well - - X1, y1, X1_mean, X1_var, y1_mean = \ - center_data(XX, y, fit_intercept=fit_intercept, - normalize=normalize, copy=copy, - sample_weight=sample_weight) - - XX = X.copy() - - X2, y2, X2_mean, X2_var, y2_mean = \ - _preprocess_data(XX, y, fit_intercept=fit_intercept, - normalize=normalize, copy=copy, - sample_weight=sample_weight) - - assert_array_almost_equal(X1, X2) - assert_array_almost_equal(y1, y2) - assert_array_almost_equal(X1_mean, X2_mean) - assert_array_almost_equal(X1_var, X2_var) - assert_array_almost_equal(y1_mean, y2_mean) - - # Sparse cases - X = sparse.csr_matrix(X) - - for (fit_intercept, normalize, copy, sample_weight) in param_grid: - - X1, y1, X1_mean, X1_var, y1_mean = \ - center_data(X, y, fit_intercept=fit_intercept, normalize=normalize, - copy=copy, sample_weight=sample_weight) - - X2, y2, X2_mean, X2_var, y2_mean = \ - _preprocess_data(X, y, fit_intercept=fit_intercept, - normalize=normalize, copy=copy, - sample_weight=sample_weight, return_mean=False) - - assert_array_almost_equal(X1.toarray(), X2.toarray()) - assert_array_almost_equal(y1, y2) - assert_array_almost_equal(X1_mean, X2_mean) - assert_array_almost_equal(X1_var, X2_var) - assert_array_almost_equal(y1_mean, y2_mean) - - for (fit_intercept, normalize) in product([True, False], [True, False]): - - X1, y1, X1_mean, X1_var, y1_mean = \ - sparse_center_data(X, y, fit_intercept=fit_intercept, - normalize=normalize) - - X2, y2, X2_mean, X2_var, y2_mean = \ - _preprocess_data(X, y, fit_intercept=fit_intercept, - normalize=normalize, return_mean=True) - - assert_array_almost_equal(X1.toarray(), X2.toarray()) - assert_array_almost_equal(y1, y2) - assert_array_almost_equal(X1_mean, X2_mean) - assert_array_almost_equal(X1_var, X2_var) - assert_array_almost_equal(y1_mean, y2_mean) diff --git a/sklearn/mixture/__init__.py b/sklearn/mixture/__init__.py index 3622518352cae..08f55802e201e 100644 --- a/sklearn/mixture/__init__.py +++ b/sklearn/mixture/__init__.py @@ -2,21 +2,9 @@ The :mod:`sklearn.mixture` module implements mixture modeling algorithms. 
""" -from .gmm import sample_gaussian, log_multivariate_normal_density -from .gmm import GMM, distribute_covar_matrix_to_match_covariance_type -from .gmm import _validate_covars -from .dpgmm import DPGMM, VBGMM - from .gaussian_mixture import GaussianMixture from .bayesian_mixture import BayesianGaussianMixture -__all__ = ['DPGMM', - 'GMM', - 'VBGMM', - '_validate_covars', - 'distribute_covar_matrix_to_match_covariance_type', - 'log_multivariate_normal_density', - 'sample_gaussian', - 'GaussianMixture', +__all__ = ['GaussianMixture', 'BayesianGaussianMixture'] diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py deleted file mode 100644 index ddc861b4c19f0..0000000000000 --- a/sklearn/mixture/dpgmm.py +++ /dev/null @@ -1,869 +0,0 @@ -"""Bayesian Gaussian Mixture Models and -Dirichlet Process Gaussian Mixture Models""" -from __future__ import print_function - -# Author: Alexandre Passos (alexandre.tp@gmail.com) -# Bertrand Thirion -# -# Based on mixture.py by: -# Ron Weiss -# Fabian Pedregosa -# - -# Important note for the deprecation cleaning of 0.20 : -# All the function and classes of this file have been deprecated in 0.18. -# When you remove this file please also remove the related files -# - 'sklearn/mixture/gmm.py' -# - 'sklearn/mixture/test_dpgmm.py' -# - 'sklearn/mixture/test_gmm.py' - -import numpy as np -from scipy.special import digamma as _digamma, gammaln as _gammaln -from scipy import linalg -from scipy.linalg import pinvh -from scipy.spatial.distance import cdist - -from ..externals.six.moves import xrange -from ..utils import check_random_state, check_array, deprecated -from ..utils.fixes import logsumexp -from ..utils.extmath import squared_norm, stable_cumsum -from ..utils.validation import check_is_fitted -from .. import cluster -from .gmm import _GMMBase - - -@deprecated("The function digamma is deprecated in 0.18 and " - "will be removed in 0.20. Use scipy.special.digamma instead.") -def digamma(x): - return _digamma(x + np.finfo(np.float32).eps) - - -@deprecated("The function gammaln is deprecated in 0.18 and " - "will be removed in 0.20. Use scipy.special.gammaln instead.") -def gammaln(x): - return _gammaln(x + np.finfo(np.float32).eps) - - -@deprecated("The function log_normalize is deprecated in 0.18 and " - "will be removed in 0.20.") -def log_normalize(v, axis=0): - """Normalized probabilities from unnormalized log-probabilities""" - v = np.rollaxis(v, axis) - v = v.copy() - v -= v.max(axis=0) - out = logsumexp(v) - v = np.exp(v - out) - v += np.finfo(np.float32).eps - v /= np.sum(v, axis=0) - return np.swapaxes(v, 0, axis) - - -@deprecated("The function wishart_log_det is deprecated in 0.18 and " - "will be removed in 0.20.") -def wishart_log_det(a, b, detB, n_features): - """Expected value of the log of the determinant of a Wishart - - The expected value of the logarithm of the determinant of a - wishart-distributed random variable with the specified parameters.""" - l = np.sum(digamma(0.5 * (a - np.arange(-1, n_features - 1)))) - l += n_features * np.log(2) - return l + detB - - -@deprecated("The function wishart_logz is deprecated in 0.18 and " - "will be removed in 0.20.") -def wishart_logz(v, s, dets, n_features): - "The logarithm of the normalization constant for the wishart distribution" - z = 0. 
- z += 0.5 * v * n_features * np.log(2) - z += (0.25 * (n_features * (n_features - 1)) * np.log(np.pi)) - z += 0.5 * v * np.log(dets) - z += np.sum(gammaln(0.5 * (v - np.arange(n_features) + 1))) - return z - - -def _bound_wishart(a, B, detB): - """Returns a function of the dof, scale matrix and its determinant - used as an upper bound in variational approximation of the evidence""" - n_features = B.shape[0] - logprior = wishart_logz(a, B, detB, n_features) - logprior -= wishart_logz(n_features, - np.identity(n_features), - 1, n_features) - logprior += 0.5 * (a - 1) * wishart_log_det(a, B, detB, n_features) - logprior += 0.5 * a * np.trace(B) - return logprior - - -############################################################################## -# Variational bound on the log likelihood of each class -############################################################################## - - -def _sym_quad_form(x, mu, A): - """helper function to calculate symmetric quadratic form x.T * A * x""" - q = (cdist(x, mu[np.newaxis], "mahalanobis", VI=A) ** 2).reshape(-1) - return q - - -def _bound_state_log_lik(X, initial_bound, precs, means, covariance_type): - """Update the bound with likelihood terms, for standard covariance types""" - n_components, n_features = means.shape - n_samples = X.shape[0] - bound = np.empty((n_samples, n_components)) - bound[:] = initial_bound - if covariance_type in ['diag', 'spherical']: - for k in range(n_components): - d = X - means[k] - bound[:, k] -= 0.5 * np.sum(d * d * precs[k], axis=1) - elif covariance_type == 'tied': - for k in range(n_components): - bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs) - elif covariance_type == 'full': - for k in range(n_components): - bound[:, k] -= 0.5 * _sym_quad_form(X, means[k], precs[k]) - return bound - - -class _DPGMMBase(_GMMBase): - """Variational Inference for the Infinite Gaussian Mixture Model. - - DPGMM stands for Dirichlet Process Gaussian Mixture Model, and it - is an infinite mixture model with the Dirichlet Process as a prior - distribution on the number of clusters. In practice the - approximate inference algorithm uses a truncated distribution with - a fixed maximum number of components, but almost always the number - of components actually used depends on the data. - - Stick-breaking Representation of a Gaussian mixture model - probability distribution. This class allows for easy and efficient - inference of an approximate posterior distribution over the - parameters of a Gaussian mixture model with a variable number of - components (smaller than the truncation parameter n_components). - - Initialization is with normally-distributed means and identity - covariance, for proper convergence. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, default 1 - Number of mixture components. - - covariance_type : string, default 'diag' - String describing the type of covariance parameters to - use. Must be one of 'spherical', 'tied', 'diag', 'full'. - - alpha : float, default 1 - Real number representing the concentration parameter of - the dirichlet process. Intuitively, the Dirichlet Process - is as likely to start a new cluster for a point as it is - to add that point to a cluster with alpha elements. A - higher alpha means more clusters, as the expected number - of clusters is ``alpha*log(N)``. - - tol : float, default 1e-3 - Convergence threshold. - - n_iter : int, default 10 - Maximum number of iterations to perform before convergence. 
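``_sym_quad_form`` above leans on ``scipy.spatial.distance.cdist``: with ``VI=A``, the squared Mahalanobis distance is exactly the quadratic form ``(x - mu)^T A (x - mu)``. A quick self-contained check of that identity::

    import numpy as np
    from scipy.spatial.distance import cdist

    rng = np.random.RandomState(0)
    x, mu = rng.randn(5, 3), rng.randn(3)
    A = np.eye(3)  # any symmetric positive-definite precision matrix

    q_cdist = (cdist(x, mu[np.newaxis], "mahalanobis", VI=A) ** 2).reshape(-1)
    d = x - mu
    q_direct = np.einsum('ij,jk,ik->i', d, A, d)
    assert np.allclose(q_cdist, q_direct)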
- - params : string, default 'wmc' - Controls which parameters are updated in the training - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. - - init_params : string, default 'wmc' - Controls which parameters are updated in the initialization - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - verbose : int, default 0 - Controls output verbosity. - - Attributes - ---------- - covariance_type : string - String describing the type of covariance parameters used by - the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'. - - n_components : int - Number of mixture components. - - weights_ : array, shape (`n_components`,) - Mixing weights for each mixture component. - - means_ : array, shape (`n_components`, `n_features`) - Mean parameters for each mixture component. - - precs_ : array - Precision (inverse covariance) parameters for each mixture - component. The shape depends on `covariance_type`:: - - (`n_components`, 'n_features') if 'spherical', - (`n_features`, `n_features`) if 'tied', - (`n_components`, `n_features`) if 'diag', - (`n_components`, `n_features`, `n_features`) if 'full' - - converged_ : bool - True when convergence was reached in fit(), False otherwise. - - See Also - -------- - GMM : Finite Gaussian mixture model fit with EM - - VBGMM : Finite Gaussian mixture model fit with a variational - algorithm, better for situations where there might be too little - data to get a good estimate of the covariance matrix. - """ - def __init__(self, n_components=1, covariance_type='diag', alpha=1.0, - random_state=None, tol=1e-3, verbose=0, min_covar=None, - n_iter=10, params='wmc', init_params='wmc'): - self.alpha = alpha - super(_DPGMMBase, self).__init__(n_components, covariance_type, - random_state=random_state, - tol=tol, min_covar=min_covar, - n_iter=n_iter, params=params, - init_params=init_params, - verbose=verbose) - - def _get_precisions(self): - """Return precisions as a full matrix.""" - if self.covariance_type == 'full': - return self.precs_ - elif self.covariance_type in ['diag', 'spherical']: - return [np.diag(cov) for cov in self.precs_] - elif self.covariance_type == 'tied': - return [self.precs_] * self.n_components - - def _get_covars(self): - return [pinvh(c) for c in self._get_precisions()] - - def _set_covars(self, covars): - raise NotImplementedError("""The variational algorithm does - not support setting the covariance parameters.""") - - def score_samples(self, X): - """Return the likelihood of the data under the model. - - Compute the bound on log probability of X under the model - and return the posterior distribution (responsibilities) of - each mixture component for each element of X. - - This is done by computing the parameters for the mean-field of - z for each observation. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. 
- - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X - responsibilities : array_like, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation - """ - check_is_fitted(self, 'gamma_') - - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - sd = digamma(self.gamma_.T[1] + self.gamma_.T[2]) - dgamma1 = digamma(self.gamma_.T[1]) - sd - dgamma2 = np.zeros(self.n_components) - dgamma2[0] = digamma(self.gamma_[0, 2]) - digamma(self.gamma_[0, 1] + - self.gamma_[0, 2]) - for j in range(1, self.n_components): - dgamma2[j] = dgamma2[j - 1] + digamma(self.gamma_[j - 1, 2]) - dgamma2[j] -= sd[j - 1] - dgamma = dgamma1 + dgamma2 - # Free memory and developers cognitive load: - del dgamma1, dgamma2, sd - - if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']: - raise NotImplementedError("This ctype is not implemented: %s" - % self.covariance_type) - p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_, - self.precs_, self.means_, - self.covariance_type) - z = p + dgamma - z = log_normalize(z, axis=-1) - bound = np.sum(z * p, axis=-1) - return bound, z - - def _update_concentration(self, z): - """Update the concentration parameters for each cluster""" - sz = np.sum(z, axis=0) - self.gamma_.T[1] = 1. + sz - self.gamma_.T[2].fill(0) - for i in range(self.n_components - 2, -1, -1): - self.gamma_[i, 2] = self.gamma_[i + 1, 2] + sz[i] - self.gamma_.T[2] += self.alpha - - def _update_means(self, X, z): - """Update the variational distributions for the means""" - n_features = X.shape[1] - for k in range(self.n_components): - if self.covariance_type in ['spherical', 'diag']: - num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0) - num *= self.precs_[k] - den = 1. + self.precs_[k] * np.sum(z.T[k]) - self.means_[k] = num / den - elif self.covariance_type in ['tied', 'full']: - if self.covariance_type == 'tied': - cov = self.precs_ - else: - cov = self.precs_[k] - den = np.identity(n_features) + cov * np.sum(z.T[k]) - num = np.sum(z.T[k].reshape((-1, 1)) * X, axis=0) - num = np.dot(cov, num) - self.means_[k] = linalg.lstsq(den, num)[0] - - def _update_precisions(self, X, z): - """Update the variational distributions for the precisions""" - n_features = X.shape[1] - if self.covariance_type == 'spherical': - self.dof_ = 0.5 * n_features * np.sum(z, axis=0) - for k in range(self.n_components): - # could be more memory efficient ? - sq_diff = np.sum((X - self.means_[k]) ** 2, axis=1) - self.scale_[k] = 1. - self.scale_[k] += 0.5 * np.sum(z.T[k] * (sq_diff + n_features)) - self.bound_prec_[k] = ( - 0.5 * n_features * ( - digamma(self.dof_[k]) - np.log(self.scale_[k]))) - self.precs_ = np.tile(self.dof_ / self.scale_, [n_features, 1]).T - - elif self.covariance_type == 'diag': - for k in range(self.n_components): - self.dof_[k].fill(1. 
+ 0.5 * np.sum(z.T[k], axis=0)) - sq_diff = (X - self.means_[k]) ** 2 # see comment above - self.scale_[k] = np.ones(n_features) + 0.5 * np.dot( - z.T[k], (sq_diff + 1)) - self.precs_[k] = self.dof_[k] / self.scale_[k] - self.bound_prec_[k] = 0.5 * np.sum(digamma(self.dof_[k]) - - np.log(self.scale_[k])) - self.bound_prec_[k] -= 0.5 * np.sum(self.precs_[k]) - - elif self.covariance_type == 'tied': - self.dof_ = 2 + X.shape[0] + n_features - self.scale_ = (X.shape[0] + 1) * np.identity(n_features) - for k in range(self.n_components): - diff = X - self.means_[k] - self.scale_ += np.dot(diff.T, z[:, k:k + 1] * diff) - self.scale_ = pinvh(self.scale_) - self.precs_ = self.dof_ * self.scale_ - self.det_scale_ = linalg.det(self.scale_) - self.bound_prec_ = 0.5 * wishart_log_det( - self.dof_, self.scale_, self.det_scale_, n_features) - self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_) - - elif self.covariance_type == 'full': - for k in range(self.n_components): - sum_resp = np.sum(z.T[k]) - self.dof_[k] = 2 + sum_resp + n_features - self.scale_[k] = (sum_resp + 1) * np.identity(n_features) - diff = X - self.means_[k] - self.scale_[k] += np.dot(diff.T, z[:, k:k + 1] * diff) - self.scale_[k] = pinvh(self.scale_[k]) - self.precs_[k] = self.dof_[k] * self.scale_[k] - self.det_scale_[k] = linalg.det(self.scale_[k]) - self.bound_prec_[k] = 0.5 * wishart_log_det( - self.dof_[k], self.scale_[k], self.det_scale_[k], - n_features) - self.bound_prec_[k] -= 0.5 * self.dof_[k] * np.trace( - self.scale_[k]) - - def _monitor(self, X, z, n, end=False): - """Monitor the lower bound during iteration - - Debug method to help see exactly when it is failing to converge as - expected. - - Note: this is very expensive and should not be used by default.""" - if self.verbose > 0: - print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z))) - if end: - print("Cluster proportions:", self.gamma_.T[1]) - print("covariance_type:", self.covariance_type) - - def _do_mstep(self, X, z, params): - """Maximize the variational lower bound - - Update each of the parameters to maximize the lower bound.""" - self._monitor(X, z, "z") - self._update_concentration(z) - self._monitor(X, z, "gamma") - if 'm' in params: - self._update_means(X, z) - self._monitor(X, z, "mu") - if 'c' in params: - self._update_precisions(X, z) - self._monitor(X, z, "a and b", end=True) - - def _initialize_gamma(self): - "Initializes the concentration parameters" - self.gamma_ = self.alpha * np.ones((self.n_components, 3)) - - def _bound_concentration(self): - """The variational lower bound for the concentration parameter.""" - logprior = gammaln(self.alpha) * self.n_components - logprior += np.sum((self.alpha - 1) * ( - digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] + - self.gamma_.T[2]))) - logprior += np.sum(- gammaln(self.gamma_.T[1] + self.gamma_.T[2])) - logprior += np.sum(gammaln(self.gamma_.T[1]) + - gammaln(self.gamma_.T[2])) - logprior -= np.sum((self.gamma_.T[1] - 1) * ( - digamma(self.gamma_.T[1]) - digamma(self.gamma_.T[1] + - self.gamma_.T[2]))) - logprior -= np.sum((self.gamma_.T[2] - 1) * ( - digamma(self.gamma_.T[2]) - digamma(self.gamma_.T[1] + - self.gamma_.T[2]))) - return logprior - - def _bound_means(self): - "The variational lower bound for the mean parameters" - logprior = 0. - logprior -= 0.5 * squared_norm(self.means_) - logprior -= 0.5 * self.means_.shape[1] * self.n_components - return logprior - - def _bound_precisions(self): - """Returns the bound term related to precisions""" - logprior = 0. 
- if self.covariance_type == 'spherical': - logprior += np.sum(gammaln(self.dof_)) - logprior -= np.sum( - (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_))) - logprior += np.sum(- np.log(self.scale_) + self.dof_ - - self.precs_[:, 0]) - elif self.covariance_type == 'diag': - logprior += np.sum(gammaln(self.dof_)) - logprior -= np.sum( - (self.dof_ - 1) * digamma(np.maximum(0.5, self.dof_))) - logprior += np.sum(- np.log(self.scale_) + self.dof_ - self.precs_) - elif self.covariance_type == 'tied': - logprior += _bound_wishart(self.dof_, self.scale_, self.det_scale_) - elif self.covariance_type == 'full': - for k in range(self.n_components): - logprior += _bound_wishart(self.dof_[k], - self.scale_[k], - self.det_scale_[k]) - return logprior - - def _bound_proportions(self, z): - """Returns the bound term related to proportions""" - dg12 = digamma(self.gamma_.T[1] + self.gamma_.T[2]) - dg1 = digamma(self.gamma_.T[1]) - dg12 - dg2 = digamma(self.gamma_.T[2]) - dg12 - - cz = stable_cumsum(z[:, ::-1], axis=-1)[:, -2::-1] - logprior = np.sum(cz * dg2[:-1]) + np.sum(z * dg1) - del cz # Save memory - z_non_zeros = z[z > np.finfo(np.float32).eps] - logprior -= np.sum(z_non_zeros * np.log(z_non_zeros)) - return logprior - - def _logprior(self, z): - logprior = self._bound_concentration() - logprior += self._bound_means() - logprior += self._bound_precisions() - logprior += self._bound_proportions(z) - return logprior - - def lower_bound(self, X, z): - """returns a lower bound on model evidence based on X and membership""" - check_is_fitted(self, 'means_') - - if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']: - raise NotImplementedError("This ctype is not implemented: %s" - % self.covariance_type) - X = np.asarray(X) - if X.ndim == 1: - X = X[:, np.newaxis] - c = np.sum(z * _bound_state_log_lik(X, self._initial_bound + - self.bound_prec_, self.precs_, - self.means_, self.covariance_type)) - - return c + self._logprior(z) - - def _set_weights(self): - for i in xrange(self.n_components): - self.weights_[i] = self.gamma_[i, 1] / (self.gamma_[i, 1] - + self.gamma_[i, 2]) - self.weights_ /= np.sum(self.weights_) - - def _fit(self, X, y=None): - """Estimate model parameters with the variational - algorithm. - - For a full derivation and description of the algorithm see - doc/modules/dp-derivation.rst - or - http://scikit-learn.org/stable/modules/dp-derivation.html - - A initialization step is performed before entering the em - algorithm. If you want to avoid this step, set the keyword - argument init_params to the empty string '' when creating - the object. Likewise, if you would like just to do an - initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - responsibilities : array, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation. 
- """ - self.random_state_ = check_random_state(self.random_state) - - # initialization step - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - - n_samples, n_features = X.shape - z = np.ones((n_samples, self.n_components)) - z /= self.n_components - - self._initial_bound = - 0.5 * n_features * np.log(2 * np.pi) - self._initial_bound -= np.log(2 * np.pi * np.e) - - if (self.init_params != '') or not hasattr(self, 'gamma_'): - self._initialize_gamma() - - if 'm' in self.init_params or not hasattr(self, 'means_'): - self.means_ = cluster.KMeans( - n_clusters=self.n_components, - random_state=self.random_state_).fit(X).cluster_centers_[::-1] - - if 'w' in self.init_params or not hasattr(self, 'weights_'): - self.weights_ = np.tile(1.0 / self.n_components, self.n_components) - - if 'c' in self.init_params or not hasattr(self, 'precs_'): - if self.covariance_type == 'spherical': - self.dof_ = np.ones(self.n_components) - self.scale_ = np.ones(self.n_components) - self.precs_ = np.ones((self.n_components, n_features)) - self.bound_prec_ = 0.5 * n_features * ( - digamma(self.dof_) - np.log(self.scale_)) - elif self.covariance_type == 'diag': - self.dof_ = 1 + 0.5 * n_features - self.dof_ *= np.ones((self.n_components, n_features)) - self.scale_ = np.ones((self.n_components, n_features)) - self.precs_ = np.ones((self.n_components, n_features)) - self.bound_prec_ = 0.5 * (np.sum(digamma(self.dof_) - - np.log(self.scale_), 1)) - self.bound_prec_ -= 0.5 * np.sum(self.precs_, 1) - elif self.covariance_type == 'tied': - self.dof_ = 1. - self.scale_ = np.identity(n_features) - self.precs_ = np.identity(n_features) - self.det_scale_ = 1. - self.bound_prec_ = 0.5 * wishart_log_det( - self.dof_, self.scale_, self.det_scale_, n_features) - self.bound_prec_ -= 0.5 * self.dof_ * np.trace(self.scale_) - elif self.covariance_type == 'full': - self.dof_ = (1 + self.n_components + n_samples) - self.dof_ *= np.ones(self.n_components) - self.scale_ = [2 * np.identity(n_features) - for _ in range(self.n_components)] - self.precs_ = [np.identity(n_features) - for _ in range(self.n_components)] - self.det_scale_ = np.ones(self.n_components) - self.bound_prec_ = np.zeros(self.n_components) - for k in range(self.n_components): - self.bound_prec_[k] = wishart_log_det( - self.dof_[k], self.scale_[k], self.det_scale_[k], - n_features) - self.bound_prec_[k] -= (self.dof_[k] * - np.trace(self.scale_[k])) - self.bound_prec_ *= 0.5 - - # EM algorithms - current_log_likelihood = None - # reset self.converged_ to False - self.converged_ = False - - for i in range(self.n_iter): - prev_log_likelihood = current_log_likelihood - # Expectation step - curr_logprob, z = self.score_samples(X) - - current_log_likelihood = ( - curr_logprob.mean() + self._logprior(z) / n_samples) - - # Check for convergence. - if prev_log_likelihood is not None: - change = abs(current_log_likelihood - prev_log_likelihood) - if change < self.tol: - self.converged_ = True - break - - # Maximization step - self._do_mstep(X, z, self.params) - - if self.n_iter == 0: - # Need to make sure that there is a z value to output - # Output zeros because it was just a quick initialization - z = np.zeros((X.shape[0], self.n_components)) - - self._set_weights() - - return z - - -@deprecated("The `DPGMM` class is not working correctly and it's better " - "to use `sklearn.mixture.BayesianGaussianMixture` class with " - "parameter `weight_concentration_prior_type='dirichlet_process'` " - "instead. 
DPGMM is deprecated in 0.18 and will be " - "removed in 0.20.") -class DPGMM(_DPGMMBase): - """Dirichlet Process Gaussian Mixture Models - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`sklearn.mixture.BayesianGaussianMixture` with - parameter ``weight_concentration_prior_type='dirichlet_process'`` - instead. - - """ - - def __init__(self, n_components=1, covariance_type='diag', alpha=1.0, - random_state=None, tol=1e-3, verbose=0, min_covar=None, - n_iter=10, params='wmc', init_params='wmc'): - super(DPGMM, self).__init__( - n_components=n_components, covariance_type=covariance_type, - alpha=alpha, random_state=random_state, tol=tol, verbose=verbose, - min_covar=min_covar, n_iter=n_iter, params=params, - init_params=init_params) - - -@deprecated("The `VBGMM` class is not working correctly and it's better " - "to use `sklearn.mixture.BayesianGaussianMixture` class with " - "parameter `weight_concentration_prior_type=" - "'dirichlet_distribution'` instead. " - "VBGMM is deprecated in 0.18 and will be removed in 0.20.") -class VBGMM(_DPGMMBase): - """Variational Inference for the Gaussian Mixture Model - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`sklearn.mixture.BayesianGaussianMixture` with parameter - ``weight_concentration_prior_type='dirichlet_distribution'`` instead. - - Variational inference for a Gaussian mixture model probability - distribution. This class allows for easy and efficient inference - of an approximate posterior distribution over the parameters of a - Gaussian mixture model with a fixed number of components. - - Initialization is with normally-distributed means and identity - covariance, for proper convergence. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, default 1 - Number of mixture components. - - covariance_type : string, default 'diag' - String describing the type of covariance parameters to - use. Must be one of 'spherical', 'tied', 'diag', 'full'. - - alpha : float, default 1 - Real number representing the concentration parameter of - the dirichlet distribution. Intuitively, the higher the - value of alpha the more likely the variational mixture of - Gaussians model will use all components it can. - - tol : float, default 1e-3 - Convergence threshold. - - n_iter : int, default 10 - Maximum number of iterations to perform before convergence. - - params : string, default 'wmc' - Controls which parameters are updated in the training - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. - - init_params : string, default 'wmc' - Controls which parameters are updated in the initialization - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - verbose : int, default 0 - Controls output verbosity. - - Attributes - ---------- - covariance_type : string - String describing the type of covariance parameters used by - the DP-GMM. Must be one of 'spherical', 'tied', 'diag', 'full'. - - n_features : int - Dimensionality of the Gaussians. - - n_components : int (read-only) - Number of mixture components. - - weights_ : array, shape (`n_components`,) - Mixing weights for each mixture component. - - means_ : array, shape (`n_components`, `n_features`) - Mean parameters for each mixture component. - - precs_ : array - Precision (inverse covariance) parameters for each mixture - component. 
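A migration sketch covering both deprecated classes, following the deprecation messages above; note that the ``alpha`` concentration parameter maps only approximately onto ``weight_concentration_prior``::

    from sklearn.mixture import BayesianGaussianMixture

    # DPGMM(n_components=5, alpha=1.0)  ->
    dp = BayesianGaussianMixture(
        n_components=5,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=1.0)

    # VBGMM(n_components=5, alpha=1.0)  ->
    vb = BayesianGaussianMixture(
        n_components=5,
        weight_concentration_prior_type='dirichlet_distribution',
        weight_concentration_prior=1.0)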
The shape depends on `covariance_type`:: - - (`n_components`, 'n_features') if 'spherical', - (`n_features`, `n_features`) if 'tied', - (`n_components`, `n_features`) if 'diag', - (`n_components`, `n_features`, `n_features`) if 'full' - - converged_ : bool - True when convergence was reached in fit(), False - otherwise. - - See Also - -------- - GMM : Finite Gaussian mixture model fit with EM - DPGMM : Infinite Gaussian mixture model, using the dirichlet - process, fit with a variational algorithm - """ - - def __init__(self, n_components=1, covariance_type='diag', alpha=1.0, - random_state=None, tol=1e-3, verbose=0, - min_covar=None, n_iter=10, params='wmc', init_params='wmc'): - super(VBGMM, self).__init__( - n_components, covariance_type, random_state=random_state, - tol=tol, verbose=verbose, min_covar=min_covar, - n_iter=n_iter, params=params, init_params=init_params) - self.alpha = alpha - - def _fit(self, X, y=None): - """Estimate model parameters with the variational algorithm. - - For a full derivation and description of the algorithm see - doc/modules/dp-derivation.rst - or - http://scikit-learn.org/stable/modules/dp-derivation.html - - A initialization step is performed before entering the EM - algorithm. If you want to avoid this step, set the keyword - argument init_params to the empty string '' when creating - the object. Likewise, if you just would like to do an - initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - responsibilities : array, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation. - """ - self.alpha_ = float(self.alpha) / self.n_components - return super(VBGMM, self)._fit(X, y) - - def score_samples(self, X): - """Return the likelihood of the data under the model. - - Compute the bound on log probability of X under the model - and return the posterior distribution (responsibilities) of - each mixture component for each element of X. - - This is done by computing the parameters for the mean-field of - z for each observation. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X - responsibilities : array_like, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation - """ - check_is_fitted(self, 'gamma_') - - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - dg = digamma(self.gamma_) - digamma(np.sum(self.gamma_)) - - if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']: - raise NotImplementedError("This ctype is not implemented: %s" - % self.covariance_type) - p = _bound_state_log_lik(X, self._initial_bound + self.bound_prec_, - self.precs_, self.means_, - self.covariance_type) - - z = p + dg - z = log_normalize(z, axis=-1) - bound = np.sum(z * p, axis=-1) - return bound, z - - def _update_concentration(self, z): - for i in range(self.n_components): - self.gamma_[i] = self.alpha_ + np.sum(z.T[i]) - - def _initialize_gamma(self): - self.gamma_ = self.alpha_ * np.ones(self.n_components) - - def _bound_proportions(self, z): - logprior = 0. 
- dg = digamma(self.gamma_) - dg -= digamma(np.sum(self.gamma_)) - logprior += np.sum(dg.reshape((-1, 1)) * z.T) - z_non_zeros = z[z > np.finfo(np.float32).eps] - logprior -= np.sum(z_non_zeros * np.log(z_non_zeros)) - return logprior - - def _bound_concentration(self): - logprior = gammaln(np.sum(self.gamma_)) - gammaln(self.n_components - * self.alpha_) - logprior -= np.sum(gammaln(self.gamma_) - gammaln(self.alpha_)) - sg = digamma(np.sum(self.gamma_)) - logprior += np.sum((self.gamma_ - self.alpha_) - * (digamma(self.gamma_) - sg)) - return logprior - - def _monitor(self, X, z, n, end=False): - """Monitor the lower bound during iteration - - Debug method to help see exactly when it is failing to converge as - expected. - - Note: this is very expensive and should not be used by default.""" - if self.verbose > 0: - print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z))) - if end: - print("Cluster proportions:", self.gamma_) - print("covariance_type:", self.covariance_type) - - def _set_weights(self): - self.weights_[:] = self.gamma_ - self.weights_ /= np.sum(self.weights_) diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py deleted file mode 100644 index 207eff9f1502a..0000000000000 --- a/sklearn/mixture/gmm.py +++ /dev/null @@ -1,853 +0,0 @@ -""" -Gaussian Mixture Models. - -This implementation corresponds to frequentist (non-Bayesian) formulation -of Gaussian Mixture Models. -""" - -# Author: Ron Weiss -# Fabian Pedregosa -# Bertrand Thirion - -# Important note for the deprecation cleaning of 0.20 : -# All the functions and classes of this file have been deprecated in 0.18. -# When you remove this file please also remove the related files -# - 'sklearn/mixture/dpgmm.py' -# - 'sklearn/mixture/test_dpgmm.py' -# - 'sklearn/mixture/test_gmm.py' -from time import time - -import numpy as np -from scipy import linalg - -from ..base import BaseEstimator -from ..utils import check_random_state, check_array, deprecated -from ..utils.fixes import logsumexp -from ..utils.validation import check_is_fitted -from .. import cluster - -from sklearn.externals.six.moves import zip - -EPS = np.finfo(float).eps - -@deprecated("The function log_multivariate_normal_density is deprecated in 0.18" - " and will be removed in 0.20.") -def log_multivariate_normal_density(X, means, covars, covariance_type='diag'): - """Compute the log probability under a multivariate Gaussian distribution. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row corresponds to a - single data point. - - means : array_like, shape (n_components, n_features) - List of n_features-dimensional mean vectors for n_components Gaussians. - Each row corresponds to a single mean vector. - - covars : array_like - List of n_components covariance parameters for each Gaussian. The shape - depends on `covariance_type`: - (n_components, n_features) if 'spherical', - (n_features, n_features) if 'tied', - (n_components, n_features) if 'diag', - (n_components, n_features, n_features) if 'full' - - covariance_type : string - Type of the covariance parameters. Must be one of - 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'. - - Returns - ------- - lpr : array_like, shape (n_samples, n_components) - Array containing the log probabilities of each data point in - X under each of the n_components multivariate Gaussian distributions. 
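[Reviewer note: with this helper removed, its 'full'-covariance branch can be reproduced with scipy.stats. A sketch under that assumption only — the deprecated function also dispatched on 'spherical', 'tied' and 'diag':

import numpy as np
from scipy.stats import multivariate_normal

def log_density_full(X, means, covars):
    # One column of per-sample log-densities for each mixture component.
    return np.column_stack([
        multivariate_normal(mean=mu, cov=cv).logpdf(X)
        for mu, cv in zip(means, covars)])
]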
- """ - log_multivariate_normal_density_dict = { - 'spherical': _log_multivariate_normal_density_spherical, - 'tied': _log_multivariate_normal_density_tied, - 'diag': _log_multivariate_normal_density_diag, - 'full': _log_multivariate_normal_density_full} - return log_multivariate_normal_density_dict[covariance_type]( - X, means, covars) - - -@deprecated("The function sample_gaussian is deprecated in 0.18" - " and will be removed in 0.20." - " Use numpy.random.multivariate_normal instead.") -def sample_gaussian(mean, covar, covariance_type='diag', n_samples=1, - random_state=None): - """Generate random samples from a Gaussian distribution. - - Parameters - ---------- - mean : array_like, shape (n_features,) - Mean of the distribution. - - covar : array_like - Covariance of the distribution. The shape depends on `covariance_type`: - scalar if 'spherical', - (n_features) if 'diag', - (n_features, n_features) if 'tied', or 'full' - - covariance_type : string, optional - Type of the covariance parameters. Must be one of - 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'. - - n_samples : int, optional - Number of samples to generate. Defaults to 1. - - Returns - ------- - X : array - Randomly generated sample. The shape depends on `n_samples`: - (n_features,) if `1` - (n_features, n_samples) otherwise - """ - return _sample_gaussian(mean, covar, covariance_type='diag', n_samples=1, - random_state=None) - - -def _sample_gaussian(mean, covar, covariance_type='diag', n_samples=1, - random_state=None): - rng = check_random_state(random_state) - n_dim = len(mean) - rand = rng.randn(n_dim, n_samples) - if n_samples == 1: - rand.shape = (n_dim,) - - if covariance_type == 'spherical': - rand *= np.sqrt(covar) - elif covariance_type == 'diag': - rand = np.dot(np.diag(np.sqrt(covar)), rand) - else: - s, U = linalg.eigh(covar) - s.clip(0, out=s) # get rid of tiny negatives - np.sqrt(s, out=s) - U *= s - rand = np.dot(U, rand) - - return (rand.T + mean).T - - -class _GMMBase(BaseEstimator): - """Gaussian Mixture Model. - - Representation of a Gaussian mixture model probability distribution. - This class allows for easy evaluation of, sampling from, and - maximum-likelihood estimation of the parameters of a GMM distribution. - - Initializes parameters such that every mixture component has zero - mean and identity covariance. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, optional - Number of mixture components. Defaults to 1. - - covariance_type : string, optional - String describing the type of covariance parameters to - use. Must be one of 'spherical', 'tied', 'diag', 'full'. - Defaults to 'diag'. - - random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - - min_covar : float, optional - Floor on the diagonal of the covariance matrix to prevent - overfitting. Defaults to 1e-3. - - tol : float, optional - Convergence threshold. EM iterations will stop when average - gain in log-likelihood is below this threshold. Defaults to 1e-3. - - n_iter : int, optional - Number of EM iterations to perform. - - n_init : int, optional - Number of initializations to perform. The best results is kept. - - params : string, optional - Controls which parameters are updated in the training - process. 
Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - init_params : string, optional - Controls which parameters are updated in the initialization - process. Can contain any combination of 'w' for weights, - 'm' for means, and 'c' for covars. Defaults to 'wmc'. - - verbose : int, default: 0 - Enable verbose output. If 1 then it always prints the current - initialization and iteration step. If greater than 1 then - it prints additionally the change and time needed for each step. - - Attributes - ---------- - weights_ : array, shape (`n_components`,) - This attribute stores the mixing weights for each mixture component. - - means_ : array, shape (`n_components`, `n_features`) - Mean parameters for each mixture component. - - covars_ : array - Covariance parameters for each mixture component. The shape - depends on `covariance_type`:: - - (n_components, n_features) if 'spherical', - (n_features, n_features) if 'tied', - (n_components, n_features) if 'diag', - (n_components, n_features, n_features) if 'full' - - converged_ : bool - True when convergence was reached in fit(), False otherwise. - - See Also - -------- - - DPGMM : Infinite gaussian mixture model, using the Dirichlet - process, fit with a variational algorithm - - - VBGMM : Finite gaussian mixture model fit with a variational - algorithm, better for situations where there might be too little - data to get a good estimate of the covariance matrix. - - Examples - -------- - - >>> import numpy as np - >>> from sklearn import mixture - >>> np.random.seed(1) - >>> g = mixture.GMM(n_components=2) - >>> # Generate random observations with two modes centered on 0 - >>> # and 10 to use for training. - >>> obs = np.concatenate((np.random.randn(100, 1), - ... 10 + np.random.randn(300, 1))) - >>> g.fit(obs) # doctest: +NORMALIZE_WHITESPACE - GMM(covariance_type='diag', init_params='wmc', min_covar=0.001, - n_components=2, n_init=1, n_iter=100, params='wmc', - random_state=None, tol=0.001, verbose=0) - >>> np.round(g.weights_, 2) - array([ 0.75, 0.25]) - >>> np.round(g.means_, 2) - array([[ 10.05], - [ 0.06]]) - >>> np.round(g.covars_, 2) # doctest: +SKIP - array([[[ 1.02]], - [[ 0.96]]]) - >>> g.predict([[0], [2], [9], [10]]) # doctest: +ELLIPSIS - array([1, 1, 0, 0]...) - >>> np.round(g.score([[0], [2], [9], [10]]), 2) - array([-2.19, -4.58, -1.75, -1.21]) - >>> # Refit the model on new data (initial parameters remain the - >>> # same), this time with an even split between the two modes. 
- >>> g.fit(20 * [[0]] + 20 * [[10]]) # doctest: +NORMALIZE_WHITESPACE - GMM(covariance_type='diag', init_params='wmc', min_covar=0.001, - n_components=2, n_init=1, n_iter=100, params='wmc', - random_state=None, tol=0.001, verbose=0) - >>> np.round(g.weights_, 2) - array([ 0.5, 0.5]) - - """ - - def __init__(self, n_components=1, covariance_type='diag', - random_state=None, tol=1e-3, min_covar=1e-3, - n_iter=100, n_init=1, params='wmc', init_params='wmc', - verbose=0): - self.n_components = n_components - self.covariance_type = covariance_type - self.tol = tol - self.min_covar = min_covar - self.random_state = random_state - self.n_iter = n_iter - self.n_init = n_init - self.params = params - self.init_params = init_params - self.verbose = verbose - - if covariance_type not in ['spherical', 'tied', 'diag', 'full']: - raise ValueError('Invalid value for covariance_type: %s' % - covariance_type) - - if n_init < 1: - raise ValueError('GMM estimation requires at least one run') - - def _get_covars(self): - """Covariance parameters for each mixture component. - - The shape depends on ``cvtype``:: - - (n_states, n_features) if 'spherical', - (n_features, n_features) if 'tied', - (n_states, n_features) if 'diag', - (n_states, n_features, n_features) if 'full' - - """ - if self.covariance_type == 'full': - return self.covars_ - elif self.covariance_type == 'diag': - return [np.diag(cov) for cov in self.covars_] - elif self.covariance_type == 'tied': - return [self.covars_] * self.n_components - elif self.covariance_type == 'spherical': - return [np.diag(cov) for cov in self.covars_] - - def _set_covars(self, covars): - """Provide values for covariance.""" - covars = np.asarray(covars) - _validate_covars(covars, self.covariance_type, self.n_components) - self.covars_ = covars - - def score_samples(self, X): - """Return the per-sample likelihood of the data under the model. - - Compute the log probability of X under the model and - return the posterior distribution (responsibilities) of each - mixture component for each element of X. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X. - - responsibilities : array_like, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation - """ - check_is_fitted(self, 'means_') - - X = check_array(X) - if X.ndim == 1: - X = X[:, np.newaxis] - if X.size == 0: - return np.array([]), np.empty((0, self.n_components)) - if X.shape[1] != self.means_.shape[1]: - raise ValueError('The shape of X is not compatible with self') - - lpr = (log_multivariate_normal_density(X, self.means_, self.covars_, - self.covariance_type) + - np.log(self.weights_)) - logprob = logsumexp(lpr, axis=1) - responsibilities = np.exp(lpr - logprob[:, np.newaxis]) - return logprob, responsibilities - - def score(self, X, y=None): - """Compute the log probability under the model. - - Parameters - ---------- - X : array_like, shape (n_samples, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - logprob : array_like, shape (n_samples,) - Log probabilities of each data point in X - """ - logprob, _ = self.score_samples(X) - return logprob - - def predict(self, X): - """Predict label for data. 
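[Reviewer note: the normalization at the heart of score_samples above is worth isolating — responsibilities are a softmax over per-component log densities plus log mixing weights. A standalone sketch, assuming a recent SciPy that exposes logsumexp in scipy.special:

import numpy as np
from scipy.special import logsumexp

def e_step(log_densities, weights):
    # logsumexp keeps the per-sample normalization numerically stable.
    lpr = log_densities + np.log(weights)        # (n_samples, n_components)
    logprob = logsumexp(lpr, axis=1)             # per-sample log-likelihood
    resp = np.exp(lpr - logprob[:, np.newaxis])  # rows sum to one
    return logprob, resp
]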
- - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - C : array, shape = (n_samples,) component memberships - """ - logprob, responsibilities = self.score_samples(X) - return responsibilities.argmax(axis=1) - - def predict_proba(self, X): - """Predict posterior probability of data under each Gaussian - in the model. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - responsibilities : array-like, shape = (n_samples, n_components) - Returns the probability of the sample for each Gaussian - (state) in the model. - """ - logprob, responsibilities = self.score_samples(X) - return responsibilities - - def sample(self, n_samples=1, random_state=None): - """Generate random samples from the model. - - Parameters - ---------- - n_samples : int, optional - Number of samples to generate. Defaults to 1. - - Returns - ------- - X : array_like, shape (n_samples, n_features) - List of samples - """ - check_is_fitted(self, 'means_') - - if random_state is None: - random_state = self.random_state - random_state = check_random_state(random_state) - weight_cdf = np.cumsum(self.weights_) - - X = np.empty((n_samples, self.means_.shape[1])) - rand = random_state.rand(n_samples) - # decide which component to use for each sample - comps = weight_cdf.searchsorted(rand) - # for each component, generate all needed samples - for comp in range(self.n_components): - # occurrences of current component in X - comp_in_X = (comp == comps) - # number of those occurrences - num_comp_in_X = comp_in_X.sum() - if num_comp_in_X > 0: - if self.covariance_type == 'tied': - cv = self.covars_ - elif self.covariance_type == 'spherical': - cv = self.covars_[comp][0] - else: - cv = self.covars_[comp] - X[comp_in_X] = _sample_gaussian( - self.means_[comp], cv, self.covariance_type, - num_comp_in_X, random_state=random_state).T - return X - - def fit_predict(self, X, y=None): - """Fit and then predict labels for data. - - Warning: Due to the final maximization step in the EM algorithm, - with low iterations the prediction may not be 100% accurate. - - .. versionadded:: 0.17 - *fit_predict* method in Gaussian Mixture Model. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - C : array, shape = (n_samples,) component memberships - """ - return self._fit(X, y).argmax(axis=1) - - def _fit(self, X, y=None, do_prediction=False): - """Estimate model parameters with the EM algorithm. - - A initialization step is performed before entering the - expectation-maximization (EM) algorithm. If you want to avoid - this step, set the keyword argument init_params to the empty - string '' when creating the GMM object. Likewise, if you would - like just to do an initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - responsibilities : array, shape (n_samples, n_components) - Posterior probabilities of each mixture component for each - observation. 
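[Reviewer note: the inverse-CDF component selection used in sample() above (weight_cdf.searchsorted) generalizes to any mixture. A standalone sketch, assuming full covariance matrices and a NumPy RandomState `rng`:

import numpy as np

def sample_mixture(weights, means, covars, n_samples, rng):
    # Draw a component index per sample from the CDF of the mixing
    # weights, then draw from the selected Gaussian.
    comps = np.cumsum(weights).searchsorted(rng.rand(n_samples))
    return np.array([rng.multivariate_normal(means[c], covars[c])
                     for c in comps])
]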
- """ - - # initialization step - X = check_array(X, dtype=np.float64, ensure_min_samples=2, - estimator=self) - if X.shape[0] < self.n_components: - raise ValueError( - 'GMM estimation with %s components, but got only %s samples' % - (self.n_components, X.shape[0])) - - max_log_prob = -np.infty - - if self.verbose > 0: - print('Expectation-maximization algorithm started.') - - for init in range(self.n_init): - if self.verbose > 0: - print('Initialization ' + str(init + 1)) - start_init_time = time() - - if 'm' in self.init_params or not hasattr(self, 'means_'): - self.means_ = cluster.KMeans( - n_clusters=self.n_components, - random_state=self.random_state).fit(X).cluster_centers_ - if self.verbose > 1: - print('\tMeans have been initialized.') - - if 'w' in self.init_params or not hasattr(self, 'weights_'): - self.weights_ = np.tile(1.0 / self.n_components, - self.n_components) - if self.verbose > 1: - print('\tWeights have been initialized.') - - if 'c' in self.init_params or not hasattr(self, 'covars_'): - cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1]) - if not cv.shape: - cv.shape = (1, 1) - self.covars_ = \ - distribute_covar_matrix_to_match_covariance_type( - cv, self.covariance_type, self.n_components) - if self.verbose > 1: - print('\tCovariance matrices have been initialized.') - - # EM algorithms - current_log_likelihood = None - # reset self.converged_ to False - self.converged_ = False - - for i in range(self.n_iter): - if self.verbose > 0: - print('\tEM iteration ' + str(i + 1)) - start_iter_time = time() - prev_log_likelihood = current_log_likelihood - # Expectation step - log_likelihoods, responsibilities = self.score_samples(X) - current_log_likelihood = log_likelihoods.mean() - - # Check for convergence. - if prev_log_likelihood is not None: - change = abs(current_log_likelihood - prev_log_likelihood) - if self.verbose > 1: - print('\t\tChange: ' + str(change)) - if change < self.tol: - self.converged_ = True - if self.verbose > 0: - print('\t\tEM algorithm converged.') - break - - # Maximization step - self._do_mstep(X, responsibilities, self.params, - self.min_covar) - if self.verbose > 1: - print('\t\tEM iteration ' + str(i + 1) + ' took {0:.5f}s'.format( - time() - start_iter_time)) - - # if the results are better, keep it - if self.n_iter: - if current_log_likelihood > max_log_prob: - max_log_prob = current_log_likelihood - best_params = {'weights': self.weights_, - 'means': self.means_, - 'covars': self.covars_} - if self.verbose > 1: - print('\tBetter parameters were found.') - - if self.verbose > 1: - print('\tInitialization ' + str(init + 1) + ' took {0:.5f}s'.format( - time() - start_init_time)) - - # check the existence of an init param that was not subject to - # likelihood computation issue. - if np.isneginf(max_log_prob) and self.n_iter: - raise RuntimeError( - "EM algorithm was never able to compute a valid likelihood " + - "given initial parameters. Try different init parameters " + - "(or increasing n_init) or check for degenerate data.") - - if self.n_iter: - self.covars_ = best_params['covars'] - self.means_ = best_params['means'] - self.weights_ = best_params['weights'] - else: # self.n_iter == 0 occurs when using GMM within HMM - # Need to make sure that there are responsibilities to output - # Output zeros because it was just a quick initialization - responsibilities = np.zeros((X.shape[0], self.n_components)) - - return responsibilities - - def fit(self, X, y=None): - """Estimate model parameters with the EM algorithm. 
- - A initialization step is performed before entering the - expectation-maximization (EM) algorithm. If you want to avoid - this step, set the keyword argument init_params to the empty - string '' when creating the GMM object. Likewise, if you would - like just to do an initialization, set n_iter=0. - - Parameters - ---------- - X : array_like, shape (n, n_features) - List of n_features-dimensional data points. Each row - corresponds to a single data point. - - Returns - ------- - self - """ - self._fit(X, y) - return self - - def _do_mstep(self, X, responsibilities, params, min_covar=0): - """Perform the Mstep of the EM algorithm and return the cluster weights. - """ - weights = responsibilities.sum(axis=0) - weighted_X_sum = np.dot(responsibilities.T, X) - inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS) - - if 'w' in params: - self.weights_ = (weights / (weights.sum() + 10 * EPS) + EPS) - if 'm' in params: - self.means_ = weighted_X_sum * inverse_weights - if 'c' in params: - covar_mstep_func = _covar_mstep_funcs[self.covariance_type] - self.covars_ = covar_mstep_func( - self, X, responsibilities, weighted_X_sum, inverse_weights, - min_covar) - return weights - - def _n_parameters(self): - """Return the number of free parameters in the model.""" - ndim = self.means_.shape[1] - if self.covariance_type == 'full': - cov_params = self.n_components * ndim * (ndim + 1) / 2. - elif self.covariance_type == 'diag': - cov_params = self.n_components * ndim - elif self.covariance_type == 'tied': - cov_params = ndim * (ndim + 1) / 2. - elif self.covariance_type == 'spherical': - cov_params = self.n_components - mean_params = ndim * self.n_components - return int(cov_params + mean_params + self.n_components - 1) - - def bic(self, X): - """Bayesian information criterion for the current model fit - and the proposed data. - - Parameters - ---------- - X : array of shape(n_samples, n_dimensions) - - Returns - ------- - bic : float (the lower the better) - """ - return (-2 * self.score(X).sum() + - self._n_parameters() * np.log(X.shape[0])) - - def aic(self, X): - """Akaike information criterion for the current model fit - and the proposed data. - - Parameters - ---------- - X : array of shape(n_samples, n_dimensions) - - Returns - ------- - aic : float (the lower the better) - """ - return - 2 * self.score(X).sum() + 2 * self._n_parameters() - - -@deprecated("The class GMM is deprecated in 0.18 and will be " - " removed in 0.20. Use class GaussianMixture instead.") -class GMM(_GMMBase): - """ - Legacy Gaussian Mixture Model - - .. deprecated:: 0.18 - This class will be removed in 0.20. - Use :class:`sklearn.mixture.GaussianMixture` instead. 
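[Reviewer note: mirroring the deprecation note, the GMM doctest earlier in this file translates to GaussianMixture roughly as follows — a sketch, noting that in the new API `n_iter` became `max_iter` and fitted covariances live in `covariances_` rather than `covars_`:

import numpy as np
from sklearn.mixture import GaussianMixture

np.random.seed(1)
obs = np.concatenate((np.random.randn(100, 1),
                      10 + np.random.randn(300, 1)))

g = GaussianMixture(n_components=2, covariance_type='diag', max_iter=100)
g.fit(obs)
print(np.round(g.weights_, 2))           # mixing weights
print(g.predict([[0], [2], [9], [10]]))  # component memberships
]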
- - """ - - def __init__(self, n_components=1, covariance_type='diag', - random_state=None, tol=1e-3, min_covar=1e-3, - n_iter=100, n_init=1, params='wmc', init_params='wmc', - verbose=0): - super(GMM, self).__init__( - n_components=n_components, covariance_type=covariance_type, - random_state=random_state, tol=tol, min_covar=min_covar, - n_iter=n_iter, n_init=n_init, params=params, - init_params=init_params, verbose=verbose) - -######################################################################### -# some helper routines -######################################################################### - - -def _log_multivariate_normal_density_diag(X, means, covars): - """Compute Gaussian log-density at X for a diagonal model.""" - n_samples, n_dim = X.shape - lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.sum(np.log(covars), 1) - + np.sum((means ** 2) / covars, 1) - - 2 * np.dot(X, (means / covars).T) - + np.dot(X ** 2, (1.0 / covars).T)) - return lpr - - -def _log_multivariate_normal_density_spherical(X, means, covars): - """Compute Gaussian log-density at X for a spherical model.""" - cv = covars.copy() - if covars.ndim == 1: - cv = cv[:, np.newaxis] - if cv.shape[1] == 1: - cv = np.tile(cv, (1, X.shape[-1])) - return _log_multivariate_normal_density_diag(X, means, cv) - - -def _log_multivariate_normal_density_tied(X, means, covars): - """Compute Gaussian log-density at X for a tied model.""" - cv = np.tile(covars, (means.shape[0], 1, 1)) - return _log_multivariate_normal_density_full(X, means, cv) - - -def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7): - """Log probability for full covariance matrices.""" - n_samples, n_dim = X.shape - nmix = len(means) - log_prob = np.empty((n_samples, nmix)) - for c, (mu, cv) in enumerate(zip(means, covars)): - try: - cv_chol = linalg.cholesky(cv, lower=True) - except linalg.LinAlgError: - # The model is most probably stuck in a component with too - # few observations, we need to reinitialize this components - try: - cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), - lower=True) - except linalg.LinAlgError: - raise ValueError("'covars' must be symmetric, " - "positive-definite") - - cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) - cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T - log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) + - n_dim * np.log(2 * np.pi) + cv_log_det) - - return log_prob - - -def _validate_covars(covars, covariance_type, n_components): - """Do basic checks on matrix covariance sizes and values.""" - from scipy import linalg - if covariance_type == 'spherical': - if len(covars) != n_components: - raise ValueError("'spherical' covars have length n_components") - elif np.any(covars <= 0): - raise ValueError("'spherical' covars must be non-negative") - elif covariance_type == 'tied': - if covars.shape[0] != covars.shape[1]: - raise ValueError("'tied' covars must have shape (n_dim, n_dim)") - elif (not np.allclose(covars, covars.T) - or np.any(linalg.eigvalsh(covars) <= 0)): - raise ValueError("'tied' covars must be symmetric, " - "positive-definite") - elif covariance_type == 'diag': - if len(covars.shape) != 2: - raise ValueError("'diag' covars must have shape " - "(n_components, n_dim)") - elif np.any(covars <= 0): - raise ValueError("'diag' covars must be non-negative") - elif covariance_type == 'full': - if len(covars.shape) != 3: - raise ValueError("'full' covars must have shape " - "(n_components, n_dim, n_dim)") - elif covars.shape[1] != covars.shape[2]: - raise 
ValueError("'full' covars must have shape " - "(n_components, n_dim, n_dim)") - for n, cv in enumerate(covars): - if (not np.allclose(cv, cv.T) - or np.any(linalg.eigvalsh(cv) <= 0)): - raise ValueError("component %d of 'full' covars must be " - "symmetric, positive-definite" % n) - else: - raise ValueError("covariance_type must be one of " + - "'spherical', 'tied', 'diag', 'full'") - - -@deprecated("The function distribute_covar_matrix_to_match_covariance_type" - "is deprecated in 0.18 and will be removed in 0.20.") -def distribute_covar_matrix_to_match_covariance_type( - tied_cv, covariance_type, n_components): - """Create all the covariance matrices from a given template.""" - if covariance_type == 'spherical': - cv = np.tile(tied_cv.mean() * np.ones(tied_cv.shape[1]), - (n_components, 1)) - elif covariance_type == 'tied': - cv = tied_cv - elif covariance_type == 'diag': - cv = np.tile(np.diag(tied_cv), (n_components, 1)) - elif covariance_type == 'full': - cv = np.tile(tied_cv, (n_components, 1, 1)) - else: - raise ValueError("covariance_type must be one of " + - "'spherical', 'tied', 'diag', 'full'") - return cv - - -def _covar_mstep_diag(gmm, X, responsibilities, weighted_X_sum, norm, - min_covar): - """Perform the covariance M step for diagonal cases.""" - avg_X2 = np.dot(responsibilities.T, X * X) * norm - avg_means2 = gmm.means_ ** 2 - avg_X_means = gmm.means_ * weighted_X_sum * norm - return avg_X2 - 2 * avg_X_means + avg_means2 + min_covar - - -def _covar_mstep_spherical(*args): - """Perform the covariance M step for spherical cases.""" - cv = _covar_mstep_diag(*args) - return np.tile(cv.mean(axis=1)[:, np.newaxis], (1, cv.shape[1])) - - -def _covar_mstep_full(gmm, X, responsibilities, weighted_X_sum, norm, - min_covar): - """Perform the covariance M step for full cases.""" - # Eq. 12 from K. Murphy, "Fitting a Conditional Linear Gaussian - # Distribution" - n_features = X.shape[1] - cv = np.empty((gmm.n_components, n_features, n_features)) - for c in range(gmm.n_components): - post = responsibilities[:, c] - mu = gmm.means_[c] - diff = X - mu - with np.errstate(under='ignore'): - # Underflow Errors in doing post * X.T are not important - avg_cv = np.dot(post * diff.T, diff) / (post.sum() + 10 * EPS) - cv[c] = avg_cv + min_covar * np.eye(n_features) - return cv - - -def _covar_mstep_tied(gmm, X, responsibilities, weighted_X_sum, norm, - min_covar): - """Perform the covariance M step for tied cases.""" - # Eq. 15 from K. Murphy, "Fitting a Conditional Linear Gaussian - # Distribution" - avg_X2 = np.dot(X.T, X) - avg_means2 = np.dot(gmm.means_.T, weighted_X_sum) - out = avg_X2 - avg_means2 - out *= 1. / X.shape[0] - out.flat[::len(out) + 1] += min_covar - return out - -_covar_mstep_funcs = {'spherical': _covar_mstep_spherical, - 'diag': _covar_mstep_diag, - 'tied': _covar_mstep_tied, - 'full': _covar_mstep_full, - } diff --git a/sklearn/mixture/tests/test_dpgmm.py b/sklearn/mixture/tests/test_dpgmm.py deleted file mode 100644 index 8ca38626b4cef..0000000000000 --- a/sklearn/mixture/tests/test_dpgmm.py +++ /dev/null @@ -1,237 +0,0 @@ -# Important note for the deprecation cleaning of 0.20 : -# All the function and classes of this file have been deprecated in 0.18. 
-# When you remove this file please also remove the related files -# - 'sklearn/mixture/dpgmm.py' -# - 'sklearn/mixture/gmm.py' -# - 'sklearn/mixture/test_gmm.py' -import unittest -import sys - -import numpy as np - -from sklearn.mixture import DPGMM, VBGMM -from sklearn.mixture.dpgmm import log_normalize -from sklearn.datasets import make_blobs -from sklearn.utils.testing import assert_array_less, assert_equal -from sklearn.utils.testing import assert_warns_message, ignore_warnings -from sklearn.mixture.tests.test_gmm import GMMTester -from sklearn.externals.six.moves import cStringIO as StringIO -from sklearn.mixture.dpgmm import digamma, gammaln -from sklearn.mixture.dpgmm import wishart_log_det, wishart_logz - - -np.seterr(all='warn') - - -@ignore_warnings(category=DeprecationWarning) -def test_class_weights(): - # check that the class weights are updated - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50) - dpgmm.fit(X) - # get indices of components that are used: - indices = np.unique(dpgmm.predict(X)) - active = np.zeros(10, dtype=np.bool) - active[indices] = True - # used components are important - assert_array_less(.1, dpgmm.weights_[active]) - # others are not - assert_array_less(dpgmm.weights_[~active], .05) - - -@ignore_warnings(category=DeprecationWarning) -def test_verbose_boolean(): - # checks that the output for the verbose output is the same - # for the flag values '1' and 'True' - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm_bool = Model(n_components=10, random_state=1, alpha=20, - n_iter=50, verbose=True) - dpgmm_int = Model(n_components=10, random_state=1, alpha=20, - n_iter=50, verbose=1) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - # generate output with the boolean flag - dpgmm_bool.fit(X) - verbose_output = sys.stdout - verbose_output.seek(0) - bool_output = verbose_output.readline() - # generate output with the int flag - dpgmm_int.fit(X) - verbose_output = sys.stdout - verbose_output.seek(0) - int_output = verbose_output.readline() - assert_equal(bool_output, int_output) - finally: - sys.stdout = old_stdout - - -@ignore_warnings(category=DeprecationWarning) -def test_verbose_first_level(): - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50, - verbose=1) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - dpgmm.fit(X) - finally: - sys.stdout = old_stdout - - -@ignore_warnings(category=DeprecationWarning) -def test_verbose_second_level(): - # simple 3 cluster dataset - X, y = make_blobs(random_state=1) - for Model in [DPGMM, VBGMM]: - dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50, - verbose=2) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - dpgmm.fit(X) - finally: - sys.stdout = old_stdout - - -@ignore_warnings(category=DeprecationWarning) -def test_digamma(): - assert_warns_message(DeprecationWarning, "The function digamma is" - " deprecated in 0.18 and will be removed in 0.20. " - "Use scipy.special.digamma instead.", digamma, 3) - - -@ignore_warnings(category=DeprecationWarning) -def test_gammaln(): - assert_warns_message(DeprecationWarning, "The function gammaln" - " is deprecated in 0.18 and will be removed" - " in 0.20. 
Use scipy.special.gammaln instead.", - gammaln, 3) - - -@ignore_warnings(category=DeprecationWarning) -def test_log_normalize(): - v = np.array([0.1, 0.8, 0.01, 0.09]) - a = np.log(2 * v) - result = assert_warns_message(DeprecationWarning, "The function " - "log_normalize is deprecated in 0.18 and" - " will be removed in 0.20.", - log_normalize, a) - assert np.allclose(v, result, rtol=0.01) - - -@ignore_warnings(category=DeprecationWarning) -def test_wishart_log_det(): - a = np.array([0.1, 0.8, 0.01, 0.09]) - b = np.array([0.2, 0.7, 0.05, 0.1]) - assert_warns_message(DeprecationWarning, "The function " - "wishart_log_det is deprecated in 0.18 and" - " will be removed in 0.20.", - wishart_log_det, a, b, 2, 4) - - -@ignore_warnings(category=DeprecationWarning) -def test_wishart_logz(): - assert_warns_message(DeprecationWarning, "The function " - "wishart_logz is deprecated in 0.18 and " - "will be removed in 0.20.", wishart_logz, - 3, np.identity(3), 1, 3) - - -@ignore_warnings(category=DeprecationWarning) -def test_DPGMM_deprecation(): - assert_warns_message( - DeprecationWarning, "The `DPGMM` class is not working correctly and " - "it's better to use `sklearn.mixture.BayesianGaussianMixture` class " - "with parameter `weight_concentration_prior_type='dirichlet_process'` " - "instead. DPGMM is deprecated in 0.18 and will be removed in 0.20.", - DPGMM) - - -def do_model(self, **kwds): - return VBGMM(verbose=False, **kwds) - - -class DPGMMTester(GMMTester): - model = DPGMM - do_test_eval = False - - def score(self, g, train_obs): - _, z = g.score_samples(train_obs) - return g.lower_bound(train_obs, z) - - -class TestDPGMMWithSphericalCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'spherical' - setUp = GMMTester._setUp - - -class TestDPGMMWithDiagCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'diag' - setUp = GMMTester._setUp - - -class TestDPGMMWithTiedCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'tied' - setUp = GMMTester._setUp - - -class TestDPGMMWithFullCovars(unittest.TestCase, DPGMMTester): - covariance_type = 'full' - setUp = GMMTester._setUp - - -def test_VBGMM_deprecation(): - assert_warns_message( - DeprecationWarning, "The `VBGMM` class is not working correctly and " - "it's better to use `sklearn.mixture.BayesianGaussianMixture` class " - "with parameter `weight_concentration_prior_type=" - "'dirichlet_distribution'` instead. VBGMM is deprecated " - "in 0.18 and will be removed in 0.20.", VBGMM) - - -class VBGMMTester(GMMTester): - model = do_model - do_test_eval = False - - def score(self, g, train_obs): - _, z = g.score_samples(train_obs) - return g.lower_bound(train_obs, z) - - -class TestVBGMMWithSphericalCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'spherical' - setUp = GMMTester._setUp - - -class TestVBGMMWithDiagCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'diag' - setUp = GMMTester._setUp - - -class TestVBGMMWithTiedCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'tied' - setUp = GMMTester._setUp - - -class TestVBGMMWithFullCovars(unittest.TestCase, VBGMMTester): - covariance_type = 'full' - setUp = GMMTester._setUp - - -def test_vbgmm_no_modify_alpha(): - alpha = 2. 
- n_components = 3 - X, y = make_blobs(random_state=1) - vbgmm = VBGMM(n_components=n_components, alpha=alpha, n_iter=1) - assert_equal(vbgmm.alpha, alpha) - assert_equal(vbgmm.fit(X).alpha_, float(alpha) / n_components) diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py deleted file mode 100644 index 137703adfcad4..0000000000000 --- a/sklearn/mixture/tests/test_gmm.py +++ /dev/null @@ -1,534 +0,0 @@ -# Important note for the deprecation cleaning of 0.20 : -# All the functions and classes of this file have been deprecated in 0.18. -# When you remove this file please remove the related files -# - 'sklearn/mixture/dpgmm.py' -# - 'sklearn/mixture/gmm.py' -# - 'sklearn/mixture/test_dpgmm.py' -import unittest -import copy -import sys - -import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from scipy import stats -from sklearn import mixture -from sklearn.datasets.samples_generator import make_spd_matrix -from sklearn.utils.testing import (assert_true, assert_greater, - assert_raise_message, assert_warns_message, - ignore_warnings, assert_raises) -from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.externals.six.moves import cStringIO as StringIO - - -rng = np.random.RandomState(0) - - -def test_sample_gaussian(): - # Test sample generation from mixture.sample_gaussian where covariance - # is diagonal, spherical and full - - n_features, n_samples = 2, 300 - axis = 1 - mu = rng.randint(10) * rng.rand(n_features) - cv = (rng.rand(n_features) + 1.0) ** 2 - - samples = mixture.gmm._sample_gaussian( - mu, cv, covariance_type='diag', n_samples=n_samples) - - assert_true(np.allclose(samples.mean(axis), mu, atol=1.3)) - assert_true(np.allclose(samples.var(axis), cv, atol=1.5)) - - # the same for spherical covariances - cv = (rng.rand() + 1.0) ** 2 - samples = mixture.gmm._sample_gaussian( - mu, cv, covariance_type='spherical', n_samples=n_samples) - - assert_true(np.allclose(samples.mean(axis), mu, atol=1.5)) - assert_true(np.allclose( - samples.var(axis), np.repeat(cv, n_features), atol=1.5)) - - # and for full covariances - A = rng.randn(n_features, n_features) - cv = np.dot(A.T, A) + np.eye(n_features) - samples = mixture.gmm._sample_gaussian( - mu, cv, covariance_type='full', n_samples=n_samples) - assert_true(np.allclose(samples.mean(axis), mu, atol=1.3)) - assert_true(np.allclose(np.cov(samples), cv, atol=2.5)) - - # Numerical stability check: in SciPy 0.12.0 at least, eigh may return - # tiny negative values in its second return value. 
- x = mixture.gmm._sample_gaussian( - [0, 0], [[4, 3], [1, .1]], covariance_type='full', random_state=42) - assert_true(np.isfinite(x).all()) - - -def _naive_lmvnpdf_diag(X, mu, cv): - # slow and naive implementation of lmvnpdf - ref = np.empty((len(X), len(mu))) - stds = np.sqrt(cv) - for i, (m, std) in enumerate(zip(mu, stds)): - ref[:, i] = np.log(stats.norm.pdf(X, m, std)).sum(axis=1) - return ref - - -def test_lmvnpdf_diag(): - # test a slow and naive implementation of lmvnpdf and - # compare it to the vectorized version (mixture.lmvnpdf) to test - # for correctness - n_features, n_components, n_samples = 2, 3, 10 - mu = rng.randint(10) * rng.rand(n_components, n_features) - cv = (rng.rand(n_components, n_features) + 1.0) ** 2 - X = rng.randint(10) * rng.rand(n_samples, n_features) - - ref = _naive_lmvnpdf_diag(X, mu, cv) - lpr = assert_warns_message(DeprecationWarning, "The function" - " log_multivariate_normal_density is " - "deprecated in 0.18 and will be removed in 0.20.", - mixture.log_multivariate_normal_density, - X, mu, cv, 'diag') - assert_array_almost_equal(lpr, ref) - - -def test_lmvnpdf_spherical(): - n_features, n_components, n_samples = 2, 3, 10 - - mu = rng.randint(10) * rng.rand(n_components, n_features) - spherecv = rng.rand(n_components, 1) ** 2 + 1 - X = rng.randint(10) * rng.rand(n_samples, n_features) - - cv = np.tile(spherecv, (n_features, 1)) - reference = _naive_lmvnpdf_diag(X, mu, cv) - lpr = assert_warns_message(DeprecationWarning, "The function" - " log_multivariate_normal_density is " - "deprecated in 0.18 and will be removed in 0.20.", - mixture.log_multivariate_normal_density, - X, mu, spherecv, 'spherical') - assert_array_almost_equal(lpr, reference) - -def test_lmvnpdf_full(): - n_features, n_components, n_samples = 2, 3, 10 - - mu = rng.randint(10) * rng.rand(n_components, n_features) - cv = (rng.rand(n_components, n_features) + 1.0) ** 2 - X = rng.randint(10) * rng.rand(n_samples, n_features) - - fullcv = np.array([np.diag(x) for x in cv]) - - reference = _naive_lmvnpdf_diag(X, mu, cv) - lpr = assert_warns_message(DeprecationWarning, "The function" - " log_multivariate_normal_density is " - "deprecated in 0.18 and will be removed in 0.20.", - mixture.log_multivariate_normal_density, - X, mu, fullcv, 'full') - assert_array_almost_equal(lpr, reference) - - -def test_lvmpdf_full_cv_non_positive_definite(): - n_features, n_samples = 2, 10 - rng = np.random.RandomState(0) - X = rng.randint(10) * rng.rand(n_samples, n_features) - mu = np.mean(X, 0) - cv = np.array([[[-1, 0], [0, 1]]]) - expected_message = "'covars' must be symmetric, positive-definite" - assert_raise_message(ValueError, expected_message, - mixture.log_multivariate_normal_density, - X, mu, cv, 'full') - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_GMM_attributes(): - n_components, n_features = 10, 4 - covariance_type = 'diag' - g = mixture.GMM(n_components, covariance_type, random_state=rng) - weights = rng.rand(n_components) - weights = weights / weights.sum() - means = rng.randint(-20, 20, (n_components, n_features)) - - assert_true(g.n_components == n_components) - assert_true(g.covariance_type == covariance_type) - - g.weights_ = weights - assert_array_almost_equal(g.weights_, weights) - g.means_ = means - assert_array_almost_equal(g.means_, means) - - covars = (0.1 + 2 * rng.rand(n_components, n_features)) ** 2 - g.covars_ = covars - assert_array_almost_equal(g.covars_, covars) - assert_raises(ValueError, g._set_covars, 
[]) - assert_raises(ValueError, g._set_covars, - np.zeros((n_components - 2, n_features))) - - assert_raises(ValueError, mixture.GMM, n_components=20, - covariance_type='badcovariance_type') - - -class GMMTester(): - do_test_eval = True - - def _setUp(self): - self.n_components = 10 - self.n_features = 4 - self.weights = rng.rand(self.n_components) - self.weights = self.weights / self.weights.sum() - self.means = rng.randint(-20, 20, (self.n_components, self.n_features)) - self.threshold = -0.5 - self.I = np.eye(self.n_features) - self.covars = { - 'spherical': (0.1 + 2 * rng.rand(self.n_components, - self.n_features)) ** 2, - 'tied': (make_spd_matrix(self.n_features, random_state=0) - + 5 * self.I), - 'diag': (0.1 + 2 * rng.rand(self.n_components, - self.n_features)) ** 2, - 'full': np.array([make_spd_matrix(self.n_features, random_state=0) - + 5 * self.I for x in range(self.n_components)])} - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_eval(self): - if not self.do_test_eval: - return # DPGMM does not support setting the means and - # covariances before fitting There is no way of fixing this - # due to the variational parameters being more expressive than - # covariance matrices - g = self.model(n_components=self.n_components, - covariance_type=self.covariance_type, random_state=rng) - # Make sure the means are far apart so responsibilities.argmax() - # picks the actual component used to generate the observations. - g.means_ = 20 * self.means - g.covars_ = self.covars[self.covariance_type] - g.weights_ = self.weights - - gaussidx = np.repeat(np.arange(self.n_components), 5) - n_samples = len(gaussidx) - X = rng.randn(n_samples, self.n_features) + g.means_[gaussidx] - - with ignore_warnings(category=DeprecationWarning): - ll, responsibilities = g.score_samples(X) - - self.assertEqual(len(ll), n_samples) - self.assertEqual(responsibilities.shape, - (n_samples, self.n_components)) - assert_array_almost_equal(responsibilities.sum(axis=1), - np.ones(n_samples)) - assert_array_equal(responsibilities.argmax(axis=1), gaussidx) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_sample(self, n=100): - g = self.model(n_components=self.n_components, - covariance_type=self.covariance_type, - random_state=rng) - # Make sure the means are far apart so responsibilities.argmax() - # picks the actual component used to generate the observations. - g.means_ = 20 * self.means - g.covars_ = np.maximum(self.covars[self.covariance_type], 0.1) - g.weights_ = self.weights - - with ignore_warnings(category=DeprecationWarning): - samples = g.sample(n) - self.assertEqual(samples.shape, (n, self.n_features)) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_train(self, params='wmc'): - g = mixture.GMM(n_components=self.n_components, - covariance_type=self.covariance_type) - with ignore_warnings(category=DeprecationWarning): - g.weights_ = self.weights - g.means_ = self.means - g.covars_ = 20 * self.covars[self.covariance_type] - - # Create a training set by sampling from the predefined distribution. 
- with ignore_warnings(category=DeprecationWarning): - X = g.sample(n_samples=100) - g = self.model(n_components=self.n_components, - covariance_type=self.covariance_type, - random_state=rng, min_covar=1e-1, - n_iter=1, init_params=params) - g.fit(X) - - # Do one training iteration at a time so we can keep track of - # the log likelihood to make sure that it increases after each - # iteration. - trainll = [] - with ignore_warnings(category=DeprecationWarning): - for _ in range(5): - g.params = params - g.init_params = '' - g.fit(X) - trainll.append(self.score(g, X)) - g.n_iter = 10 - g.init_params = '' - g.params = params - g.fit(X) # finish fitting - - # Note that the log likelihood will sometimes decrease by a - # very small amount after it has more or less converged due to - # the addition of min_covar to the covariance (to prevent - # underflow). This is why the threshold is set to -0.5 - # instead of 0. - with ignore_warnings(category=DeprecationWarning): - delta_min = np.diff(trainll).min() - self.assertTrue( - delta_min > self.threshold, - "The min nll increase is %f which is lower than the admissible" - " threshold of %f, for model %s. The likelihoods are %s." - % (delta_min, self.threshold, self.covariance_type, trainll)) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_train_degenerate(self, params='wmc'): - # Train on degenerate data with 0 in some dimensions - # Create a training set by sampling from the predefined - # distribution. - X = rng.randn(100, self.n_features) - X.T[1:] = 0 - g = self.model(n_components=2, - covariance_type=self.covariance_type, - random_state=rng, min_covar=1e-3, n_iter=5, - init_params=params) - with ignore_warnings(category=DeprecationWarning): - g.fit(X) - trainll = g.score(X) - self.assertTrue(np.sum(np.abs(trainll / 100 / X.shape[1])) < 5) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def test_train_1d(self, params='wmc'): - # Train on 1-D data - # Create a training set by sampling from the predefined - # distribution. 
- X = rng.randn(100, 1) - # X.T[1:] = 0 - g = self.model(n_components=2, - covariance_type=self.covariance_type, - random_state=rng, min_covar=1e-7, n_iter=5, - init_params=params) - with ignore_warnings(category=DeprecationWarning): - g.fit(X) - trainll = g.score(X) - if isinstance(g, mixture.dpgmm._DPGMMBase): - self.assertTrue(np.sum(np.abs(trainll / 100)) < 5) - else: - self.assertTrue(np.sum(np.abs(trainll / 100)) < 2) - - # This function tests the deprecated old GMM class - @ignore_warnings(category=DeprecationWarning) - def score(self, g, X): - with ignore_warnings(category=DeprecationWarning): - return g.score(X).sum() - - -class TestGMMWithSphericalCovars(unittest.TestCase, GMMTester): - covariance_type = 'spherical' - model = mixture.GMM - setUp = GMMTester._setUp - - -class TestGMMWithDiagonalCovars(unittest.TestCase, GMMTester): - covariance_type = 'diag' - model = mixture.GMM - setUp = GMMTester._setUp - - -class TestGMMWithTiedCovars(unittest.TestCase, GMMTester): - covariance_type = 'tied' - model = mixture.GMM - setUp = GMMTester._setUp - - -class TestGMMWithFullCovars(unittest.TestCase, GMMTester): - covariance_type = 'full' - model = mixture.GMM - setUp = GMMTester._setUp - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_multiple_init(): - # Test that multiple inits does not much worse than a single one - X = rng.randn(30, 5) - X[:10] += 2 - g = mixture.GMM(n_components=2, covariance_type='spherical', - random_state=rng, min_covar=1e-7, n_iter=5) - with ignore_warnings(category=DeprecationWarning): - train1 = g.fit(X).score(X).sum() - g.n_init = 5 - train2 = g.fit(X).score(X).sum() - assert_true(train2 >= train1 - 1.e-2) - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_n_parameters(): - n_samples, n_dim, n_components = 7, 5, 2 - X = rng.randn(n_samples, n_dim) - n_params = {'spherical': 13, 'diag': 21, 'tied': 26, 'full': 41} - for cv_type in ['full', 'tied', 'diag', 'spherical']: - with ignore_warnings(category=DeprecationWarning): - g = mixture.GMM(n_components=n_components, covariance_type=cv_type, - random_state=rng, min_covar=1e-7, n_iter=1) - g.fit(X) - assert_true(g._n_parameters() == n_params[cv_type]) - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_1d_1component(): - # Test all of the covariance_types return the same BIC score for - # 1-dimensional, 1 component fits. 
- n_samples, n_dim, n_components = 100, 1, 1 - X = rng.randn(n_samples, n_dim) - g_full = mixture.GMM(n_components=n_components, covariance_type='full', - random_state=rng, min_covar=1e-7, n_iter=1) - with ignore_warnings(category=DeprecationWarning): - g_full.fit(X) - g_full_bic = g_full.bic(X) - for cv_type in ['tied', 'diag', 'spherical']: - g = mixture.GMM(n_components=n_components, covariance_type=cv_type, - random_state=rng, min_covar=1e-7, n_iter=1) - g.fit(X) - assert_array_almost_equal(g.bic(X), g_full_bic) - - -def assert_fit_predict_correct(model, X): - model2 = copy.deepcopy(model) - - predictions_1 = model.fit(X).predict(X) - predictions_2 = model2.fit_predict(X) - - assert adjusted_rand_score(predictions_1, predictions_2) == 1.0 - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_fit_predict(): - """ - test that gmm.fit_predict is equivalent to gmm.fit + gmm.predict - """ - lrng = np.random.RandomState(101) - - n_samples, n_dim, n_comps = 100, 2, 2 - mu = np.array([[8, 8]]) - component_0 = lrng.randn(n_samples, n_dim) - component_1 = lrng.randn(n_samples, n_dim) + mu - X = np.vstack((component_0, component_1)) - - for m_constructor in (mixture.GMM, mixture.VBGMM, mixture.DPGMM): - model = m_constructor(n_components=n_comps, covariance_type='full', - min_covar=1e-7, n_iter=5, - random_state=np.random.RandomState(0)) - assert_fit_predict_correct(model, X) - - model = mixture.GMM(n_components=n_comps, n_iter=0) - z = model.fit_predict(X) - assert np.all(z == 0), "Quick Initialization Failed!" - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_aic(): - # Test the aic and bic criteria - n_samples, n_dim, n_components = 50, 3, 2 - X = rng.randn(n_samples, n_dim) - SGH = 0.5 * (X.var() + np.log(2 * np.pi)) # standard gaussian entropy - - for cv_type in ['full', 'tied', 'diag', 'spherical']: - g = mixture.GMM(n_components=n_components, covariance_type=cv_type, - random_state=rng, min_covar=1e-7) - g.fit(X) - aic = 2 * n_samples * SGH * n_dim + 2 * g._n_parameters() - bic = (2 * n_samples * SGH * n_dim + - np.log(n_samples) * g._n_parameters()) - bound = n_dim * 3. / np.sqrt(n_samples) - assert_true(np.abs(g.aic(X) - aic) / n_samples < bound) - assert_true(np.abs(g.bic(X) - bic) / n_samples < bound) - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def check_positive_definite_covars(covariance_type): - r"""Test that covariance matrices do not become non positive definite - - Due to the accumulation of round-off errors, the computation of the - covariance matrices during the learning phase could lead to non-positive - definite covariance matrices. Namely the use of the formula: - - .. math:: C = (\sum_i w_i x_i x_i^T) - \mu \mu^T - - instead of: - - .. math:: C = \sum_i w_i (x_i - \mu)(x_i - \mu)^T - - while mathematically equivalent, was observed a ``LinAlgError`` exception, - when computing a ``GMM`` with full covariance matrices and fixed mean. - - This function ensures that some later optimization will not introduce the - problem again. - """ - rng = np.random.RandomState(1) - # we build a dataset with 2 2d component. The components are unbalanced - # (respective weights 0.9 and 0.1) - X = rng.randn(100, 2) - X[-10:] += (3, 3) # Shift the 10 last points - - gmm = mixture.GMM(2, params="wc", covariance_type=covariance_type, - min_covar=1e-3) - - # This is a non-regression test for issue #2640. 
The following call used - # to trigger: - # numpy.linalg.linalg.LinAlgError: 2-th leading minor not positive definite - gmm.fit(X) - - if covariance_type == "diag" or covariance_type == "spherical": - assert_greater(gmm.covars_.min(), 0) - else: - if covariance_type == "tied": - covs = [gmm.covars_] - else: - covs = gmm.covars_ - - for c in covs: - assert_greater(np.linalg.det(c), 0) - - -def test_positive_definite_covars(): - # Check positive definiteness for all covariance types - for covariance_type in ["full", "tied", "diag", "spherical"]: - yield check_positive_definite_covars, covariance_type - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_verbose_first_level(): - # Create sample data - X = rng.randn(30, 5) - X[:10] += 2 - g = mixture.GMM(n_components=2, n_init=2, verbose=1) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - g.fit(X) - finally: - sys.stdout = old_stdout - - -# This function tests the deprecated old GMM class -@ignore_warnings(category=DeprecationWarning) -def test_verbose_second_level(): - # Create sample data - X = rng.randn(30, 5) - X[:10] += 2 - g = mixture.GMM(n_components=2, n_init=2, verbose=2) - - old_stdout = sys.stdout - sys.stdout = StringIO() - try: - g.fit(X) - finally: - sys.stdout = old_stdout From 7d4b2c11583e6bf2594d1cc1c445d66da7335d6f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 8 Sep 2017 12:30:36 -0400 Subject: [PATCH 10/14] more cleanup of deprecated scorers --- sklearn/metrics/scorer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 05231826a8998..ebb6c7ca25ffe 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -225,18 +225,13 @@ def get_scorer(scoring): scorer : callable The scorer. """ - valid = True if isinstance(scoring, six.string_types): try: scorer = SCORERS[scoring] except KeyError: - scorers = [scorer for scorer in SCORERS - if SCORERS[scorer]._deprecation_msg is None] - valid = False # Don't raise here to make the error message elegant - if not valid: raise ValueError('%r is not a valid scoring value. 
' 'Valid options are %s' - % (scoring, sorted(scorers))) + % (scoring, sorted(SCORERS.keys()))) else: scorer = scoring return scorer @@ -513,11 +508,6 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, # Score function for probabilistic classification neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True) -deprecation_msg = ('Scoring method log_loss was renamed to ' - 'neg_log_loss in version 0.18 and will be removed in 0.20.') -log_loss_scorer = make_scorer(log_loss, greater_is_better=False, - needs_proba=True) -log_loss_scorer._deprecation_msg = deprecation_msg brier_score_loss_scorer = make_scorer(brier_score_loss, greater_is_better=False, needs_proba=True) @@ -546,7 +536,6 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, balanced_accuracy=balanced_accuracy_scorer, average_precision=average_precision_scorer, - log_loss=log_loss_scorer, neg_log_loss=neg_log_loss_scorer, brier_score_loss=brier_score_loss_scorer, # Cluster metrics that use supervised evaluation From 2ffa7bdad5b18ea5d516e305fcef57738c215e4a Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Fri, 10 Nov 2017 16:09:28 +0100 Subject: [PATCH 11/14] More in scoring --- sklearn/metrics/tests/test_score_objects.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 6af6418635d59..836cdc0f934f8 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -499,25 +499,6 @@ def test_scorer_memmap_input(): yield check_scorer_memmap, name -def test_deprecated_names(): - X, y = make_blobs(random_state=0, centers=2) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - clf = LogisticRegression(random_state=0) - clf.fit(X_train, y_train) - - for name in ('mean_absolute_error', 'mean_squared_error', - 'median_absolute_error', 'log_loss'): - warning_msg = "Scoring method %s was renamed to" % name - for scorer in (get_scorer(name), SCORERS[name]): - assert_warns_message(DeprecationWarning, - warning_msg, - scorer, clf, X, y) - - assert_warns_message(DeprecationWarning, - warning_msg, - cross_val_score, clf, X, y, scoring=name) - - def test_scoring_is_not_metric(): assert_raises_regexp(ValueError, 'make_scorer', check_scoring, LogisticRegression(), f1_score) From 0bf414616b133155f2cad12c063761eec75e0c5c Mon Sep 17 00:00:00 2001 From: Joan Massich Date: Fri, 10 Nov 2017 16:22:53 +0100 Subject: [PATCH 12/14] Remove `hamming_loss` deprecated parameter `classes` --- sklearn/metrics/classification.py | 14 +------------- sklearn/metrics/tests/test_classification.py | 1 - 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 7d8b887c66624..c14c8ffe855af 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -1528,8 +1528,7 @@ class 2 1.00 0.67 0.80 3 return report -def hamming_loss(y_true, y_pred, labels=None, sample_weight=None, - classes=None): +def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): """Compute the average Hamming loss. The Hamming loss is the fraction of labels that are incorrectly predicted. @@ -1555,13 +1554,6 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None, .. versionadded:: 0.18 - classes : array, shape = [n_labels], optional - Integer array of labels. - - .. 
From b36341e23c0b515773f5027edef6f18ac80c61d9 Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 16:41:01 +0100
Subject: [PATCH 13/14] splitter classes (issue:6660)

Fix minor stuff
---
 examples/model_selection/plot_nested_cross_validation_iris.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py
index 917746c359d4b..b40dc91fc4d8f 100644
--- a/examples/model_selection/plot_nested_cross_validation_iris.py
+++ b/examples/model_selection/plot_nested_cross_validation_iris.py
@@ -75,7 +75,7 @@

     # Choose cross-validation techniques for the inner and outer loops,
     # independently of the dataset.
-    # E.g "LabelKFold", "LeaveOneOut", "LeaveOneLabelOut", etc.
+    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
     inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
     outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
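
The renamed splitters behave like their pre-0.18 ``Label*`` counterparts,
with the grouping array passed as ``groups``. A hedged sketch of the
group-aware variants mentioned in the updated comment (the data and group ids
below are made up for illustration)::

    import numpy as np
    from sklearn.model_selection import GroupKFold, LeaveOneGroupOut

    rng = np.random.RandomState(0)
    X = rng.randn(12, 3)
    y = np.array([0, 1] * 6)
    groups = np.repeat([0, 1, 2, 3], 3)  # hypothetical ids, e.g. one per subject

    # GroupKFold (formerly LabelKFold) keeps each group within a single fold.
    for train_idx, test_idx in GroupKFold(n_splits=3).split(X, y, groups=groups):
        assert not set(groups[train_idx]) & set(groups[test_idx])

    # LeaveOneGroupOut (formerly LeaveOneLabelOut) holds out one group per split.
    for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups=groups):
        assert len(set(groups[test_idx])) == 1
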
From 4b7aa69655a3ab24398e9ded0c2daf880b49215f Mon Sep 17 00:00:00 2001
From: Joan Massich
Date: Fri, 10 Nov 2017 17:45:42 +0100
Subject: [PATCH 14/14] Fix doctest expected output

---
 doc/modules/model_evaluation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 4a19e27e9c11c..a122728e825a6 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -104,7 +104,7 @@ Usage examples:
     >>> model = svm.SVC()
     >>> cross_val_score(model, X, y, scoring='wrong_choice')
     Traceback (most recent call last):
-    ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
+    ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']

 .. note::