[MRG] Refactor CV and grid search by AlexanderFabisch · Pull Request #2736 · scikit-learn/scikit-learn · GitHub

[MRG] Refactor CV and grid search #2736


Closed

Changes from all commits
226 changes: 160 additions & 66 deletions sklearn/cross_validation.py
@@ -15,17 +15,19 @@
from itertools import chain, combinations
from math import ceil, floor, factorial
import numbers
import time
from abc import ABCMeta, abstractmethod

import numpy as np
import scipy.sparse as sp

from .base import is_classifier, clone
from .utils import check_arrays, check_random_state, safe_mask
from .utils.validation import _num_samples
from .utils.fixes import unique
from .externals.joblib import Parallel, delayed
from .externals.six import string_types, with_metaclass
from .metrics.scorer import _deprecate_loss_and_score_funcs
from .externals.joblib import Parallel, delayed, logger
from .externals.six import with_metaclass
from .metrics.scorer import check_scoring

__all__ = ['Bootstrap',
'KFold',
@@ -1023,48 +1025,6 @@ def __len__(self):

##############################################################################

def _cross_val_score(estimator, X, y, scorer, train, test, verbose,
fit_params):
"""Inner loop for cross validation"""
n_samples = X.shape[0] if sp.issparse(X) else len(X)
fit_params = dict([(k, np.asarray(v)[train]
if hasattr(v, '__len__') and len(v) == n_samples else v)
for k, v in fit_params.items()])
if not hasattr(X, "shape"):
if getattr(estimator, "_pairwise", False):
raise ValueError("Precomputed kernels or affinity matrices have "
"to be passed as arrays or sparse matrices.")
X_train = [X[idx] for idx in train]
X_test = [X[idx] for idx in test]
else:
if getattr(estimator, "_pairwise", False):
# X is a precomputed square kernel matrix
if X.shape[0] != X.shape[1]:
raise ValueError("X should be a square kernel matrix")
X_train = X[np.ix_(train, train)]
X_test = X[np.ix_(test, train)]
else:
X_train = X[safe_mask(X, train)]
X_test = X[safe_mask(X, test)]

if y is None:
y_train = None
y_test = None
else:
y_train = y[train]
y_test = y[test]
estimator.fit(X_train, y_train, **fit_params)
if scorer is None:
score = estimator.score(X_test, y_test)
else:
score = scorer(estimator, X_test, y_test)
if not isinstance(score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s)"
" instead." % (str(score), type(score)))
if verbose > 1:
print("score: %f" % score)
return score


def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
verbose=0, fit_params=None, score_func=None,
@@ -1127,26 +1087,164 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
"""
X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
scorer = _deprecate_loss_and_score_funcs(
loss_func=None,
score_func=score_func,
scoring=scoring
)
if scorer is None and not hasattr(estimator, 'score'):
raise TypeError(
"If no scoring is specified, the estimator passed "
"should have a 'score' method. The estimator %s "
"does not." % estimator)
scorer = check_scoring(estimator, score_func=score_func, scoring=scoring)
# We clone the estimator to make sure that all the folds are
# independent, and that it is pickle-able.
fit_params = fit_params if fit_params is not None else {}
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)
scores = parallel(
delayed(_cross_val_score)(clone(estimator), X, y, scorer, train, test,
verbose, fit_params)
for train, test in cv)
return np.array(scores)
scores = parallel(delayed(fit_and_score)(clone(estimator), X, y, scorer,
train, test, verbose, None,
fit_params)
for train, test in cv)
return np.array(scores)[:, 0]
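
For context, a minimal usage sketch of the refactored cross_val_score (not part of the diff; the dataset and estimator are illustrative, and the 0.14-era sklearn.cross_validation module is assumed). Each fold is now delegated to fit_and_score and only the first entry of its return value, the test score, is kept:

import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
# One test score per CV fold, exactly as before the refactoring.
scores = cross_val_score(SVC(kernel='linear'), iris.data, iris.target,
                         scoring='accuracy', cv=5)
print(scores.shape)      # (5,)
print(np.mean(scores))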


def fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
Member

@GaelVaroquaux, do you think fit_and_score should be public? Personally, I don't think either it or fit_grid_point should be public (and while not _-prefixed, fit_grid_point isn't in classes.rst, nor is it any longer the right name for that function). It makes it much harder to add functionality to grid search etc., since every modification to its output is an API change.

Member

Also, I don't understand what the objection is to calling this cross-validation... I think that's what fitting on one chunk of data and scoring on another is.

Member

+1 on using private functions

fit_params, return_train_score=False,
return_parameters=False):
"""Fit estimator and compute scores for a given dataset split.

Parameters
----------
estimator : estimator object implementing 'fit'
The object to use to fit the data.

X : array-like of shape at least 2D
The data to fit.

y : array-like, optional, default: None
The target variable to try to predict in the case of
supervised learning.

scorer : callable
A scorer callable object / function with signature
``scorer(estimator, X, y)``.

train : array-like, shape = (n_train_samples,)
Indices of training samples.

test : array-like, shape = (n_test_samples,)
Indices of test samples.

verbose : integer
The verbosity level.

parameters : dict or None
Parameters to be set on the estimator.

fit_params : dict or None
Parameters that will be passed to ``estimator.fit``.

return_train_score : boolean, optional, default: False
Compute and return score on training set.

return_parameters : boolean, optional, default: False
Return the parameters that have been used for the estimator.

Returns
-------
test_score : float
Score on test set.

Member Author

I messed this up. The order of test_score and train_score is wrong in the docstring.

train_score : float, optional
Score on training set.

n_test_samples : int
Number of test samples.

scoring_time : float
Time spent for fitting and scoring in seconds.

parameters : dict or None, optional
The parameters that have been evaluated.
"""
if verbose > 1:
if parameters is None:
msg = "no parameters to be set"
else:
msg = '%s' % (', '.join('%s=%s' % (k, v)
for k, v in parameters.items()))
print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

# Adjust length of sample weights
n_samples = _num_samples(X)
fit_params = fit_params if fit_params is not None else {}
fit_params = dict([(k, np.asarray(v)[train]
if hasattr(v, '__len__') and len(v) == n_samples else v)
for k, v in fit_params.items()])

if parameters is not None:
estimator.set_params(**parameters)

start_time = time.time()

X_train, y_train = _split_with_kernel(estimator, X, y, train)
X_test, y_test = _split_with_kernel(estimator, X, y, test, train)
if y_train is None:
Member

@AlexanderFabisch, it seems that you are not fully consistent :). In your PR on learning curves you defined a '_fit' function that avoids the if/else clause that you have here. Here you are not using it.

I must admit that I don't like such a little 4-liner function, as I find it obfuscates the code. In addition, other contributors tend to not see it, and reinvent it.

Anyhow, in my opinion, I think that you should choose either to remove it from the grid_search.py file, or to use it here.

Member

For reference, this was about this comment: https://github.com/scikit-learn/scikit-learn/pull/2701/files#r8853195

I would also be in favor of either removing the _fit mini-helper in the grid search module, or finding a more explicit name to reuse it here. I think I still prefer to keep those 4 lines expanded everywhere to help readability by avoiding an indirection.

Member Author

It has been removed already because of @GaelVaroquaux's comment. That is why I don't use it here. :)

estimator.fit(X_train, **fit_params)
else:
estimator.fit(X_train, y_train, **fit_params)
test_score = _score(estimator, X_test, y_test, scorer)
if return_train_score:
train_score = _score(estimator, X_train, y_train, scorer)

scoring_time = time.time() - start_time

Member

You appear to have dropped the verbose output here. Why not report time and score in _cross_val_score?

Member Author

fixed

if verbose > 2:
msg += ", score=%f" % test_score
if verbose > 1:
end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

ret = [train_score] if return_train_score else []
ret.extend([test_score, _num_samples(X_test), scoring_time])
if return_parameters:
ret.append(parameters)
return ret
Member Author

I'm not completely happy with such a complex return value, but it is required to satisfy all requirements of grid_search, learning_curve and cross_val_score.


def _split_with_kernel(estimator, X, y, indices, train_indices=None):
"""Create subset of dataset."""
if hasattr(estimator, 'kernel') and callable(estimator.kernel):
# cannot compute the kernel values with custom function
raise ValueError("Cannot use a custom kernel function. "
"Precompute the kernel matrix instead.")

if not hasattr(X, "shape"):
if getattr(estimator, "_pairwise", False):
raise ValueError("Precomputed kernels or affinity matrices have "
"to be passed as arrays or sparse matrices.")
X_subset = [X[idx] for idx in indices]
else:
if getattr(estimator, "_pairwise", False):
# X is a precomputed square kernel matrix
if X.shape[0] != X.shape[1]:
raise ValueError("X should be a square kernel matrix")
if train_indices is None:
X_subset = X[np.ix_(indices, indices)]
else:
X_subset = X[np.ix_(indices, train_indices)]
else:
X_subset = X[safe_mask(X, indices)]

if y is not None:
y_subset = y[safe_mask(y, indices)]
else:
y_subset = None

return X_subset, y_subset
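
The pairwise branch above slices a precomputed kernel matrix instead of the raw samples; a small standalone NumPy illustration of the np.ix_ indexing it relies on (toy numbers, no estimator involved):

import numpy as np

K = np.arange(25).reshape(5, 5)    # toy "kernel" matrix, entry (i, j) = 5*i + j
train = np.array([0, 2, 4])
test = np.array([1, 3])

K_train = K[np.ix_(train, train)]  # (3, 3): train rows vs. train columns
K_test = K[np.ix_(test, train)]    # (2, 3): test rows vs. *train* columns

# The test block keeps only the columns of the training samples, because the
# fitted estimator expects similarities to the data it was trained on.
print(K_train.shape, K_test.shape)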


def _score(estimator, X_test, y_test, scorer):
"""Compute the score of an estimator on a given test set."""
if y_test is None:
score = scorer(estimator, X_test)
else:
score = scorer(estimator, X_test, y_test)
if not isinstance(score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s) instead."
% (str(score), type(score)))
return score


def _permutation_test_score(estimator, X, y, cv, scorer):
@@ -1297,11 +1395,7 @@ def permutation_test_score(estimator, X, y, score_func=None, cv=None,
"""
X, y = check_arrays(X, y, sparse_format='csr')
cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
scorer = _deprecate_loss_and_score_funcs(
loss_func=None,
score_func=score_func,
scoring=scoring
)
scorer = check_scoring(estimator, scoring=scoring, score_func=score_func)
random_state = check_random_state(random_state)

# We clone the estimator to make sure that all the folds are
20 changes: 8 additions & 12 deletions sklearn/feature_selection/rfe.py
@@ -13,8 +13,9 @@
from ..base import clone
from ..base import is_classifier
from ..cross_validation import _check_cv as check_cv
from ..cross_validation import _split_with_kernel, _score
from .base import SelectorMixin
from ..metrics.scorer import _deprecate_loss_and_score_funcs
from ..metrics.scorer import check_scoring


class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin):
@@ -325,12 +326,15 @@ def fit(self, X, y):
verbose=self.verbose - 1)

cv = check_cv(self.cv, X, y, is_classifier(self.estimator))
scorer = check_scoring(self.estimator, scoring=self.scoring,
loss_func=self.loss_func)
scores = np.zeros(X.shape[1])

# Cross-validation
for n, (train, test) in enumerate(cv):
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
X_train, y_train = _split_with_kernel(self.estimator, X, y, train)
X_test, y_test = _split_with_kernel(self.estimator, X, y, test,
train)

# Compute a full ranking of the features
ranking_ = rfe.fit(X_train, y_train).ranking_
@@ -339,15 +343,7 @@
mask = np.where(ranking_ <= k + 1)[0]
estimator = clone(self.estimator)
estimator.fit(X_train[:, mask], y_train)

if self.loss_func is None and self.scoring is None:
score = estimator.score(X_test[:, mask], y_test)
else:
scorer = _deprecate_loss_and_score_funcs(
loss_func=self.loss_func,
scoring=self.scoring
)
score = scorer(estimator, X_test[:, mask], y_test)
score = _score(estimator, X_test[:, mask], y_test, scorer)

if self.verbose > 0:
print("Finished fold with %d / %d feature ranks, score=%f"
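
For the rfe.py side of the change, an illustrative RFECV call (dataset and estimator are placeholders, 0.14-era API assumed): the scoring argument is now resolved once via check_scoring before the CV loop, and each fold is scored through the shared _score helper.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, n_informative=5,
                           random_state=0)
# A linear SVC exposes coef_, which RFE/RFECV needs to rank features.
rfecv = RFECV(estimator=SVC(kernel='linear'), step=1, cv=3, scoring='accuracy')
rfecv.fit(X, y)
print(rfecv.n_features_)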