[MRG + 1] ENH Ensure consistency in splits and in parameters (without causing memory blowup because of materializing the iterator) · Pull Request #7941 · scikit-learn/scikit-learn

Merged: 6 commits, Dec 7, 2016
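The gist of the change: `BaseSearchCV` used to materialize every CV split up front with `list(cv.split(X, y, groups))`, keeping all train/test index arrays alive for the whole search, while this PR streams the splits and regenerates the (small) parameter list per fit. A minimal sketch of the memory trade-off, assuming a plain `KFold` splitter (sizes are illustrative, not from the PR):

```python
import numpy as np
from sklearn.model_selection import KFold

X = np.zeros((100000, 5))
cv = KFold(n_splits=10)

# Materializing keeps all 10 pairs of train/test index arrays alive at
# once (roughly n_splits * n_samples indices) for the whole search:
cv_iter = list(cv.split(X))

# Streaming yields one (train, test) pair at a time, so each pair can be
# released as soon as its fit/score task has been dispatched:
for train, test in cv.split(X):
    pass  # dispatch one task per (split, parameter setting) here
```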
99 changes: 40 additions & 59 deletions sklearn/model_selection/_search.py
@@ -12,7 +12,7 @@
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
-from collections import Mapping, namedtuple, Sized, defaultdict, Sequence
+from collections import Mapping, namedtuple, defaultdict, Sequence
from functools import partial, reduce
from itertools import product
import operator
@@ -532,25 +532,41 @@ def inverse_transform(self, Xt):
self._check_is_fitted('inverse_transform')
return self.best_estimator_.transform(Xt)

-def _fit(self, X, y, groups, parameter_iterable):
-    """Actual fitting, performing the search over parameters."""
+def fit(self, X, y=None, groups=None):
+    """Run fit with all sets of parameters.
+
+    Parameters
+    ----------
+
+    X : array-like, shape = [n_samples, n_features]
+        Training vector, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+        Target relative to X for classification or regression;
+        None for unsupervised learning.
+
+    groups : array-like, with shape (n_samples,), optional
+        Group labels for the samples used while splitting the dataset into
+        train/test set.
+    """
estimator = self.estimator
cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

X, y, groups = indexable(X, y, groups)
n_splits = cv.get_n_splits(X, y, groups)
-if self.verbose > 0 and isinstance(parameter_iterable, Sized):
-    n_candidates = len(parameter_iterable)
+# Regenerate parameter iterable for each fit
+candidate_params = list(self._get_param_iterator())
+n_candidates = len(candidate_params)
+if self.verbose > 0:
print("Fitting {0} folds for each of {1} candidates, totalling"
" {2} fits".format(n_splits, n_candidates,
n_candidates * n_splits))

base_estimator = clone(self.estimator)
pre_dispatch = self.pre_dispatch

-cv_iter = list(cv.split(X, y, groups))
out = Parallel(
n_jobs=self.n_jobs, verbose=self.verbose,
pre_dispatch=pre_dispatch
@@ -559,28 +575,25 @@ def _fit(self, X, y, groups, parameter_iterable):
fit_params=self.fit_params,
return_train_score=self.return_train_score,
return_n_test_samples=True,
-return_times=True, return_parameters=True,
+return_times=True, return_parameters=False,
error_score=self.error_score)
-for parameters in parameter_iterable
-for train, test in cv_iter)
+for train, test in cv.split(X, y, groups)
+for parameters in candidate_params)
Member:
return_parameters=False is unrelated, right? but it seems fine.
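Beyond `return_parameters`, the substantive change in this hunk is the loop order: splits are now the outer loop, so `cv.split(...)` can stay a one-pass generator while the much smaller candidate list is replayed per split. A rough sketch of the two orderings (function and variable names are illustrative):

```python
def tasks_old(cv_iter, parameter_iterable):
    # Old order: parameters outer, splits inner. Every candidate re-iterates
    # the splits, which is why they had to be materialized into cv_iter.
    return [(p, s) for p in parameter_iterable for s in cv_iter]

def tasks_new(split_generator, candidate_params):
    # New order: splits outer, parameters inner. The split generator is
    # consumed exactly once; only the cheap parameter list is replayed.
    return [(p, s) for s in split_generator for p in candidate_params]
```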


# if one choose to see train score, "out" will contain train score info
if self.return_train_score:
-    (train_scores, test_scores, test_sample_counts,
-     fit_time, score_time, parameters) = zip(*out)
+    (train_scores, test_scores, test_sample_counts, fit_time,
+     score_time) = zip(*out)
else:
-    (test_scores, test_sample_counts,
-     fit_time, score_time, parameters) = zip(*out)
-
-candidate_params = parameters[::n_splits]
-n_candidates = len(candidate_params)
+    (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)

results = dict()

def _store(key_name, array, weights=None, splits=False, rank=False):
"""A small helper to store the scores/times to the cv_results_"""
-array = np.array(array, dtype=np.float64).reshape(n_candidates,
-                                                  n_splits)
+# When iterated first by splits, then by parameters
+array = np.array(array, dtype=np.float64).reshape(n_splits,
+                                                  n_candidates).T
if splits:
for split_i in range(n_splits):
results["split%d_%s"
@@ -600,7 +613,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False):

# Computed the (weighted) mean and std for test scores alone
# NOTE test_sample counts (weights) remain the same for all candidates
-test_sample_counts = np.array(test_sample_counts[:n_splits],
+test_sample_counts = np.array(test_sample_counts[::n_candidates],
dtype=np.int)

_store('test_score', test_scores, splits=True, rank=True,
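Both the `reshape(n_splits, n_candidates).T` above and the `[::n_candidates]` stride follow from `out` now arriving split-major. A small numpy check of the reordering (values are made up):

```python
import numpy as np

n_splits, n_candidates = 2, 3
# Scores arrive split-major: all candidates of split 0, then split 1.
scores = np.array([10., 11., 12.,   # split 0, candidates 0..2
                   20., 21., 22.])  # split 1, candidates 0..2
per_candidate = scores.reshape(n_splits, n_candidates).T
# Row i now holds candidate i's score on every split:
assert per_candidate.tolist() == [[10., 20.], [11., 21.], [12., 22.]]

# Test-set sizes repeat for each candidate within a split, so taking
# every n_candidates-th entry leaves exactly one count per split:
counts = [50, 50, 50, 49, 49, 49]
assert counts[::n_candidates] == [50, 49]
```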
@@ -924,25 +937,9 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
self.param_grid = param_grid
_check_param_grid(param_grid)

-def fit(self, X, y=None, groups=None):
-    """Run fit with all sets of parameters.
-
-    Parameters
-    ----------
-
-    X : array-like, shape = [n_samples, n_features]
-        Training vector, where n_samples is the number of samples and
-        n_features is the number of features.
-
-    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
-        Target relative to X for classification or regression;
-        None for unsupervised learning.
-
-    groups : array-like, with shape (n_samples,), optional
-        Group labels for the samples used while splitting the dataset into
-        train/test set.
-    """
-    return self._fit(X, y, groups, ParameterGrid(self.param_grid))
+def _get_param_iterator(self):
+    """Return ParameterGrid instance for the given param_grid"""
+    return ParameterGrid(self.param_grid)
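With the search logic hoisted into a shared `fit`, each subclass only has to supply its parameter iterator, and `ParameterGrid` is cheap to re-list. For example:

```python
from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({'C': [1, 10], 'kernel': ['linear', 'rbf']})
# Listing the grid again regenerates the exact same candidates, so
# nothing needs to be kept around between uses:
assert list(grid) == list(grid)
assert len(grid) == 4  # the Cartesian product of both value lists
```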


class RandomizedSearchCV(BaseSearchCV):
@@ -1167,24 +1164,8 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
pre_dispatch=pre_dispatch, error_score=error_score,
return_train_score=return_train_score)

-def fit(self, X, y=None, groups=None):
-    """Run fit on the estimator with randomly drawn parameters.
-
-    Parameters
-    ----------
-    X : array-like, shape = [n_samples, n_features]
-        Training vector, where n_samples in the number of samples and
-        n_features is the number of features.
-
-    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
-        Target relative to X for classification or regression;
-        None for unsupervised learning.
-
-    groups : array-like, with shape (n_samples,), optional
-        Group labels for the samples used while splitting the dataset into
-        train/test set.
-    """
-    sampled_params = ParameterSampler(self.param_distributions,
-                                      self.n_iter,
-                                      random_state=self.random_state)
-    return self._fit(X, y, groups, sampled_params)
+def _get_param_iterator(self):
+    """Return ParameterSampler instance for the given distributions"""
+    return ParameterSampler(
+        self.param_distributions, self.n_iter,
+        random_state=self.random_state)
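For randomized search the same holds as long as `random_state` pins the draws: `_get_param_iterator` rebuilds the sampler with the instance's seed, so listing it again reproduces the same candidates. A quick check (distribution chosen for illustration; assumes an integer seed rather than a `RandomState` instance):

```python
from scipy.stats import expon
from sklearn.model_selection import ParameterSampler

sampler = ParameterSampler({'C': expon(scale=10)}, n_iter=3, random_state=0)
# With an integer seed each iteration re-seeds identically, so the
# sampled candidates are reproducible across listings:
assert list(sampler) == list(sampler)
```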
14 changes: 6 additions & 8 deletions sklearn/model_selection/_validation.py
@@ -128,7 +128,6 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
X, y, groups = indexable(X, y, groups)

cv = check_cv(cv, y, classifier=is_classifier(estimator))
-cv_iter = list(cv.split(X, y, groups))
scorer = check_scoring(estimator, scoring=scoring)
Member:
why did we put this here at all?

Member Author (@raghavrv, Dec 5, 2016):
Paranoia on my part I guess when I learnt model_selection doesn't handle one time cv-splitters and rushed to push in a fix... :/
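The "one time cv-splitters" mentioned here are splitters whose `split` hands back a generator that is exhausted after a single pass, which is exactly what the streaming loops must tolerate. A minimal sketch of such a splitter (hypothetical class, not part of scikit-learn):

```python
import numpy as np

class OneShotSplitter(object):
    """Hypothetical splitter whose split() returns a one-pass iterator."""

    def __init__(self, splits):
        self._iter = iter(splits)

    def split(self, X, y=None, groups=None):
        return self._iter  # same iterator every call: consumable once

splitter = OneShotSplitter([(np.array([0, 1]), np.array([2])),
                            (np.array([0, 2]), np.array([1]))])
first_pass = list(splitter.split(None))
second_pass = list(splitter.split(None))
assert len(first_pass) == 2
assert second_pass == []  # a second pass yields nothing
```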

# We clone the estimator to make sure that all the folds are
# independent, and that it is pickle-able.
@@ -137,7 +136,7 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
train, test, verbose, None,
fit_params)
-for train, test in cv_iter)
+for train, test in cv.split(X, y, groups))
return np.array(scores)[:, 0]


@@ -385,7 +384,6 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
X, y, groups = indexable(X, y, groups)

cv = check_cv(cv, y, classifier=is_classifier(estimator))
-cv_iter = list(cv.split(X, y, groups))

# Ensure the estimator has implemented the passed decision function
if not callable(getattr(estimator, method)):
@@ -398,7 +396,7 @@
pre_dispatch=pre_dispatch)
prediction_blocks = parallel(delayed(_fit_and_predict)(
clone(estimator), X, y, train, test, verbose, fit_params, method)
-for train, test in cv_iter)
+for train, test in cv.split(X, y, groups))

# Concatenate the predictions
predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]
@@ -752,8 +750,9 @@ def learning_curve(estimator, X, y, groups=None,
X, y, groups = indexable(X, y, groups)

cv = check_cv(cv, y, classifier=is_classifier(estimator))
-# Make a list since we will be iterating multiple times over the folds
+# Store it as list as we will be iterating over the list multiple times
cv_iter = list(cv.split(X, y, groups))

Member:
why are we still materializing here?

Member Author:
I don't think we can do away with materializing for learning_curve. (Also it was like that before pre-0.18)

scorer = check_scoring(estimator, scoring=scoring)

n_max_training_samples = len(cv_iter[0][0])
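`learning_curve` is the exception: it indexes into the first split to size the training subsets and then replays every fold once per training-set size, so a one-pass generator would not do. A simplified sketch of that access pattern (not the actual implementation):

```python
import numpy as np
from sklearn.model_selection import KFold

X, y = np.arange(20).reshape(10, 2), np.arange(10)
cv_iter = list(KFold(n_splits=5).split(X, y))

# Random access into the first split:
n_max_training_samples = len(cv_iter[0][0])

# Each fold is revisited once per training-set size, so the list is
# iterated as many times as there are sizes:
for n_train in [2, 4, n_max_training_samples]:
    for train, test in cv_iter:
        subset = train[:n_train]  # train the estimator on a prefix
```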
@@ -961,16 +960,15 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None,
X, y, groups = indexable(X, y, groups)

cv = check_cv(cv, y, classifier=is_classifier(estimator))
-cv_iter = list(cv.split(X, y, groups))

scorer = check_scoring(estimator, scoring=scoring)

parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
verbose=verbose)
out = parallel(delayed(_fit_and_score)(
estimator, X, y, scorer, train, test, verbose,
parameters={param_name: v}, fit_params=None, return_train_score=True)
-for train, test in cv_iter for v in param_range)
+# NOTE do not change order of iteration to allow one time cv splitters
+for train, test in cv.split(X, y, groups) for v in param_range)

out = np.asarray(out)
n_params = len(param_range)
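The NOTE above is load-bearing: only with splits in the outer position is the split generator traversed exactly once. A minimal demonstration with a toy generator (illustrative only):

```python
def toy_splits():
    yield ([0, 1], [2])
    yield ([0, 2], [1])

param_range = [0.1, 1.0, 10.0]

# Splits outer: the generator is consumed once and every pair is seen.
tasks = [(tr, te, v) for tr, te in toy_splits() for v in param_range]
assert len(tasks) == 6

# Params outer over a single generator: exhausted after the first value.
gen = toy_splits()
tasks = [(tr, te, v) for v in param_range for tr, te in gen]
assert len(tasks) == 2  # only v=0.1 ever saw any splits
```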