-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[MRG + 1] ENH Ensure consistency in splits and in parameters (without causing memory blowup because of materializing the iterator) #7941
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b70ffb6
740d3d2
de43c92
932891c
2bd348f
1efe648
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -128,7 +128,6 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, | |
X, y, groups = indexable(X, y, groups) | ||
|
||
cv = check_cv(cv, y, classifier=is_classifier(estimator)) | ||
cv_iter = list(cv.split(X, y, groups)) | ||
scorer = check_scoring(estimator, scoring=scoring) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did we put this here at all? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Paranoia on my part I guess when I learnt |
||
# We clone the estimator to make sure that all the folds are | ||
# independent, and that it is pickle-able. | ||
|
@@ -137,7 +136,7 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, | |
scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, | ||
train, test, verbose, None, | ||
fit_params) | ||
for train, test in cv_iter) | ||
for train, test in cv.split(X, y, groups)) | ||
return np.array(scores)[:, 0] | ||
|
||
|
||
|
@@ -385,7 +384,6 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, | |
X, y, groups = indexable(X, y, groups) | ||
|
||
cv = check_cv(cv, y, classifier=is_classifier(estimator)) | ||
cv_iter = list(cv.split(X, y, groups)) | ||
|
||
# Ensure the estimator has implemented the passed decision function | ||
if not callable(getattr(estimator, method)): | ||
|
@@ -398,7 +396,7 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, | |
pre_dispatch=pre_dispatch) | ||
prediction_blocks = parallel(delayed(_fit_and_predict)( | ||
clone(estimator), X, y, train, test, verbose, fit_params, method) | ||
for train, test in cv_iter) | ||
for train, test in cv.split(X, y, groups)) | ||
|
||
# Concatenate the predictions | ||
predictions = [pred_block_i for pred_block_i, _ in prediction_blocks] | ||
|
@@ -752,8 +750,9 @@ def learning_curve(estimator, X, y, groups=None, | |
X, y, groups = indexable(X, y, groups) | ||
|
||
cv = check_cv(cv, y, classifier=is_classifier(estimator)) | ||
# Make a list since we will be iterating multiple times over the folds | ||
# Store it as list as we will be iterating over the list multiple times | ||
cv_iter = list(cv.split(X, y, groups)) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are we still materializing here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we can do away with materializing for |
||
scorer = check_scoring(estimator, scoring=scoring) | ||
|
||
n_max_training_samples = len(cv_iter[0][0]) | ||
|
@@ -961,16 +960,15 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, | |
X, y, groups = indexable(X, y, groups) | ||
|
||
cv = check_cv(cv, y, classifier=is_classifier(estimator)) | ||
cv_iter = list(cv.split(X, y, groups)) | ||
|
||
scorer = check_scoring(estimator, scoring=scoring) | ||
|
||
parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, | ||
verbose=verbose) | ||
out = parallel(delayed(_fit_and_score)( | ||
estimator, X, y, scorer, train, test, verbose, | ||
parameters={param_name: v}, fit_params=None, return_train_score=True) | ||
for train, test in cv_iter for v in param_range) | ||
# NOTE do not change order of iteration to allow one time cv splitters | ||
for train, test in cv.split(X, y, groups) for v in param_range) | ||
|
||
out = np.asarray(out) | ||
n_params = len(param_range) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`return_parameters=False` is unrelated, right? But it seems fine.