1 parent f9c6344 commit 3dcb873
sklearn/model_selection/_search.py
@@ -12,7 +12,7 @@
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
-from collections import Mapping, namedtuple, Sized, defaultdict, Sequence
+from collections import Mapping, namedtuple, defaultdict, Sequence
from functools import partial, reduce
from itertools import product
import operator
@@ -532,25 +532,41 @@ def inverse_transform(self, Xt):
        self._check_is_fitted('inverse_transform')
        return self.best_estimator_.transform(Xt)

-    def _fit(self, X, y, groups, parameter_iterable):
-        """Actual fitting, performing the search over parameters."""
+    def fit(self, X, y=None, groups=None):
+        """Run fit with all sets of parameters.
+
+        Parameters
+        ----------
+
+        X : array-like, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+            Target relative to X for classification or regression;
+            None for unsupervised learning.
+
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset into
+            train/test set.
+        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
-        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
-            n_candidates = len(parameter_iterable)
+        # Regenerate parameter iterable for each fit
+        candidate_params = list(self._get_param_iterator())
+        n_candidates = len(candidate_params)
+        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch
-        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
@@ -559,28 +575,25 @@ def _fit(self, X, y, groups, parameter_iterable):
                                  fit_params=self.fit_params,
                                  return_train_score=self.return_train_score,
                                  return_n_test_samples=True,
-                                  return_times=True, return_parameters=True,
+                                  return_times=True, return_parameters=False,
                                  error_score=self.error_score)
-          for parameters in parameter_iterable
-          for train, test in cv_iter)
+          for train, test in cv.split(X, y, groups)
+          for parameters in candidate_params)

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
-            (train_scores, test_scores, test_sample_counts,
-             fit_time, score_time, parameters) = zip(*out)
+            (train_scores, test_scores, test_sample_counts, fit_time,
+             score_time) = zip(*out)
        else:
-            (test_scores, test_sample_counts,
-             fit_time, score_time, parameters) = zip(*out)
-
-        candidate_params = parameters[::n_splits]
-        n_candidates = len(candidate_params)
+            (test_scores, test_sample_counts, fit_time, score_time) = zip(*out)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
-            array = np.array(array, dtype=np.float64).reshape(n_candidates,
-                                                              n_splits)
+            # When iterated first by splits, then by parameters
+            array = np.array(array, dtype=np.float64).reshape(n_splits,
+                                                              n_candidates).T
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
@@ -600,7 +613,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
        # Computed the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
-        test_sample_counts = np.array(test_sample_counts[:n_splits],
+        test_sample_counts = np.array(test_sample_counts[::n_candidates],
                                      dtype=np.int)

        _store('test_score', test_scores, splits=True, rank=True,
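Aside (not part of the commit): because the parallel loop above now iterates over splits in the outer loop and over candidate parameters in the inner loop, each consecutive block of n_candidates results belongs to one split. A minimal NumPy sketch of why the reshape is transposed and why a [::n_candidates] stride picks one test-set size per split; all values are made up for illustration:

import numpy as np

n_splits, n_candidates = 3, 2

# Flattened scores in the new iteration order: all candidates for
# split 0, then all candidates for split 1, and so on.
flat_scores = np.array([10., 11.,   # split 0: candidate 0, candidate 1
                        20., 21.,   # split 1
                        30., 31.])  # split 2

# reshape(n_splits, n_candidates).T gives one row per candidate,
# which is the layout the cv_results_ helper expects.
print(flat_scores.reshape(n_splits, n_candidates).T)
# [[10. 20. 30.]
#  [11. 21. 31.]]

# The test-set size of a split is identical for every candidate, so
# one value per split suffices: take every n_candidates-th entry.
flat_counts = np.array([50, 50, 40, 40, 45, 45])
print(flat_counts[::n_candidates])
# [50 40 45]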
@@ -924,25 +937,9 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
        self.param_grid = param_grid
        _check_param_grid(param_grid)

-    def fit(self, X, y=None, groups=None):
-        """Run fit with all sets of parameters.
-
-        Parameters
-        ----------
-
-        X : array-like, shape = [n_samples, n_features]
-            Training vector, where n_samples is the number of samples and
-            n_features is the number of features.
-
-        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
-            Target relative to X for classification or regression;
-            None for unsupervised learning.
-
-        groups : array-like, with shape (n_samples,), optional
-            Group labels for the samples used while splitting the dataset into
-            train/test set.
-        """
-        return self._fit(X, y, groups, ParameterGrid(self.param_grid))
+    def _get_param_iterator(self):
+        """Return ParameterGrid instance for the given param_grid"""
+        return ParameterGrid(self.param_grid)

class RandomizedSearchCV(BaseSearchCV):
@@ -1167,24 +1164,8 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score)

-    def fit(self, X, y=None, groups=None):
-        """Run fit on the estimator with randomly drawn parameters.
-
-        Parameters
-        ----------
-
-        X : array-like, shape = [n_samples, n_features]
-            Training vector, where n_samples in the number of samples and
-            n_features is the number of features.
-
-        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
-            Target relative to X for classification or regression;
-            None for unsupervised learning.
-
-        groups : array-like, with shape (n_samples,), optional
-            Group labels for the samples used while splitting the dataset into
-            train/test set.
-        """
-        sampled_params = ParameterSampler(self.param_distributions,
-                                          self.n_iter,
-                                          random_state=self.random_state)
-        return self._fit(X, y, groups, sampled_params)
+    def _get_param_iterator(self):
+        """Return ParameterSampler instance for the given distributions"""
+        return ParameterSampler(
+            self.param_distributions, self.n_iter,
+            random_state=self.random_state)
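Aside (not part of the commit): after this refactor GridSearchCV and RandomizedSearchCV differ only in what _get_param_iterator returns; the shared BaseSearchCV.fit consumes whatever iterable of parameter dicts it yields, so a custom search subclass would only need to override that one method. A small sketch of what the two iterators produce, using the public ParameterGrid and ParameterSampler helpers with made-up parameter ranges:

from scipy.stats import randint
from sklearn.model_selection import ParameterGrid, ParameterSampler

# What GridSearchCV._get_param_iterator() returns: every combination
# of the supplied values, in a deterministic order.
grid = ParameterGrid({'C': [1, 10], 'kernel': ['linear', 'rbf']})
for params in grid:
    print(params)   # four dicts, e.g. {'C': 1, 'kernel': 'linear'}

# What RandomizedSearchCV._get_param_iterator() returns: n_iter
# settings drawn from the given distributions, reproducible via
# random_state.
sampler = ParameterSampler({'C': randint(1, 100)}, n_iter=3,
                           random_state=0)
for params in sampler:
    print(params)   # three dicts with randomly drawn 'C' values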
sklearn/model_selection/_validation.py
@@ -128,7 +128,6 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
-    cv_iter = list(cv.split(X, y, groups))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
@@ -137,7 +136,7 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
-                      for train, test in cv_iter)
+                      for train, test in cv.split(X, y, groups))
    return np.array(scores)[:, 0]

@@ -385,7 +384,6 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
-    cv_iter = list(cv.split(X, y, groups))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
@@ -398,7 +396,7 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
-        for train, test in cv_iter)
+        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]
@@ -752,8 +750,9 @@ def learning_curve(estimator, X, y, groups=None,
-    # Make a list since we will be iterating multiple times over the folds
+    # Store it as list as we will be iterating over the list multiple times
    cv_iter = list(cv.split(X, y, groups))

    n_max_training_samples = len(cv_iter[0][0])
@@ -961,16 +960,15 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None,
    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                        verbose=verbose)
    out = parallel(delayed(_fit_and_score)(
        estimator, X, y, scorer, train, test, verbose,
        parameters={param_name: v}, fit_params=None, return_train_score=True)
-        for train, test in cv_iter for v in param_range)
+        # NOTE do not change order of iteration to allow one time cv splitters
+        for train, test in cv.split(X, y, groups) for v in param_range)

    out = np.asarray(out)
    n_params = len(param_range)
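Aside (not part of the commit): the NOTE about iteration order exists because a CV splitter may only be able to produce its splits once, for example when they come from a stream or an external source. Keeping the split loop outermost means cv.split() is called exactly once and every (train, test) pair is reused for all parameter values before moving on. A toy sketch of such a splitter; the class and its behaviour are hypothetical and only illustrate the constraint:

import numpy as np


class OneTimeSplitter(object):
    """Hypothetical splitter whose split() can only be consumed once."""

    def __init__(self, n_splits, n_samples):
        self.n_splits = n_splits
        self.n_samples = n_samples
        self._used = False

    def split(self, X=None, y=None, groups=None):
        if self._used:
            raise RuntimeError("split() has already been consumed")
        self._used = True
        indices = np.arange(self.n_samples)
        for test in np.array_split(indices, self.n_splits):
            yield np.setdiff1d(indices, test), test

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits


cv = OneTimeSplitter(n_splits=3, n_samples=9)
param_range = [0.1, 1.0, 10.0]

# Split loop outermost: cv.split() is consumed exactly once, and each
# (train, test) pair serves every parameter value.  Reversing the loops
# would call cv.split() once per value and fail on the second call.
for train, test in cv.split():
    for value in param_range:
        pass  # a clone of the estimator would be fit and scored here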