8000 ENH Restructure grid_scores_ into future proof eff. data structure · scikit-learn/scikit-learn@cad5e6c · GitHub
[go: up one dir, main page]

Skip to content

Commit cad5e6c

Browse files
committed
ENH Restructure grid_scores_ into future proof eff. data structure
1 parent 1d487fb commit cad5e6c

File tree

1 file changed

+155
-51
lines changed

1 file changed

+155
-51
lines changed

sklearn/model_selection/_search.py

Lines changed: 155 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -320,24 +320,46 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
320320
return score, parameters, n_samples_test
321321

322322

323-
def _check_param_grid(param_grid):
324-
if hasattr(param_grid, 'items'):
325-
param_grid = [param_grid]
323+
def _check_param_grid_or_dist(param_grid_or_dist):
324+
"""Validate param_grid/distribution and return the unique parameters"""
325+
parameter_names = set()
326326

327-
for p in param_grid:
327+
if hasattr(param_grid_or_dist, 'items'):
328+
param_grid_or_dist = [param_grid_or_dist]
329+
330+
for p in param_grid_or_dist:
328331
for v in p.values():
329332
if isinstance(v, np.ndarray) and v.ndim > 1:
330333
raise ValueError("Parameter array should be one-dimensional.")
331334

332-
check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
333-
if True not in check:
335+
if not isinstance(v, (list, tuple, np.ndarray)):
334336
raise ValueError("Parameter values should be a list.")
335337

336338
if len(v) == 0:
337339
raise ValueError("Parameter values should be a non-empty "
338340
"list.")
339341

342+
parameter_names.update(p.keys())
343+
344+
return list(parameter_names)
345+
346+
347+
def _get_metric_name(scoring):
348+
"""Generate the metric name given the scoring parameter"""
349+
if callable(scoring):
350+
if scoring.__name__ == "_passthrough_scorer":
351+
return "estimator_default_scorer"
352+
else:
353+
return "custom_metric_%s" % (scoring.__name__,)
354+
355+
elif isinstance(scoring, six.string_types):
356+
return scoring
357+
358+
else:
359+
raise ValueError("Unknown metric type - %r" % type(scoring))
340360

361+
362+
# XXX Remove in 0.20
341363
class _CVScoreTuple (namedtuple('_CVScoreTuple',
342364
('parameters',
343365
'mean_validation_score',
@@ -526,6 +548,7 @@ def _fit(self, X, y, labels, parameter_iterable):
526548
estimator = self.estimator
527549
cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
528550
self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
551+
self.metric_name_ = _get_metric_name(self.scorer_)
529552

530553
n_samples = _num_samples(X)
531554
X, y, labels = indexable(X, y, labels)
@@ -560,44 +583,90 @@ def _fit(self, X, y, labels, parameter_iterable):
560583
# Out is a list of triplet: score, estimator, n_test_samples
561584
n_fits = len(out)
562585

563-
scores = list()
564-
grid_scores = list()
565-
for grid_start in range(0, n_fits, n_splits):
566-
n_test_samples = 0
567-
score = 0
568-
all_scores = []
569-
for this_score, this_n_test_samples, _, parameters in \
586+
self.n_candidates_ = int(n_fits / n_splits)
587+
self.n_parameters_ = len(self.parameter_names_)
588+
589+
res_shape = (self.n_candidates_,)
590+
591+
search_results = dict()
592+
593+
for param in self.parameter_names_:
594+
# One column to record the values of each parameter
595+
search_results[param] = np.ma.masked_all(res_shape, dtype=object)
596+
597+
# Let's not initialize this every time; reuse the same array instead.
598+
all_scores = np.empty((n_splits,), dtype=np.float64)
599+
600+
# Loop this when multiple metric support is introduced.
601+
metric = self.metric_name_
602+
603+
# Make a column for each split of each metric
604+
for split_i in range(n_splits):
605+
search_results["%s_split_%s" % (metric, split_i)] = (
606+
np.empty(res_shape, dtype=np.float64))
607+
search_results["%s_mean" % metric] = np.empty(res_shape,
608+
dtype=np.float64)
609+
search_results["%s_rank" % metric] = np.empty(res_shape, dtype=int)
610+
611+
for fit_i, grid_start in enumerate(range(0, n_fits, n_splits)):
612+
n_test_samples_total = 0
613+
mean_score = 0
614+
615+
split_i = -1
616+
for score_i, n_test_samples_i, _, parameters in \
570617
out[grid_start:grid_start + n_splits]:
571-
all_scores.append(this_score)
618+
split_i += 1
619+
# Record the score/n_test_samples for the i-th split
620+
# of the current parameter setting candidate.
621+
all_scores[split_i] = score_i
622+
572623
if self.iid:
573-
this_score *= this_n_test_samples
574-
n_test_samples += this_n_test_samples
575-
score += this_score
624+
score_i *= n_test_samples_i
625+
n_test_samples_total += n_test_samples_i
626+
627+
mean_score += score_i
628+
search_results["%s_split_%s" %
629+
(metric, split_i)][fit_i] = score_i
630+
576631
if self.iid:
577-
score /= float(n_test_samples)
632+
mean_score = all_scores.sum() / float(n_test_samples_total)
578633
else:
579-
score /= float(n_splits)
580-
scores.append((score, parameters))
581-
# TODO: shall we also store the test_fold_sizes?
582-
grid_scores.append(_CVScoreTuple(
583-
parameters,
584-
score,
585-
np.array(all_scores)))
586-
# Store the computed scores
587-
self.grid_scores_ = grid_scores
634+
mean_score = all_scores.mean()
635+
636+
# Store the mean score and the parameters for this fit
637+
search_results["%s_mean" % metric][fit_i] = mean_score
638+
for param in parameters:
639+
# This entry alone gets unmasked when assigned
640+
search_results[param][fit_i] = parameters[param]
588641

589642
# Find the best parameters by comparing on the mean validation score:
590643
# note that `sorted` is deterministic in the way it breaks ties
591-
best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
592-
reverse=True)[0]
593-
self.best_params_ = best.parameters
594-
self.best_score_ = best.mean_validation_score
644+
# We reverse the order to get a descending sort order
645+
sorted_indices = np.argsort(
646+
search_results["%s_mean" % metric])[::-1]
647+
648+
search_results["%s_rank" % metric][sorted_indices] = (
649+
np.arange(1, self.n_parameters_ + 2))
650+
651+
self.search_results_ = search_results
652+
653+
best = sorted_indices[0]
654+
655+
parameters = dict()
656+
657+
for param in self.parameter_names_:
658+
value = search_results[param][best]
659+
if value is not np.ma.masked:
660+
parameters[param] = search_results[param][best]
661+
662+
self.best_params_ = parameters
663+
self.best_score_ = search_results["%s_mean" % metric][best]
595664

596665
if self.refit:
597666
# fit the best estimator using the entire dataset
598667
# clone first to work around broken estimators
599668
best_estimator = clone(base_estimator).set_params(
600-
**best.parameters)
669+
**parameters)
601670
if y is not None:
602671
best_estimator.fit(X, y, **self.fit_params)
603672
else:
@@ -722,15 +791,32 @@ class GridSearchCV(BaseSearchCV):
722791
723792
Attributes
724793
----------
725-
grid_scores_ : list of named tuples
726-
Contains scores for all parameter combinations in param_grid.
727-
Each entry corresponds to one parameter setting.
728-
Each named tuple has the attributes:
729-
730-
* ``parameters``, a dict of parameter settings
731-
* ``mean_validation_score``, the mean score over the
732-
cross-validation folds
733-
* ``cv_validation_scores``, the list of scores for each fold
794+
search_results_ : dict of numpy (masked) ndarrays
795+
A dict with keys as column headers and values as columns, that can be
796+
imported into a pandas DataFrame.
797+
798+
For instance, the table below
799+
800+
kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
801+
=====================================================================
802+
'poly'| - | 2 | 0.8 | 0.81 |
803+
'poly'| - | 3 | 0.7 | 0.60 |
804+
'rbf' | 0.1 | - | 0.8 | 0.75 |
805+
'rbf' | 0.2 | - | 0.9 | 0.82 |
806+
807+
will be represented by a search_results_ dict of :
808+
809+
{'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
810+
mask = [False False False False]...),
811+
'gamma' : masked_array(data = [-- -- 0.1 0.2],
812+
mask = [ True True False False]...),
813+
'degree' : masked_array(data = [2.0 3.0 -- --],
814+
mask = [False False True True]...),
815+
'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
816+
'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
817+
'accuracy_score_mean' : [0.81, 0.60, 0.75, 0.82],
818+
'accuracy_score_rank' : [2, 4, 3, 1],
819+
}
734820
735821
best_estimator_ : estimator
736822
Estimator that was chosen by the search, i.e. estimator
@@ -784,7 +870,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
784870
n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
785871
pre_dispatch=pre_dispatch, error_score=error_score)
786872
self.param_grid = param_grid
787-
_check_param_grid(param_grid)
873+
self.parameter_names_ = _check_param_grid_or_dist(param_grid)
788874

789875
def fit(self, X, y=None, labels=None):
790876
"""Run fit with all sets of parameters.
@@ -918,15 +1004,32 @@ class RandomizedSearchCV(BaseSearchCV):
9181004
9191005
Attributes
9201006
----------
921-
grid_scores_ : list of named tuples
922-
Contains scores for all parameter combinations in param_grid.
923-
Each entry corresponds to one parameter setting.
924-
Each named tuple has the attributes:
925-
926-
* ``parameters``, a dict of parameter settings
927-
* ``mean_validation_score``, the mean score over the
928-
cross-validation folds
929-
* ``cv_validation_scores``, the list of scores for each fold
1007+
search_results_ : dict of numpy (masked) ndarrays
1008+
A dict with keys as column headers and values as columns, that can be
1009+
imported into a pandas DataFrame.
1010+
1011+
For instance, the table below
1012+
1013+
kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
1014+
=====================================================================
1015+
'poly'| - | 2 | 0.8 | 0.81 |
1016+
'poly'| - | 3 | 0.7 | 0.60 |
1017+
'rbf' | 0.1 | - | 0.8 | 0.75 |
1018+
'rbf' | 0.2 | - | 0.9 | 0.82 |
1019+
1020+
will be represented by a search_results_ dict of :
1021+
1022+
{'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
1023+
mask = [False False False False]...),
1024+
'gamma' : masked_array(data = [-- -- 0.1 0.2],
1025+
mask = [ True True False False]...),
1026+
'degree' : masked_array(data = [2.0 3.0 -- --],
1027+
mask = [False False True True]...),
1028+
'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
1029+
'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
1030+
'accuracy_score_mean' : [0.81, 0.60, 0.75, 0.82],
1031+
'accuracy_score_rank' : [2, 4, 3, 1],
1032+
}
9301033
9311034
best_estimator_ : estimator
9321035
Estimator that was chosen by the search, i.e. estimator
@@ -969,6 +1072,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
9691072
error_score='raise'):
9701073

9711074
self.param_distributions = param_distributions
1075+
self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
9721076
self.n_iter = n_iter
9731077
self.random_state = random_state
9741078
super(RandomizedSearchCV, self).__init__(

0 commit comments

Comments
 (0)
0