ENH Restructure grid_scores_ into future proof eff. data structure · scikit-learn/scikit-learn@413404e · GitHub
Commit 413404e

ENH Restructure grid_scores_ into future proof eff. data structure

1 parent a6c3d1f commit 413404e
File tree: 1 file changed, +156 −52 lines changed

sklearn/model_selection/_search.py

Lines changed: 156 additions & 52 deletions
@@ -321,24 +321,57 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     return score, parameters, n_samples_test
 
 
-def _check_param_grid(param_grid):
-    if hasattr(param_grid, 'items'):
-        param_grid = [param_grid]
+def _check_param_grid_or_dist(param_grid_or_dist):
+    """Validate param_grid/distribution and return the unique parameters"""
+    parameter_names = set()
 
-    for p in param_grid:
+    if hasattr(param_grid_or_dist, 'items'):
+        param_grid_or_dist = [param_grid_or_dist]
+
+    for p in param_grid_or_dist:
         for v in p.values():
             if isinstance(v, np.ndarray) and v.ndim > 1:
                 raise ValueError("Parameter array should be one-dimensional.")
 
-            check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-            if True not in check:
+            if not isinstance(v, (list, tuple, np.ndarray)):
                 raise ValueError("Parameter values should be a list.")
 
             if len(v) == 0:
                 raise ValueError("Parameter values should be a non-empty "
                                  "list.")
 
+        parameter_names.update(p.keys())
+
+    return list(parameter_names)
+
+
+def _get_metric_names(scoring):
+    """Generate the list of metric name(s) given the scoring parameter"""
+    metric_names = list()
+    # XXX Do we index from 0?
+    # NOTE we need this to prevent collisions between similarly named
+    # custom metrics (e.g. [foo.bar, bar])
+    n_custom_metrics = 1
 
+    if not isinstance(scoring, (list, tuple)):
+        scoring = [scoring]
+
+    for metric in scoring:
+        if callable(metric):
+            metric_names.append("custom_metric_%s_%s" %
+                                (n_custom_metrics, metric.__name__))
+            n_custom_metrics += 1
+
+        elif isinstance(metric, six.string_types):
+            metric_names.append(metric)
+
+        else:
+            raise ValueError("Unknown metric type - %r" % type(metric))
+
+    return metric_names
+
+
+# XXX Remove in 0.20
 class _CVScoreTuple (namedtuple('_CVScoreTuple',
                                 ('parameters',
                                  'mean_validation_score',
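To make the first hunk concrete, here is a minimal, standalone sketch of the validation and name-collection behaviour that _check_param_grid_or_dist implements. The grid below is invented for illustration; only numpy is assumed.

    import numpy as np

    # A param_grid may be a dict or a list of dicts; each value must be a
    # non-empty, one-dimensional sequence of candidate settings.
    param_grid = [{'kernel': ['poly'], 'degree': [2, 3]},
                  {'kernel': ['rbf'], 'gamma': [0.1, 0.2]}]

    parameter_names = set()
    grids = [param_grid] if hasattr(param_grid, 'items') else param_grid
    for p in grids:
        for v in p.values():
            if isinstance(v, np.ndarray) and v.ndim > 1:
                raise ValueError("Parameter array should be one-dimensional.")
            if not isinstance(v, (list, tuple, np.ndarray)):
                raise ValueError("Parameter values should be a list.")
            if len(v) == 0:
                raise ValueError("Parameter values should be a non-empty list.")
        parameter_names.update(p.keys())

    print(sorted(parameter_names))   # ['degree', 'gamma', 'kernel']

The returned names later become the parameter columns of search_results_, one object-dtype column per unique parameter across all sub-grids.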
@@ -381,6 +414,7 @@ def __init__(self, estimator, scoring=None,
         self.verbose = verbose
         self.pre_dispatch = pre_dispatch
         self.error_score = error_score
+        self.metric_names_ = _get_metric_names(scoring)
 
     @property
     def _estimator_type(self):
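The constructor now precomputes metric_names_. For reference, a standalone re-implementation of the naming rule from _get_metric_names, with a made-up callable my_scorer to show the collision-avoiding prefix; Python 3 str stands in for six.string_types.

    def get_metric_names(scoring):
        if not isinstance(scoring, (list, tuple)):
            scoring = [scoring]
        names, n_custom = [], 1
        for metric in scoring:
            if callable(metric):
                # Callables get a numbered "custom_metric_<n>_<name>" key
                names.append("custom_metric_%s_%s" % (n_custom, metric.__name__))
                n_custom += 1
            elif isinstance(metric, str):     # six.string_types on Python 2
                names.append(metric)
            else:
                raise ValueError("Unknown metric type - %r" % type(metric))
        return names

    def my_scorer(estimator, X, y):           # hypothetical custom metric
        return 0.0

    print(get_metric_names('accuracy'))                # ['accuracy']
    print(get_metric_names(['accuracy', my_scorer]))
    # ['accuracy', 'custom_metric_1_my_scorer']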
@@ -521,6 +555,12 @@ def inverse_transform(self, Xt):
         """
         return self.best_estimator_.transform(Xt)
 
+    @property
+    @deprecated("The grid_scores_ attribute is deprecated in favor of the "
+                "search_results_ attribute and will be removed in version "
+                "0.20.")
+    def grid_scores_(self):
+        return self._grid_scores
+
     def _fit(self, X, y, labels, parameter_iterable):
         """Actual fitting, performing the search over parameters."""
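The new property keeps the old attribute readable while steering users to search_results_. A minimal sketch of the same deprecation pattern using only the standard library; the class name here is illustrative, not sklearn's.

    import warnings

    class Searcher(object):
        def __init__(self):
            self._grid_scores = []            # filled in during _fit

        @property
        def grid_scores_(self):
            warnings.warn("The grid_scores_ attribute is deprecated in favor "
                          "of the search_results_ attribute and will be "
                          "removed in version 0.20.", DeprecationWarning)
            return self._grid_scores

    s = Searcher()
    _ = s.grid_scores_    # emits a DeprecationWarning, still returns the data

sklearn's own deprecated decorator additionally patches the docstring; the warning-on-access behaviour is the same idea.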
@@ -561,38 +601,67 @@ def _fit(self, X, y, labels, parameter_iterable):
         # Out is a list of triplet: score, estimator, n_test_samples
         n_fits = len(out)
 
-        scores = list()
-        grid_scores = list()
-        for grid_start in range(0, n_fits, n_splits):
-            n_test_samples = 0
-            score = 0
-            all_scores = []
-            for this_score, this_n_test_samples, _, parameters in \
-                    out[grid_start:grid_start + n_splits]:
-                all_scores.append(this_score)
-                if self.iid:
-                    this_score *= this_n_test_samples
-                    n_test_samples += this_n_test_samples
-                score += this_score
-            if self.iid:
-                score /= float(n_test_samples)
-            else:
-                score /= float(n_splits)
-            scores.append((score, parameters))
-            # TODO: shall we also store the test_fold_sizes?
-            grid_scores.append(_CVScoreTuple(
-                parameters,
-                score,
-                np.array(all_scores)))
-        # Store the computed scores
-        self.grid_scores_ = grid_scores
+        self._grid_scores = list()
+
+        # XXX Do we want to store these?
+        n_candidates = n_fits // n_splits
+        n_parameters = len(self.parameter_names_)
+        n_metrics = len(self.metric_names_)
+
+        search_results_ = dict()
+
+        for param in self.parameter_names_:
+            search_results_[param] = np.empty((n_candidates,), dtype=object)
+
+        for metric in self.metric_names_:
+            # Make a column for each split
+            # XXX To make it future proof
+            for split_i in range(n_splits):
+                search_results_["%s_split_%s" % (metric, split_i)] = (
+                    np.empty((n_candidates,), dtype=np.float32))
+
+            search_results_["%s_aggregated" % metric] = np.empty(
+                (n_candidates,), dtype=np.float32)
+            search_results_["%s_rank" % metric] = np.empty((n_candidates,),
+                                                           dtype=int)
+
+        for grid_start in range(0, n_fits, n_splits):
+            cand_i = grid_start // n_splits
+            n_test_samples = 0
+            aggregated_score = 0
+            all_scores = []
+
+            # XXX Loop this when multiple metric support is enabled
+            for i, (this_score, this_n_test_samples, _, parameters) in \
+                    enumerate(out[grid_start:grid_start + n_splits]):
+                all_scores.append(this_score)
+
+                if self.iid:
+                    this_score *= this_n_test_samples
+                    n_test_samples += this_n_test_samples
+                aggregated_score += this_score
+                search_results_["%s_split_%s" % (metric, i)][cand_i] = \
+                    this_score
+
+            if self.iid:
+                aggregated_score /= float(n_test_samples)
+            else:
+                aggregated_score /= float(n_splits)
+
+            search_results_["%s_aggregated" % metric][cand_i] = \
+                aggregated_score
+
+            # XXX Remove in version 0.20
+            self._grid_scores.append(_CVScoreTuple(
+                parameters,
+                aggregated_score,
+                np.array(all_scores)))
 
-        # Find the best parameters by comparing on the mean validation score:
-        # note that `sorted` is deterministic in the way it breaks ties
-        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
-                      reverse=True)[0]
-        self.best_params_ = best.parameters
-        self.best_score_ = best.mean_validation_score
+        # Rank 1 is the candidate with the best aggregated score;
+        # argsort of the descending argsort yields per-candidate ranks.
+        search_results_["%s_rank" % metric] = np.argsort(
+            np.argsort(-search_results_["%s_aggregated" % metric])) + 1
+
+        # Find the best parameters by comparing on the mean validation score:
+        # note that `sorted` is deterministic in the way it breaks ties
+        best = sorted(self._grid_scores,
+                      key=lambda x: x.mean_validation_score,
+                      reverse=True)[0]
+        self.best_params_ = best.parameters
+        self.best_score_ = best.mean_validation_score
 
         if self.refit:
             # fit the best estimator using the entire dataset
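To see the new bookkeeping end to end, here is a self-contained sketch of how the per-split columns, the aggregated column, and the rank column relate. The fold scores are invented; a single metric and no iid weighting are assumed.

    import numpy as np

    n_splits, n_candidates = 2, 4
    metric = "accuracy_score"

    # One row per candidate, one column per CV split (made-up scores).
    fold_scores = np.array([[0.80, 0.82],
                            [0.70, 0.50],
                            [0.80, 0.70],
                            [0.90, 0.78]], dtype=np.float32)

    search_results_ = {}
    for split_i in range(n_splits):
        # "Future proof": each split gets its own named column.
        search_results_["%s_split_%s" % (metric, split_i)] = \
            fold_scores[:, split_i]

    aggregated = fold_scores.mean(axis=1)
    search_results_["%s_aggregated" % metric] = aggregated

    # argsort of the descending argsort converts sort order into
    # per-candidate ranks, with rank 1 for the best aggregated score.
    search_results_["%s_rank" % metric] = \
        np.argsort(np.argsort(-aggregated)) + 1

    print(search_results_["%s_rank" % metric])    # [2 4 3 1]

With iid=True the aggregation would instead weight each fold score by its number of test samples, as the loop in the hunk above does.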
@@ -723,15 +792,32 @@ class GridSearchCV(BaseSearchCV):
 
     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas DataFrame.
+
+        For instance the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a search_results_ dict of:
+
+        {'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                 mask = [False False False False]...),
+         'gamma'  : masked_array(data = [-- -- 0.1 0.2],
+                                 mask = [ True  True False False]...),
+         'degree' : masked_array(data = [2.0 3.0 -- --],
+                                 mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'candidate_rank'         : [2, 4, 3, 1],
+        }
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
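Since the docstring promises the dict can be imported into a pandas DataFrame, a quick sketch of that round trip; the values mirror the example table above, and pandas turns masked entries into NaN.

    import numpy as np
    import numpy.ma as ma
    import pandas as pd

    search_results_ = {
        'kernel': ma.masked_array(['poly', 'poly', 'rbf', 'rbf'],
                                  mask=[False, False, False, False]),
        'gamma': ma.masked_array([0.0, 0.0, 0.1, 0.2],
                                 mask=[True, True, False, False]),
        'degree': ma.masked_array([2.0, 3.0, 0.0, 0.0],
                                  mask=[False, False, True, True]),
        'accuracy_score_split_0': np.array([0.8, 0.7, 0.8, 0.9]),
        'accuracy_score_split_1': np.array([0.82, 0.5, 0.7, 0.78]),
        'accuracy_score_mean': np.array([0.81, 0.60, 0.75, 0.82]),
        'candidate_rank': np.array([2, 4, 3, 1]),
    }

    df = pd.DataFrame(search_results_)                # masked values -> NaN
    print(df.sort_values('candidate_rank').head(1))   # the best candidate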
@@ -785,7 +871,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score)
         self.param_grid = param_grid
-        _check_param_grid(param_grid)
+        self.parameter_names_ = _check_param_grid_or_dist(param_grid)
 
     def fit(self, X, y=None, labels=None):
         """Run fit with all sets of parameters.
@@ -919,15 +1005,32 @@ class RandomizedSearchCV(BaseSearchCV):
 
     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can be
+        imported into a pandas DataFrame.
+
+        For instance the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a search_results_ dict of:
+
+        {'kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                 mask = [False False False False]...),
+         'gamma'  : masked_array(data = [-- -- 0.1 0.2],
+                                 mask = [ True  True False False]...),
+         'degree' : masked_array(data = [2.0 3.0 -- --],
+                                 mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'candidate_rank'         : [2, 4, 3, 1],
+        }
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -970,6 +1073,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
             error_score='raise'):
 
         self.param_distributions = param_distributions
+        self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(
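A closing note on why the parameter columns are masked arrays rather than plain ones: a candidate sampled from one sub-grid or distribution simply has no value for another sub-grid's parameters, and a mask records that absence without inventing a filler value. A small sketch (values invented):

    import numpy.ma as ma

    # 'degree' applies only to the two 'poly' candidates.
    degree = ma.masked_array([2.0, 3.0, 0.0, 0.0],
                             mask=[False, False, True, True])

    print(degree)          # [2.0 3.0 -- --]
    print(degree.mean())   # 2.5; masked entries are ignored in aggregates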

0 commit comments