@@ -321,24 +321,57 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
         return score, parameters, n_samples_test


-def _check_param_grid(param_grid):
-    if hasattr(param_grid, 'items'):
-        param_grid = [param_grid]
+def _check_param_grid_or_dist(param_grid_or_dist):
+    """Validate param_grid/distribution and return the unique parameters"""
+    parameter_names = set()

-    for p in param_grid:
+    if hasattr(param_grid_or_dist, 'items'):
+        param_grid_or_dist = [param_grid_or_dist]
+
+    for p in param_grid_or_dist:
         for v in p.values():
+            # NOTE distribution objects (e.g. scipy.stats distributions)
+            # expose ``rvs`` for sampling; they are not fixed value grids,
+            # so the sequence checks below do not apply to them
+            if hasattr(v, 'rvs'):
+                continue
+
             if isinstance(v, np.ndarray) and v.ndim > 1:
                 raise ValueError("Parameter array should be one-dimensional.")

-            check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-            if True not in check:
+            if not isinstance(v, (list, tuple, np.ndarray)):
                 raise ValueError("Parameter values should be a list.")

             if len(v) == 0:
                 raise ValueError("Parameter values should be a non-empty "
                                  "list.")

+        parameter_names.update(p.keys())
+
+    return list(parameter_names)
+
+
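A quick usage sketch of the validator above; the grid values are illustrative, and note that the order of the returned names is arbitrary because they pass through a set:

```python
import numpy as np

# A grid may be one dict or a list of dicts; values must be 1-D sequences,
# or (for RandomizedSearchCV) distribution objects exposing ``rvs``
param_grid = [{'kernel': ['poly'], 'degree': [2, 3]},
              {'kernel': ['rbf'], 'gamma': np.array([0.1, 0.2])}]

names = _check_param_grid_or_dist(param_grid)
sorted(names)  # ['degree', 'gamma', 'kernel']
```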
+def _get_metric_names(scoring):
+    """Generate the list of metric name(s) given the scoring parameter"""
+    metric_names = list()
+    # XXX Do we index from 0?
+    # NOTE we need this to prevent collisions between similarly named
+    # custom metrics (e.g. [foo.bar, bar])
+    n_custom_metrics = 1

+    if not isinstance(scoring, (list, tuple)):
+        scoring = [scoring]
+
+    for metric in scoring:
+        if metric is None:
+            # NOTE this branch is an assumption: scoring defaults to None,
+            # which falls back to the estimator's own ``score`` method, so
+            # name that column "score" rather than raising below
+            metric_names.append("score")
+
+        elif callable(metric):
+            metric_names.append("custom_metric_%s_%s" %
+                                (n_custom_metrics, metric.__name__))
+            n_custom_metrics += 1
+
+        elif isinstance(metric, six.string_types):
+            metric_names.append(metric)
+
+        else:
+            raise ValueError("Unknown metric type - %r" % type(metric))
+
+    return metric_names
+
+
+# XXX Remove in 0.20
 class _CVScoreTuple(namedtuple('_CVScoreTuple',
                                ('parameters',
                                 'mean_validation_score',
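To illustrate the naming scheme, a sketch of the helper on a mixed `scoring` value; `my_scorer` is a hypothetical callable, and the generated name follows the `custom_metric_%s_%s` format above:

```python
def my_scorer(estimator, X, y):
    # Hypothetical callable metric; only its __name__ feeds the column name
    return estimator.score(X, y)

_get_metric_names('accuracy')
# -> ['accuracy']
_get_metric_names(['accuracy', my_scorer])
# -> ['accuracy', 'custom_metric_1_my_scorer']
```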
@@ -381,6 +414,7 @@ def __init__(self, estimator, scoring=None,
         self.verbose = verbose
         self.pre_dispatch = pre_dispatch
         self.error_score = error_score
+        self.metric_names_ = _get_metric_names(scoring)

     @property
     def _estimator_type(self):
@@ -521,6 +555,12 @@ def inverse_transform(self, Xt):
         """
         return self.best_estimator_.transform(Xt)

+    @property
+    @deprecated("The grid_scores_ attribute is deprecated in favor of the "
+                "search_results_ attribute and will be removed in version "
+                "0.20.")
+    def grid_scores_(self):
+        return self._grid_scores
+
     def _fit(self, X, y, labels, parameter_iterable):
         """Actual fitting, performing the search over parameters."""
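A minimal check of the deprecation path, assuming `deprecated` here is `sklearn.utils.deprecated` (which emits a `DeprecationWarning` on access) and a small toy problem:

```python
import warnings
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
search = GridSearchCV(SVC(), {'C': [1, 10]}).fit(iris.data, iris.target)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    scores = search.grid_scores_  # still returns self._grid_scores
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```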
@@ -561,38 +601,67 @@ def _fit(self, X, y, labels, parameter_iterable):
         # Out is a list of triplet: score, estimator, n_test_samples
         n_fits = len(out)

-        scores = list()
-        grid_scores = list()
-        for grid_start in range(0, n_fits, n_splits):
-            n_test_samples = 0
-            score = 0
-            all_scores = []
-            for this_score, this_n_test_samples, _, parameters in \
-                    out[grid_start:grid_start + n_splits]:
-                all_scores.append(this_score)
+        self._grid_scores = list()
+
+        # XXX Do we want to store these?
+        n_candidates = n_fits // n_splits
+        n_parameters = len(self.parameter_names_)
+        n_metrics = len(self.metric_names_)
+
+        search_results_ = dict()
+
+        for param in self.parameter_names_:
+            search_results_[param] = np.empty((n_candidates,), dtype=object)
+
+        for metric in self.metric_names_:
+            # Make a column for each split
+            # XXX To make it future proof
+            for split_i in range(n_splits):
+                search_results_["%s_split_%s" % (metric, split_i)] = (
+                    np.empty((n_candidates,), dtype=np.float32))
+
+            search_results_["%s_aggregated" % metric] = (
+                np.empty((n_candidates,), dtype=np.float32))
+            search_results_["%s_rank" % metric] = np.empty((n_candidates,),
+                                                           dtype=int)
+
+        # XXX Loop over metrics when multiple metric support is enabled
+        metric = self.metric_names_[0]
+
+        for candidate_i, grid_start in enumerate(range(0, n_fits,
+                                                       n_splits)):
+            n_test_samples = 0
+            aggregated_score = 0
+            all_scores = []
+
+            for i, (this_score, this_n_test_samples, _, parameters) in \
+                    enumerate(out[grid_start:grid_start + n_splits]):
+                all_scores.append(this_score)
+
+                if self.iid:
+                    this_score *= this_n_test_samples
+                    n_test_samples += this_n_test_samples
+                aggregated_score += this_score
+                search_results_["%s_split_%s" % (metric, i)][candidate_i] = (
+                    this_score)

             if self.iid:
-                this_score *= this_n_test_samples
-                n_test_samples += this_n_test_samples
-                score += this_score
-            if self.iid:
-                score /= float(n_test_samples)
-            else:
-                score /= float(n_splits)
-            scores.append((score, parameters))
-            # TODO: shall we also store the test_fold_sizes?
-            grid_scores.append(_CVScoreTuple(
+                aggregated_score /= float(n_test_samples)
+            else:
+                aggregated_score /= float(n_splits)
+
+            search_results_["%s_aggregated" % metric][candidate_i] = (
+                aggregated_score)
+
+            # XXX Remove in version 0.20
+            self._grid_scores.append(_CVScoreTuple(
                 parameters,
-                score,
+                aggregated_score,
                 np.array(all_scores)))
-        # Store the computed scores
-        self.grid_scores_ = grid_scores

-        # Find the best parameters by comparing on the mean validation score:
-        # note that `sorted` is deterministic in the way it breaks ties
-        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
-                      reverse=True)[0]
-        self.best_params_ = best.parameters
-        self.best_score_ = best.mean_validation_score
+        # Rank the candidates from best (rank 1) to worst; mergesort keeps
+        # the ranking deterministic in the way it breaks ties
+        order = np.argsort(-search_results_["%s_aggregated" % metric],
+                           kind='mergesort')
+        search_results_["%s_rank" % metric][order] = np.arange(
+            1, n_candidates + 1)
+        self.search_results_ = search_results_
+
+        # Find the best parameters by comparing on the mean validation score:
+        # note that `sorted` is deterministic in the way it breaks ties
+        best = sorted(self._grid_scores, key=lambda x: x.mean_validation_score,
+                      reverse=True)[0]
+        self.best_params_ = best.parameters
+        self.best_score_ = best.mean_validation_score

         if self.refit:
             # fit the best estimator using the entire dataset
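To make the ranking bookkeeping above concrete, here is a standalone sketch with three candidates and a single hypothetical metric named `accuracy_score`; the column names follow the `%s_aggregated` / `%s_rank` format strings used in `_fit`:

```python
import numpy as np

n_candidates = 3
aggregated = np.array([0.75, 0.82, 0.60], dtype=np.float32)
search_results = {'accuracy_score_aggregated': aggregated}

# argsort of the negated scores orders candidates best-first; mergesort
# keeps tie-breaking deterministic
order = np.argsort(-aggregated, kind='mergesort')
ranks = np.empty(n_candidates, dtype=int)
ranks[order] = np.arange(1, n_candidates + 1)
search_results['accuracy_score_rank'] = ranks
# ranks == [2, 1, 3]: the candidate scoring 0.82 is ranked first
```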
@@ -723,15 +792,32 @@ class GridSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a ``search_results_`` dict of:
+
+        {'kernel'  : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                  mask = [False False False False]...),
+         'gamma'   : masked_array(data = [-- -- 0.1 0.2],
+                                  mask = [ True  True False False]...),
+         'degree'  : masked_array(data = [2.0 3.0 -- --],
+                                  mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'candidate_rank'         : [2, 4, 3, 1],
+        }

     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
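Since the docstring promises DataFrame compatibility, a usage sketch under the assumption that pandas is installed; the dict literal is a toy stand-in for a fitted search's `search_results_`:

```python
import numpy as np
import pandas as pd

# Toy stand-in for search_results_; parameter columns would really be
# numpy masked arrays, masked where a parameter does not apply
search_results = {
    'kernel': ['poly', 'poly', 'rbf', 'rbf'],
    'accuracy_score_mean': np.array([0.81, 0.60, 0.75, 0.82]),
    'candidate_rank': np.array([2, 4, 3, 1]),
}

df = pd.DataFrame(search_results)
print(df.sort_values('candidate_rank'))  # inspect candidates best-first
```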
@@ -785,7 +871,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score)
         self.param_grid = param_grid
-        _check_param_grid(param_grid)
+        self.parameter_names_ = _check_param_grid_or_dist(param_grid)

     def fit(self, X, y=None, labels=None):
         """Run fit with all sets of parameters.
@@ -919,15 +1005,32 @@ class RandomizedSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, that can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a ``search_results_`` dict of:
+
+        {'kernel'  : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                  mask = [False False False False]...),
+         'gamma'   : masked_array(data = [-- -- 0.1 0.2],
+                                  mask = [ True  True False False]...),
+         'degree'  : masked_array(data = [2.0 3.0 -- --],
+                                  mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'candidate_rank'         : [2, 4, 3, 1],
+        }

     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -970,6 +1073,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  error_score='raise'):

         self.param_distributions = param_distributions
+        self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(
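With the ``rvs`` escape hatch in `_check_param_grid_or_dist`, the same validator can cover `param_distributions`; a sketch assuming scipy is available:

```python
from scipy.stats import expon

param_distributions = {
    'kernel': ['rbf', 'poly'],  # a plain list is validated as usual
    'C': expon(scale=10),       # has .rvs, so the sequence checks are skipped
}

names = _check_param_grid_or_dist(param_distributions)
# names contains 'C' and 'kernel' (in arbitrary order)
```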