@@ -320,24 +320,46 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     return score, parameters, n_samples_test
 
 
-def _check_param_grid(param_grid):
-    if hasattr(param_grid, 'items'):
-        param_grid = [param_grid]
+def _check_param_grid_or_dist(param_grid_or_dist):
+    """Validate param_grid/distribution and return unique parameter names"""
+    parameter_names = set()
 
-    for p in param_grid:
+    if hasattr(param_grid_or_dist, 'items'):
+        param_grid_or_dist = [param_grid_or_dist]
+
+    for p in param_grid_or_dist:
         for v in p.values():
             if isinstance(v, np.ndarray) and v.ndim > 1:
                 raise ValueError("Parameter array should be one-dimensional.")
 
-            check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-            if True not in check:
+            if not isinstance(v, (list, tuple, np.ndarray)):
                 raise ValueError("Parameter values should be a list.")
 
             if len(v) == 0:
                 raise ValueError("Parameter values should be a non-empty "
                                  "list.")
 
+        parameter_names.update(p.keys())
+
+    return list(parameter_names)
+
+
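+# A rough usage sketch of the helper above (hypothetical inputs, for
+# illustration only; the order of the returned names is not guaranteed,
+# since a set is used internally):
+#
+#     _check_param_grid_or_dist({'kernel': ['rbf', 'poly'],
+#                                'gamma': [0.1, 1.0]})
+#     # -> ['kernel', 'gamma'] (in some order)
+#     _check_param_grid_or_dist({'gamma': np.array([[0.1], [1.0]])})
+#     # -> ValueError: the parameter array is not one-dimensional
+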
+def _get_metric_name(scoring):
+    """Generate the metric name given the scoring parameter"""
+    if callable(scoring):
+        if scoring.__name__ == "_passthrough_scorer":
+            return "estimator_default_scorer"
+        else:
+            return "custom_metric_%s" % (scoring.__name__,)
+
+    elif isinstance(scoring, six.string_types):
+        return scoring
+
+    else:
+        raise ValueError("Unknown metric type - %r" % type(scoring))
 
+
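+# For illustration (hedged: the callable branch assumes the scoring
+# object exposes a ``__name__``, e.g. a plain function):
+#
+#     _get_metric_name('accuracy')    # -> 'accuracy'
+#     _get_metric_name(my_loss_func)  # -> 'custom_metric_my_loss_func'
+#                                     #    (``my_loss_func`` is hypothetical)
+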
+# XXX Remove in 0.20
 class _CVScoreTuple(namedtuple('_CVScoreTuple',
                                ('parameters',
                                 'mean_validation_score',
@@ -526,6 +548,7 @@ def _fit(self, X, y, labels, parameter_iterable):
         estimator = self.estimator
         cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
         self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+        self.metric_name_ = _get_metric_name(self.scorer_)
 
         n_samples = _num_samples(X)
         X, y, labels = indexable(X, y, labels)
@@ -560,44 +583,90 @@ def _fit(self, X, y, labels, parameter_iterable):
         # Out is a list of triplet: score, estimator, n_test_samples
         n_fits = len(out)
 
-        scores = list()
-        grid_scores = list()
-        for grid_start in range(0, n_fits, n_splits):
-            n_test_samples = 0
-            score = 0
-            all_scores = []
-            for this_score, this_n_test_samples, _, parameters in \
+        self.n_candidates_ = int(n_fits / n_splits)
+        self.n_parameters_ = len(self.parameter_names_)
+
+        res_shape = (self.n_candidates_,)
+
+        search_results = dict()
+
+        for param in self.parameter_names_:
+            # One column to record the values of each parameter
+            search_results[param] = np.ma.masked_all(res_shape, dtype=object)
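+            # (For illustration: ``np.ma.masked_all((3,), dtype=object)``
+            # renders as [-- -- --]; assigning an entry unmasks only that
+            # entry, so parameters a candidate never set stay masked.)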
+
+        # Let's not initialize this every time; reuse the same array.
+        all_scores = np.empty((n_splits,), dtype=np.float64)
+
+        # Loop over this once multiple-metric support is introduced.
+        metric = self.metric_name_
+
+        # Make a column for each split of each metric
+        for split_i in range(n_splits):
+            search_results["%s_split_%s" % (metric, split_i)] = (
+                np.empty(res_shape, dtype=np.float64))
+        search_results["%s_mean" % metric] = np.empty(res_shape,
+                                                      dtype=np.float64)
+        search_results["%s_rank" % metric] = np.empty(res_shape, dtype=int)
+
+        for fit_i, grid_start in enumerate(range(0, n_fits, n_splits)):
+            n_test_samples_total = 0
+            mean_score = 0
+
+            split_i = -1
+            for score_i, n_test_samples_i, _, parameters in \
                     out[grid_start:grid_start + n_splits]:
-                all_scores.append(this_score)
+                split_i += 1
+                # Record the raw score/n_test_samples for the i-th split
+                # of the current parameter setting candidate.
+                all_scores[split_i] = score_i
+                search_results["%s_split_%s" %
+                               (metric, split_i)][fit_i] = score_i
+
                 if self.iid:
-                    this_score *= this_n_test_samples
-                    n_test_samples += this_n_test_samples
-                score += this_score
+                    score_i *= n_test_samples_i
+                    n_test_samples_total += n_test_samples_i
+
+                mean_score += score_i
+
             if self.iid:
-                score /= float(n_test_samples)
+                # mean_score accumulated the sample-weighted fold scores
+                mean_score /= float(n_test_samples_total)
             else:
-                score /= float(n_splits)
-            scores.append((score, parameters))
-            # TODO: shall we also store the test_fold_sizes?
-            grid_scores.append(_CVScoreTuple(
-                parameters,
-                score,
-                np.array(all_scores)))
-        # Store the computed scores
-        self.grid_scores_ = grid_scores
+                mean_score = all_scores.mean()
+
+            # Store the mean score and the parameters for this fit
+            search_results["%s_mean" % metric][fit_i] = mean_score
+            for param in parameters:
+                # This entry alone gets unmasked when assigned
+                search_results[param][fit_i] = parameters[param]
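+
+            # e.g. with hypothetical numbers: fold scores [0.8, 0.9] on
+            # 40/60 test samples give (0.8*40 + 0.9*60) / 100 = 0.86 when
+            # iid=True, versus the unweighted mean 0.85 otherwise.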
 
         # Find the best parameters by comparing on the mean validation score:
         # note that `sorted` is deterministic in the way it breaks ties
-        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
-                      reverse=True)[0]
-        self.best_params_ = best.parameters
-        self.best_score_ = best.mean_validation_score
+        # We reverse the order to get a descending sort order
+        sorted_indices = np.argsort(
+            search_results["%s_mean" % metric])[::-1]
+
+        # Ranks run from 1 (best) to n_candidates_ (worst)
+        search_results["%s_rank" % metric][sorted_indices] = (
+            np.arange(1, self.n_candidates_ + 1))
+
+        self.search_results_ = search_results
+
+        best = sorted_indices[0]
+
+        parameters = dict()
+
+        for param in self.parameter_names_:
+            value = search_results[param][best]
+            if value is not np.ma.masked:
+                parameters[param] = value
+
+        self.best_params_ = parameters
+        self.best_score_ = search_results["%s_mean" % metric][best]
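+
+        # Illustration with hypothetical numbers: for mean scores
+        # [0.81, 0.60, 0.75, 0.82], ``np.argsort(...)[::-1]`` yields the
+        # candidate order [3, 0, 2, 1]; the rank column then reads
+        # [2, 4, 3, 1] and candidate 3 is picked as ``best``.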
 
         if self.refit:
             # fit the best estimator using the entire dataset
             # clone first to work around broken estimators
             best_estimator = clone(base_estimator).set_params(
-                **best.parameters)
+                **parameters)
             if y is not None:
                 best_estimator.fit(X, y, **self.fit_params)
             else:
@@ -722,15 +791,32 @@ class GridSearchCV(BaseSearchCV):
 
     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, which can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a ``search_results_`` dict of:
+
+        {'kernel' : masked_array(data=['poly', 'poly', 'rbf', 'rbf'],
+                                 mask=[False False False False]...),
+         'gamma'  : masked_array(data=[-- -- 0.1 0.2],
+                                 mask=[True True False False]...),
+         'degree' : masked_array(data=[2.0 3.0 -- --],
+                                 mask=[False False True True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank'    : [2, 4, 3, 1],
+         }
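+
+        Such a dict of equal-length columns loads directly into pandas
+        (a sketch; assumes pandas is installed and ``clf`` is a fitted
+        ``GridSearchCV``)::
+
+            import pandas as pd
+            df = pd.DataFrame(clf.search_results_)
+            df = df.sort_values('accuracy_score_rank')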
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -784,7 +870,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score)
         self.param_grid = param_grid
-        _check_param_grid(param_grid)
+        self.parameter_names_ = _check_param_grid_or_dist(param_grid)
 
     def fit(self, X, y=None, labels=None):
         """Run fit with all sets of parameters.
@@ -918,15 +1004,32 @@ class RandomizedSearchCV(BaseSearchCV):
 
     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, which can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        kernel|gamma|degree|accuracy_score_split_0...|accuracy_score_mean ...|
+        =====================================================================
+        'poly'|  -  |  2   |          0.8            |        0.81           |
+        'poly'|  -  |  3   |          0.7            |        0.60           |
+        'rbf' | 0.1 |  -   |          0.8            |        0.75           |
+        'rbf' | 0.2 |  -   |          0.9            |        0.82           |
+
+        will be represented by a ``search_results_`` dict of:
+
+        {'kernel' : masked_array(data=['poly', 'poly', 'rbf', 'rbf'],
+                                 mask=[False False False False]...),
+         'gamma'  : masked_array(data=[-- -- 0.1 0.2],
+                                 mask=[True True False False]...),
+         'degree' : masked_array(data=[2.0 3.0 -- --],
+                                 mask=[False False True True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank'    : [2, 4, 3, 1],
+         }
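+
+        Masked entries mark the parameters that a given candidate did not
+        sample; for example (a sketch, with ``search`` a fitted
+        ``RandomizedSearchCV``)::
+
+            search.search_results_['gamma'][0] is np.ma.masked  # -> True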
 
     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -969,6 +1072,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  error_score='raise'):
 
         self.param_distributions = param_distributions
+        self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(