@@ -320,24 +320,46 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     return score, parameters, n_samples_test


-def _check_param_grid(param_grid):
-    if hasattr(param_grid, 'items'):
-        param_grid = [param_grid]
+def _check_param_grid_or_dist(param_grid_or_dist):
+    """Validate param_grid/distribution and return the unique parameter names"""
+    parameter_names = set()

-    for p in param_grid:
+    if hasattr(param_grid_or_dist, 'items'):
+        param_grid_or_dist = [param_grid_or_dist]
+
+    for p in param_grid_or_dist:
         for v in p.values():
+            if hasattr(v, 'rvs'):
+                # scipy.stats distributions are sampled via rvs, not
+                # enumerated, so they are exempt from the list checks below.
+                continue
+
             if isinstance(v, np.ndarray) and v.ndim > 1:
                 raise ValueError("Parameter array should be one-dimensional.")

-            check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-            if True not in check:
+            if not isinstance(v, (list, tuple, np.ndarray)):
                 raise ValueError("Parameter values should be a list.")

             if len(v) == 0:
                 raise ValueError("Parameter values should be a non-empty "
                                  "list.")

+        parameter_names.update(p.keys())
+
+    return list(parameter_names)
+
+
+def _get_metric_name(scoring):
+    """Generate the metric name given the scoring parameter"""
+    if callable(scoring):
+        # Scorer objects (e.g. those built by make_scorer) need not define
+        # a __name__; fall back to the class name in that case.
+        name = getattr(scoring, '__name__', type(scoring).__name__)
+        if name == "_passthrough_scorer":
+            return "estimator_default_scorer"
+        else:
+            return "custom_metric_%s" % (name,)
+
+    elif isinstance(scoring, six.string_types):
+        return scoring
+
+    else:
+        raise ValueError("Unknown metric type - %r" % type(scoring))


+# XXX Remove in 0.20
 class _CVScoreTuple(namedtuple('_CVScoreTuple',
                                ('parameters',
                                 'mean_validation_score',
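
As a quick sanity check of the new validator, here is a hypothetical REPL sketch (assuming the patched sklearn.grid_search module is importable); it shows that both a single dict and a list of dicts are accepted, and that the union of parameter names comes back:

    from sklearn.grid_search import _check_param_grid_or_dist

    param_grid = [{'kernel': ['poly'], 'degree': [2, 3]},
                  {'kernel': ['rbf'], 'gamma': [0.1, 0.2]}]

    # A set is used internally, so the returned order is arbitrary.
    sorted(_check_param_grid_or_dist(param_grid))
    # -> ['degree', 'gamma', 'kernel']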
@@ -526,6 +548,7 @@ def _fit(self, X, y, labels, parameter_iterable):
         estimator = self.estimator
         cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
         self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+        # Prefer the user-supplied scoring (e.g. a metric string); fall back
+        # to the resolved scorer only when scoring was left as None.
+        scoring = self.scoring if self.scoring is not None else self.scorer_
+        self.metric_name_ = _get_metric_name(scoring)

         n_samples = _num_samples(X)
         X, y, labels = indexable(X, y, labels)
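
For orientation, what ``metric_name_`` resolves to under this scheme (a hypothetical sketch, again assuming the patched module):

    from sklearn.grid_search import _get_metric_name

    def my_metric(estimator, X, y):    # a user-defined scoring callable
        return 0.0

    _get_metric_name('accuracy')       # -> 'accuracy'
    _get_metric_name(my_metric)        # -> 'custom_metric_my_metric'
    # scoring=None resolves to _passthrough_scorer, which is reported
    # as 'estimator_default_scorer'.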
@@ -560,44 +583,91 @@ def _fit(self, X, y, labels, parameter_iterable):
         # Out is a list of triplet: score, estimator, n_test_samples
         n_fits = len(out)

-        scores = list()
-        grid_scores = list()
-        for grid_start in range(0, n_fits, n_splits):
-            n_test_samples = 0
-            score = 0
-            all_scores = []
-            for this_score, this_n_test_samples, _, parameters in \
+        self.n_candidates_ = int(n_fits / n_splits)
+        self.n_parameters_ = len(self.parameter_names_)
+
+        res_shape = (self.n_candidates_,)
+
+        search_results = dict()
+
+        # Allocate this buffer once and reuse it for every candidate.
+        all_scores = np.empty((n_splits,), dtype=np.float64)
+
+        # Loop over the metric names here once multiple-metric support
+        # is introduced.
+        metric = self.metric_name_
+
+        for param in self.parameter_names_:
+            # One column to record the values of each parameter
+            search_results["param_%s" % param] = (
+                np.ma.masked_all(res_shape, dtype=object))
+
+        # Make a column for each split of each metric
+        for split_i in range(n_splits):
+            search_results["%s_split_%s" % (metric, split_i)] = (
+                np.empty(res_shape, dtype=np.float64))
+        search_results["%s_mean" % metric] = np.empty(res_shape,
+                                                      dtype=np.float64)
+        search_results["%s_rank" % metric] = np.empty(res_shape, dtype=int)
+
+        for fit_i, grid_start in enumerate(range(0, n_fits, n_splits)):
+            n_test_samples_total = 0
+            mean_score = 0
+
+            split_i = -1
+            for score_i, n_test_samples_i, _, parameters in \
                     out[grid_start:grid_start + n_splits]:
-                all_scores.append(this_score)
+                split_i += 1
+                # Record the raw (unweighted) score for the i-th split
+                # of the current parameter setting candidate.
+                all_scores[split_i] = score_i
+                search_results["%s_split_%s" %
+                               (metric, split_i)][fit_i] = score_i
+
                 if self.iid:
-                    this_score *= this_n_test_samples
-                    n_test_samples += this_n_test_samples
-                score += this_score
+                    # Weight each split by its number of test samples
+                    score_i *= n_test_samples_i
+                    n_test_samples_total += n_test_samples_i
+
+                mean_score += score_i
+
             if self.iid:
-                score /= float(n_test_samples)
+                mean_score /= float(n_test_samples_total)
             else:
-                score /= float(n_splits)
-            scores.append((score, parameters))
-            # TODO: shall we also store the test_fold_sizes?
-            grid_scores.append(_CVScoreTuple(
-                parameters,
-                score,
-                np.array(all_scores)))
-        # Store the computed scores
-        self.grid_scores_ = grid_scores
+                mean_score = all_scores.mean()
+
+            # Store the mean score and the parameters for this fit
+            search_results["%s_mean" % metric][fit_i] = mean_score
+            for param in parameters:
+                # This entry alone gets unmasked when assigned
+                search_results["param_%s" % param][fit_i] = parameters[param]

         # Find the best parameters by comparing on the mean validation score:
-        # note that `sorted` is deterministic in the way it breaks ties
-        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
-                      reverse=True)[0]
-        self.best_params_ = best.parameters
-        self.best_score_ = best.mean_validation_score
+        # note that np.argsort is deterministic for a given input array.
+        # We reverse the order to get a descending sort order
+        sorted_indices = np.argsort(
+            search_results["%s_mean" % metric])[::-1]
+
+        # One rank per candidate, 1 being the best
+        search_results["%s_rank" % metric][sorted_indices] = (
+            np.arange(1, self.n_candidates_ + 1))
+
+        self.search_results_ = search_results
+
+        best = sorted_indices[0]
+
+        parameters = dict()
+
+        for param in self.parameter_names_:
+            value = search_results["param_%s" % param][best]
+            if value is not np.ma.masked:
+                parameters[param] = value
+
+        self.best_params_ = parameters
+        self.best_score_ = search_results["%s_mean" % metric][best]

         if self.refit:
             # fit the best estimator using the entire dataset
             # clone first to work around broken estimators
             best_estimator = clone(base_estimator).set_params(
-                **best.parameters)
+                **parameters)
             if y is not None:
                 best_estimator.fit(X, y, **self.fit_params)
             else:
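
The rank assignment above is the usual argsort inversion; a minimal standalone sketch (numpy only, with scores borrowed from the docstring example below):

    import numpy as np

    means = np.array([0.81, 0.60, 0.75, 0.82])   # one mean score per candidate
    sorted_indices = np.argsort(means)[::-1]     # best candidate first
    ranks = np.empty(len(means), dtype=int)
    ranks[sorted_indices] = np.arange(1, len(means) + 1)
    print(ranks)                                 # [2 4 3 1]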
@@ -722,15 +792,32 @@ class GridSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, which can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        param_kernel|param_gamma|param_degree|accuracy_score_split_0...|accuracy_score_mean...|
+        =======================================================================================
+          'poly'    |     -     |      2     |           0.8           |         0.81         |
+          'poly'    |     -     |      3     |           0.7           |         0.60         |
+          'rbf'     |    0.1    |      -     |           0.8           |         0.75         |
+          'rbf'     |    0.2    |      -     |           0.9           |         0.82         |
+
+        would be represented by a ``search_results_`` dict of:
+
+        {'param_kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                       mask = [False False False False]...),
+         'param_gamma'  : masked_array(data = [-- -- 0.1 0.2],
+                                       mask = [ True  True False False]...),
+         'param_degree' : masked_array(data = [2.0 3.0 -- --],
+                                       mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank'    : [2, 4, 3, 1],
+        }

     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
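
Because every value in search_results_ is a column of length n_candidates_, the pandas round-trip promised in the docstring is a one-liner (a usage sketch; `search` stands for any fitted GridSearchCV, and a reasonably recent pandas is assumed):

    import pandas as pd

    df = pd.DataFrame(search.search_results_)  # masked entries become NaN
    df.sort_values('accuracy_score_rank')      # best candidate in the first row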
@@ -784,7 +871,7 @@ def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
             n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score)
         self.param_grid = param_grid
-        _check_param_grid(param_grid)
+        self.parameter_names_ = _check_param_grid_or_dist(param_grid)

     def fit(self, X, y=None, labels=None):
         """Run fit with all sets of parameters.
@@ -918,15 +1005,32 @@ class RandomizedSearchCV(BaseSearchCV):

     Attributes
     ----------
-    grid_scores_ : list of named tuples
-        Contains scores for all parameter combinations in param_grid.
-        Each entry corresponds to one parameter setting.
-        Each named tuple has the attributes:
-
-        * ``parameters``, a dict of parameter settings
-        * ``mean_validation_score``, the mean score over the
-          cross-validation folds
-        * ``cv_validation_scores``, the list of scores for each fold
+    search_results_ : dict of numpy (masked) ndarrays
+        A dict with keys as column headers and values as columns, which can
+        be imported into a pandas ``DataFrame``.
+
+        For instance, the table below
+
+        param_kernel|param_gamma|param_degree|accuracy_score_split_0...|accuracy_score_mean...|
+        =======================================================================================
+          'poly'    |     -     |      2     |           0.8           |         0.81         |
+          'poly'    |     -     |      3     |           0.7           |         0.60         |
+          'rbf'     |    0.1    |      -     |           0.8           |         0.75         |
+          'rbf'     |    0.2    |      -     |           0.9           |         0.82         |
+
+        would be represented by a ``search_results_`` dict of:
+
+        {'param_kernel' : masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                       mask = [False False False False]...),
+         'param_gamma'  : masked_array(data = [-- -- 0.1 0.2],
+                                       mask = [ True  True False False]...),
+         'param_degree' : masked_array(data = [2.0 3.0 -- --],
+                                       mask = [False False  True  True]...),
+         'accuracy_score_split_0' : [0.8, 0.7, 0.8, 0.9],
+         'accuracy_score_split_1' : [0.82, 0.5, 0.7, 0.78],
+         'accuracy_score_mean'    : [0.81, 0.60, 0.75, 0.82],
+         'accuracy_score_rank'    : [2, 4, 3, 1],
+        }

     best_estimator_ : estimator
         Estimator that was chosen by the search, i.e. estimator
@@ -969,6 +1073,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                  error_score='raise'):

         self.param_distributions = param_distributions
+        self.parameter_names_ = _check_param_grid_or_dist(param_distributions)
         self.n_iter = n_iter
         self.random_state = random_state
         super(RandomizedSearchCV, self).__init__(
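
Taken together, the changed constructors and `_fit` would be exercised end to end roughly like this (an illustrative sketch only: the dataset, estimator and settings are placeholders, and the scipy-distribution guard added to `_check_param_grid_or_dist` above is assumed):

    from scipy.stats import expon
    from sklearn.datasets import load_iris
    from sklearn.svm import SVC
    from sklearn.grid_search import RandomizedSearchCV

    iris = load_iris()
    search = RandomizedSearchCV(SVC(),
                                {'C': expon(scale=10), 'gamma': [0.01, 0.1]},
                                n_iter=5, scoring='accuracy', random_state=0)
    search.fit(iris.data, iris.target)

    print(search.metric_name_)             # 'accuracy'
    print(sorted(search.search_results_))  # param_* and accuracy_* columns
    print(search.best_params_, search.best_score_)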