from ..utils.random import sample_without_replacement
from ..utils.validation import indexable, check_is_fitted
from ..utils.metaestimators import if_delegate_has_method
-from ..metrics.scorer import check_scoring
+from ..metrics.scorer import check_multimetric_scoring


__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point',
@@ -291,9 +291,11 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

-    scorer : callable or None.
-        If provided must be a scorer callable object / function with signature
-        ``scorer(estimator, X, y)``.
+    scorers : dict
+        A dict mapping the scorer name to the scorer callable.
+
+        If provided, each scorer callable object / function must have the
+        signature ``scorer(estimator, X, y)``.

    verbose : int
        Verbosity level.
@@ -309,21 +311,22 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,

    Returns
    -------
-    score : float
-        Score of this parameter setting on given training / test split.
+    scores : dict
+        A dict mapping the scorer name to its score value for the given
+        parameter setting on the given training / test split.

    parameters : dict
        The parameters that have been evaluated.

    n_samples_test : int
        Number of test samples in this split.
    """
-    score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
-                                              test, verbose, parameters,
-                                              fit_params=fit_params,
-                                              return_n_test_samples=True,
-                                              error_score=error_score)
-    return score, parameters, n_samples_test
+    scores, n_samples_test, _ = _fit_and_score(estimator, X, y, scorers, train,
+                                               test, verbose, parameters,
+                                               fit_params=fit_params,
+                                               return_n_test_samples=True,
+                                               error_score=error_score)
+    return scores, parameters, n_samples_test


def _check_param_grid(param_grid):
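The hunk above changes the contract of ``fit_grid_point`` from a single scorer to a dict of scorers. A minimal sketch of that contract, assuming the multimetric signature shown in this diff (the released ``fit_grid_point`` takes a single scorer callable; the data, scorer names and expected output below are illustrative only):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import make_scorer, accuracy_score, precision_score
from sklearn.model_selection import fit_grid_point, train_test_split
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
# Shuffled index arrays standing in for one CV split.
train, test = train_test_split(np.arange(len(y)), random_state=0)

# The names chosen here ('acc', 'prec') become the keys of the returned dict.
scorers = {'acc': make_scorer(accuracy_score),
           'prec': make_scorer(precision_score, average='macro')}

scores, parameters, n_test = fit_grid_point(
    X, y, SVC(), {'C': 1.0}, train, test, scorers, verbose=0)
print(scores)  # e.g. {'acc': 0.97, 'prec': 0.97}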
@@ -537,7 +540,10 @@ def _fit(self, X, y, groups, parameter_iterable):

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
-        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
+
+        self.scorers_ = check_multimetric_scoring(self.estimator,
+                                                  scoring=self.scoring)
+        multimetric = len(self.scorers_) > 1

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
@@ -565,15 +571,32 @@ def _fit(self, X, y, groups, parameter_iterable):

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
-            (train_scores, test_scores, test_sample_counts,
+            (train_score_dicts, test_score_dicts, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)
        else:
-            (test_scores, test_sample_counts,
+            (test_score_dicts, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

+        # The train_scores and test_scores are each a list of dicts of the
+        # form [{'prec': 0.1, 'acc': 1.0}, {'prec': 0.1, 'acc': 1.0}, ...]
+        # Convert that to a dict of arrays {'prec': np.array([0.1, ...]), ...}
+        def _to_dict_of_scores_array(score_dicts):
+            # Each flat array is reshaped into (n_candidates, n_splits)
+            # later, in _store()
+            np_empty = partial(np.empty, shape=(n_candidates * n_splits,))
+            scores_arr = defaultdict(np_empty)
+            for i, score_dict_i in enumerate(score_dicts):
+                for key in self.scorers_.keys():
+                    scores_arr[key][i] = score_dict_i[key]
+            return dict(scores_arr)
+
+        test_scores = _to_dict_of_scores_array(test_score_dicts)
+        if self.return_train_score:
+            train_scores = _to_dict_of_scores_array(train_score_dicts)
+
        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
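The ``_to_dict_of_scores_array`` helper added above converts the per-(candidate, split) list of score dicts into one flat array per scorer name. A standalone sketch of that conversion, with made-up numbers:

from collections import defaultdict
from functools import partial
import numpy as np

n_candidates, n_splits = 2, 3
score_dicts = [{'prec': 0.1, 'acc': 1.0}] * (n_candidates * n_splits)

np_empty = partial(np.empty, shape=(n_candidates * n_splits,))
scores_arr = defaultdict(np_empty)
for i, score_dict_i in enumerate(score_dicts):
    for key, value in score_dict_i.items():
        scores_arr[key][i] = value

# Each flat array can later be reshaped into (n_candidates, n_splits).
print({k: v.reshape(n_candidates, n_splits) for k, v in scores_arr.items()})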
@@ -582,6 +605,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
+                    # Uses a closure to reference the results dict
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

@@ -597,19 +621,20 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

-        # Computed the (weighted) mean and std for test scores alone
-        # NOTE test_sample counts (weights) remain the same for all candidates
-        test_sample_counts = np.array(test_sample_counts[:n_splits],
-                                      dtype=np.int)
-
-        _store('test_score', test_scores, splits=True, rank=True,
-               weights=test_sample_counts if self.iid else None)
-        _store('train_score', train_scores, splits=True)
-        _store('fit_time', fit_time)
-        _store('score_time', score_time)
-
-        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
-        best_parameters = candidate_params[best_index]
+        for scorer_name in self.scorers_.keys():
+            # Compute the (weighted) mean and std for test scores alone
+            # NOTE test_sample_counts (weights) remain the same for all
+            # candidates
+            test_sample_counts = np.array(test_sample_counts[:n_splits],
+                                          dtype=np.int)
+            _store('test_%s' % scorer_name, test_scores[scorer_name],
+                   splits=True, rank=True,
+                   weights=test_sample_counts if self.iid else None)
+            if self.return_train_score:
+                _store('train_%s' % scorer_name, train_scores[scorer_name],
+                       splits=True)
+        _store('fit_time', fit_time)
+        _store('score_time', score_time)

        # Use one np.MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
@@ -625,6 +650,9 @@ def _store(key_name, array, weights=None, splits=False, rank=False):

        results.update(param_results)

+        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
+        best_parameters = candidate_params[best_index]
+
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

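With per-scorer keys stored above, a single ``rank_test_score`` lookup no longer applies directly when several scorers are used. A hedged sketch of how a per-scorer best index could be derived from such ranks (the names and values below are illustrative, not the PR's final API):

import numpy as np

results = {'rank_test_acc': np.array([2, 1, 3]),
           'rank_test_prec': np.array([1, 3, 2])}
candidate_params = [{'C': 0.1}, {'C': 1.0}, {'C': 10.0}]

best_index = {name.replace('rank_test_', ''): int(np.flatnonzero(rank == 1)[0])
              for name, rank in results.items()}
best_params = {name: candidate_params[i] for name, i in best_index.items()}
print(best_index)   # e.g. {'acc': 1, 'prec': 0}
print(best_params)  # e.g. {'acc': {'C': 1.0}, 'prec': {'C': 0.1}}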
@@ -707,11 +735,18 @@ class GridSearchCV(BaseSearchCV):
        in the list are explored. This enables searching over any sequence
        of parameter settings.

-    scoring : string, callable or None, default=None
-        A string (see model evaluation documentation) or
-        a scorer callable object / function with signature
-        ``scorer(estimator, X, y)``.
-        If ``None``, the ``score`` method of the estimator is used.
+    scoring : string, callable, list, dict or None, optional, default: None
+        A single string (see :ref:`scoring_parameter`) or a callable
+        (see :ref:`scoring`) to evaluate the predictions on the test set.
+
+        For evaluating multiple metrics, either give a list of (unique) strings
+        or a dict with names as keys and callables as values.
+
+        NOTE that when using custom scorers, each scorer should return a single
+        value. Metric functions returning a list/array of values can be wrapped
+        into multiple scorers that return one value each.
+
+        If None, the estimator's default scorer (if available) is used.

    fit_params : dict, optional
        Parameters to pass to the fit method.
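The rewritten ``scoring`` description above accepts either a list of scorer-name strings or a dict mapping names to callables. A sketch of both forms, built from metrics that ship with scikit-learn (the variable names are only for illustration):

from sklearn.metrics import make_scorer, accuracy_score, precision_score

# 1. A list of unique scorer-name strings:
scoring_as_list = ['accuracy', 'precision_macro']

# 2. A dict mapping your own names to callables; each callable must return
#    a single value and follow the scorer(estimator, X, y) signature.
scoring_as_dict = {'acc': make_scorer(accuracy_score),
                   'prec': make_scorer(precision_score, average='macro')}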
@@ -848,35 +883,63 @@ class GridSearchCV(BaseSearchCV):
        'params' : [{'kernel': 'poly', 'degree': 2}, ...],
        }

-        NOTE that the key ``'params'`` is used to store a list of parameter
+        NOTE
+
+        The key ``'params'`` is used to store a list of parameter
        settings dict for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

-    best_estimator_ : estimator
+        For multiple metric evaluation, the scores for all the scorers are
+        available in the ``cv_results_`` dict at the keys ending with that
+        scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` as shown
+        above (e.g. ``'split0_test_precision'``, ``'mean_train_precision'``).
+
+    best_estimator_ : estimator or dict
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if refit=False.

-    best_score_ : float
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict mapping scorer names to the
+        estimator that gave the best score for that scorer.
+
+    best_score_ : float or dict
        Score of best_estimator on the left out data.

-    best_params_ : dict
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict mapping scorer names to the best
+        score for that scorer.
+
+    best_params_ : dict or dict of dicts
        Parameter setting that gave the best results on the hold out data.

-    best_index_ : int
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict of dicts mapping scorer names to
+        the parameter setting that gave the best score for that scorer.
+
+    best_index_ : int or dict
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

-    scorer_ : function
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict mapping scorer names to the index
+        which corresponds to the parameter setting that gave the best score
+        for that scorer.
+
+    scorer_ : function or a dict
        Scorer function used on the held out data to choose the best
        parameters for the model.

+        For multimetric evaluation, this attribute is a dict mapping scorer
+        names to the corresponding scorer functions.
+
    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

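A usage sketch of the multimetric interface documented above, assuming the behaviour described in this PR (per-scorer key suffixes in ``cv_results_``; ``refit=False`` is used to sidestep the question of which scorer to refit on):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), param_grid={'C': [0.1, 1, 10]},
                      scoring=['accuracy', 'precision_macro'],
                      refit=False)
search.fit(X, y)

# Per-scorer suffixes replace the single '_score' suffix, e.g.
# 'mean_test_accuracy', 'rank_test_precision_macro', 'split0_test_accuracy'.
print(sorted(k for k in search.cv_results_ if k.startswith('mean_test_')))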
@@ -1094,35 +1157,63 @@ class RandomizedSearchCV(BaseSearchCV):
        'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
        }

-        NOTE that the key ``'params'`` is used to store a list of parameter
+        NOTE
+
+        The key ``'params'`` is used to store a list of parameter
        settings dict for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

-    best_estimator_ : estimator
+        For multiple metric evaluation, the scores for all the scorers are
+        available in the ``cv_results_`` dict at the keys ending with that
+        scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` as shown
+        above.
+
+    best_estimator_ : estimator or dict
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if refit=False.

-    best_score_ : float
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict mapping scorer names to the
+        estimator that gave the best score for that scorer.
+
+    best_score_ : float or dict
        Score of best_estimator on the left out data.

-    best_params_ : dict
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict mapping scorer names to the best
+        score for that scorer.
+
+    best_params_ : dict or dict of dicts
        Parameter setting that gave the best results on the hold out data.

-    best_index_ : int
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict of dicts mapping scorer names to
+        the parameter setting that gave the best score for that scorer.
+
+    best_index_ : int or dict
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

-    scorer_ : function
+        For multimetric evaluation (when the ``scoring`` parameter is a
+        dict/list), this attribute is a dict mapping scorer names to the index
+        which corresponds to the parameter setting that gave the best score
+        for that scorer.
+
+    scorer_ : function or a dict
        Scorer function used on the held out data to choose the best
        parameters for the model.

+        For multimetric evaluation, this attribute is a dict mapping scorer
+        names to the corresponding scorer functions.
+
    n_splits_ : int
        The number of cross-validation splits (folds/iterations).
