ENH: Add support for Multimetric BayesSearchCV by QuentinSoubeyran · Pull Request #1062 · scikit-optimize/scikit-optimize · GitHub
This repository was archived by the owner on Feb 28, 2024. It is now read-only.

ENH: Add support for Multimetric BayesSearchCV #1062

Open
wants to merge 4 commits into master
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,6 +1,9 @@
*.py[co]
*~
.ipynb_checkpoints/
venv
.venv
.python-version
# created by installing scikit-learn from git
src/*
build/
@@ -19,3 +22,6 @@ doc/auto_examples

# vim users
.*.swp

# vscode users
.vscode
6 changes: 6 additions & 0 deletions doc/whats_new/v0.9.rst
@@ -26,3 +26,9 @@ Version 0.9.0
for scikit-learn >= 1.0. :pr:`1063`
- Minor documentation improvements.
- Various small bugs and fixes.
:pr:`988`

Version 0.9.1
=============
- |Feature| Add support for multimetric scoring to :obj:`skopt.searchcv.BayesSearchCV`.
:pr:`1062`
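
As a rough illustration of the feature announced in this entry (not part of the patch itself; the dataset, search space and iteration count below are arbitrary placeholders), a multi-metric search with this change applied might look like:

from sklearn.datasets import make_classification
from sklearn.svm import SVC
from skopt import BayesSearchCV
from skopt.space import Real

X, y = make_classification(n_classes=2, random_state=0)
opt = BayesSearchCV(
    SVC(),
    {"C": Real(1e-3, 1e3, prior="log-uniform")},
    scoring=["accuracy", "f1"],   # multiple metrics are now accepted
    refit="f1",                   # the metric that guides the optimization
    n_iter=8,
)
opt.fit(X, y)
print(opt.cv_results_["mean_test_accuracy"])
print(opt.cv_results_["mean_test_f1"])
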
138 changes: 113 additions & 25 deletions skopt/searchcv.py
@@ -1,28 +1,25 @@
import warnings

try:
from collections.abc import Sized
except ImportError:
from collections import Sized

import numpy as np
from scipy.stats import rankdata

from sklearn.model_selection._search import BaseSearchCV
from sklearn.utils import check_random_state

from sklearn.utils.validation import check_is_fitted
try:
from sklearn.metrics import check_scoring
except ImportError:
from sklearn.metrics.scorer import check_scoring

from . import Optimizer
from .utils import point_asdict, dimensions_aslist, eval_callbacks
from .space import check_dimension
from .callbacks import check_callback


def _get_score_names(cv_results, *, kind="test"):
prefix = f"mean_{kind}_"
return {key[len(prefix):]
for key in cv_results.keys()
if key.startswith(prefix)}
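# Illustration (not part of the patch): for a multimetric cv_results dict such
# as {"mean_test_accuracy": [...], "mean_test_f1": [...], "std_test_f1": [...]},
# this helper returns {"accuracy", "f1"}; with kind="train" it inspects the
# "mean_train_" keys instead.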

class BayesSearchCV(BaseSearchCV):
"""Bayesian optimization over hyper parameters.

@@ -78,11 +75,24 @@ class BayesSearchCV(BaseSearchCV):
``{'base_estimator': 'RF'}`` would use a Random Forest surrogate
instead of the default Gaussian Process.

scoring : string, callable or None, default=None
A string (see model evaluation documentation) or
a scorer callable object / function with signature
``scorer(estimator, X, y)``.
If ``None``, the ``score`` method of the estimator is used.
scoring : str, callable, list, tuple or dict, default=None
Strategy to evaluate the performance of the cross-validated model on
the test set. If ``None``, the ``score`` method of the estimator is
used.

If `scoring` represents a single score, one can use:

- a single string (see :ref:`scoring_parameter`);
- a callable (see :ref:`scoring`) that returns a single value.

If `scoring` represents multiple scores, one can use:

- a list or tuple of unique strings;
- a callable returning a dictionary where the keys are the metric
names and the values are the metric scores;
- a dictionary with metric names as keys and callables as values.

Callables must have the signature ``scorer(estimator, X, y=None)``.

fit_params : dict, optional
Parameters to pass to the fit method.
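
For reference, a hedged sketch of the multi-metric ``scoring`` forms described in the docstring above (the metric names and the callable are illustrative, not taken from the patch):

scoring_list = ["accuracy", "f1"]                # list of unique scorer strings
scoring_dict = {"acc": "accuracy", "f1": "f1"}   # metric name -> scorer

def scoring_callable(estimator, X, y=None):      # callable returning a dict of scores
    y_pred = estimator.predict(X)
    return {"acc": float((y_pred == y).mean())}
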
@@ -124,10 +134,14 @@ class BayesSearchCV(BaseSearchCV):
either binary or multiclass, :class:`StratifiedKFold` is used. In all
other cases, :class:`KFold` is used.

refit : boolean, default=True
refit : bool, str, default=True
Refit the best estimator with the entire dataset.
If "False", it is impossible to make predictions using
this RandomizedSearchCV instance after fitting.
this BayesSearchCV instance after fitting.

For multiple metric evaluation, this needs to be a `str` denoting the
scorer used to direct the optimization process and to select the best
parameters for refitting the estimator at the end.

verbose : integer
Controls the verbosity: the higher, the more messages.
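
A sketch of the constraint spelled out above (the estimator and search space are placeholders): with several metrics, leaving ``refit`` at its default of ``True`` is expected to be rejected during ``fit``, since no single scorer is designated to drive the optimization.

from sklearn.svm import SVC
from skopt import BayesSearchCV
from skopt.space import Real

opt = BayesSearchCV(
    SVC(),
    {"C": Real(1e-3, 1e3, prior="log-uniform")},
    scoring=["accuracy", "f1"],  # multiple metrics ...
    refit=True,                  # ... but no single scorer named
)
# opt.fit(X, y)  # expected to raise ValueError (see _check_refit_for_multimetric)
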
@@ -258,13 +272,21 @@ class BayesSearchCV(BaseSearchCV):
n_splits_ : int
The number of cross-validation splits (folds/iterations).

refit_time_ : float
Seconds used for refitting the best model on the whole dataset.

This is present only if ``refit`` is not False.

multimetric_ : bool
Whether or not the scorers compute several metrics.

Notes
-----
The parameters selected are those that maximize the score of the held-out
data, according to the scoring parameter.

If `n_jobs` was set to a value higher than one, the data is copied for each
parameter setting(and not `n_jobs` times). This is done for efficiency
parameter setting (and not `n_jobs` times). This is done for efficiency
reasons if individual jobs take very little time, but may raise errors if
the dataset is large and not enough memory is available. A workaround in
this case is to set `pre_dispatch`. Then, the memory is copied only
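
Picking up the ``refit_time_`` and ``multimetric_`` attributes documented above, a fitted multi-metric search (such as ``opt`` from the earlier sketch) would expose them roughly as:

print(opt.multimetric_)  # True when several metrics were computed
print(opt.refit_time_)   # seconds spent refitting on the whole dataset
print(opt.best_score_)   # mean CV score of the refit metric for the best params
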
@@ -393,7 +415,8 @@ def _make_optimizer(self, params_space):

return optimizer

def _step(self, search_space, optimizer, evaluate_candidates, n_points=1):
def _step(self, search_space, optimizer, score_name,
evaluate_candidates, n_points=1):
"""Generate n_jobs parameters and evaluate them in parallel.
"""
# get parameter values to evaluate
@@ -406,10 +429,38 @@ def _step(self, search_space, optimizer, evaluate_candidates, n_points=1):
params_dict = [point_asdict(search_space, p) for p in params]

all_results = evaluate_candidates(params_dict)

# if self.scoring is a callable, we have to wait until here
# to get the score name
if score_name is None:
score_names = _get_score_names(all_results)
if len(score_names) > 1:
# multimetric case
# early check to fail before lengthy computations, as
# BaseSearchCV only performs this check *after* _run_search
self._check_refit_for_multimetric(score_names)
score_name = f"mean_test_{self.refit}"
elif len(score_names) == 1:
# single metric, or a callable self.scoring returning a dict
# with a single value
# In both cases, we just use the score that is available
score_name = f"mean_test_{score_names.pop()}"
else:
# failsafe, shouldn't happen
raise ValueError(
"No score was detected after fitting. This is probably "
"due to a callable 'scoring' returning an empty dict."
)

# Feed the point and objective value back into optimizer
# Optimizer minimizes objective, hence provide negative score
local_results = all_results["mean_test_score"][-len(params):]
return optimizer.tell(params, [-score for score in local_results])
local_results = all_results[score_name][-len(params):]
# also return score_name so the caller can cache it when self.scoring is
# a callable; this avoids re-deriving it from cv_results on every step
return (
optimizer.tell(params, [-score for score in local_results]),
score_name
)

@property
def total_iterations(self):
@@ -463,14 +514,24 @@ def fit(self, X, y=None, *, groups=None, callback=None, **fit_params):
else:
self.optimizer_kwargs_ = dict(self.optimizer_kwargs)

if callable(self.refit):
raise ValueError("BayesSearchCV doesn't support a callable refit, "
"as it doesn't define an implicit score to "
"optimize")

super().fit(X=X, y=y, groups=groups, **fit_params)

# BaseSearchCV never ranked train scores,
# but apparently we used to ship this (back-compat)
if self.return_train_score:
self.cv_results_["rank_train_score"] = \
rankdata(-np.array(self.cv_results_["mean_train_score"]),
method='min').astype(int)
for score in _get_score_names(self.cv_results_, kind="train"):
self.cv_results_[f"rank_train_{score}"] = (
rankdata(
-np.array(self.cv_results_[f"mean_train_{score}"]),
method='min'
)
.astype(int)
)
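# e.g. with return_train_score=True and scoring=["accuracy", "f1"], the loop
# above adds "rank_train_accuracy" and "rank_train_f1" to cv_results_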
return self

def _run_search(self, evaluate_candidates):
@@ -484,6 +545,16 @@ def _run_search(self, evaluate_candidates):
random_state = check_random_state(self.random_state)
self.optimizer_kwargs_['random_state'] = random_state

# Adapted from BaseSearchCV fit() method
if callable(self.scoring):
# will be determined later
score_name = None
elif self.scoring is None or isinstance(self.scoring, str):
score_name = "mean_test_score"
else:
# proper checking took place before in BaseSearchCV.fit()
score_name = f"mean_test_{self.refit}"

# Instantiate optimizers for all the search spaces.
optimizers = []
for search_space in search_spaces:
@@ -509,12 +580,29 @@
# when n_iter < n_points points left for evaluation
n_points_adjusted = min(n_iter, n_points)

optim_result = self._step(
search_space, optimizer,
optim_result, score_name = self._step(
search_space, optimizer, score_name,
evaluate_candidates, n_points=n_points_adjusted
)
n_iter -= n_points

if eval_callbacks(callbacks, optim_result):
break
self._optim_results.append(optim_result)

def _check_refit_for_multimetric(self, scores):
"""Check `refit` is compatible with `scores` and valid"""
# override parent method to exclude False and callables
multimetric_refit_msg = (
"For multi-metric scoring, the 'refit' parameter must be set to a "
"scorer key, used to guide the bayesian optimization process "
"and refit an estimator with the best parameter settings on the "
"whole dataset (making the best_* attributes available for that "
f" metric). {self.refit!r} was passed."
)

is_refit_valid = (isinstance(self.refit, str) and
self.refit in scores)

if not is_refit_valid:
raise ValueError(multimetric_refit_msg)
98 changes: 98 additions & 0 deletions skopt/tests/test_searchcv.py
@@ -4,13 +4,16 @@

import pytest

import sklearn as skl
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.base import BaseEstimator
from packaging.version import parse as parse_version
from scipy.stats import rankdata
import numpy as np
from numpy.testing import assert_array_equal
@@ -463,3 +466,98 @@ def score(self, X, y):
)

model.fit(X, y)


def test_searchcv_multimetric_scoring():
# test that multi-metric scoring works as intended
# for BayesSearchCV
random_state = 42

X, y = make_classification(n_classes=2, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=0.75, random_state=0
)
# test iterable scoring
opt = BayesSearchCV(
SVC(random_state=random_state),
{
'C': Real(1e-6, 1e+6, prior='log-uniform'),
'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
'degree': Integer(1, 8),
'kernel': Categorical(['linear', 'poly', 'rbf']),
},
scoring=["accuracy", "f1"],
refit="f1",
n_iter=11,
random_state=random_state
)
opt.fit(X_train, y_train)
y_pred = opt.predict(X_test)
assert f1_score(y_test, y_pred) > 0.9
assert (
len(opt.cv_results_["mean_test_accuracy"])
== len(opt.cv_results_["mean_test_f1"])
)

# test dict scoring
opt = BayesSearchCV(
SVC(random_state=random_state),
{
'C': Real(1e-6, 1e+6, prior='log-uniform'),
'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
'degree': Integer(1, 8),
'kernel': Categorical(['linear', 'poly', 'rbf']),
},
scoring={
"f1": "f1",
"accuracy": "accuracy",
},
refit="f1",
n_iter=11,
random_state=random_state
)
opt.fit(X_train, y_train)
y_pred = opt.predict(X_test)
assert f1_score(y_test, y_pred) > 0.9
assert (
len(opt.cv_results_["mean_test_accuracy"])
== len(opt.cv_results_["mean_test_f1"])
)


@pytest.mark.skipif(parse_version(skl.__version__) < parse_version("0.24"),
reason="requires sklearn>=0.24")
def test_searchcv_multimetric_callable_scoring():
random_state = 42

X, y = make_classification(n_classes=2, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=0.75, random_state=0
)

# sample code taken from scikit-learn
def confusion_matrix_score(clf, X, y):
y_pred = clf.predict(X)
cm = confusion_matrix(y, y_pred)
return {'tn': cm[0, 0], 'fp': cm[0, 1],
'fn': cm[1, 0], 'tp': cm[1, 1]}

opt = BayesSearchCV(
SVC(random_state=random_state),
{
'C': Real(1e-6, 1e+6, prior='log-uniform'),
'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
'degree': Integer(1, 8),
'kernel': Categorical(['linear', 'poly', 'rbf']),
},
scoring=confusion_matrix_score,
refit="tp",
n_iter=11,
random_state=random_state
)
opt.fit(X_train, y_train)
assert confusion_matrix_score(opt, X_test, y_test)["tp"] > 0.9
assert (
len(opt.cv_results_["mean_test_tp"])
== len(opt.cv_results_["mean_test_fp"])
)