From 471bd757992a273e3c898dc13bee36b01fb83e96 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 1 Jul 2013 17:54:39 +0200 Subject: [PATCH 1/2] ENH simplify the Scorer API A Scorer is now a function that returns a score that should be maximized. --- doc/modules/classes.rst | 4 +- doc/modules/model_evaluation.rst | 28 +-- sklearn/cross_validation.py | 6 +- sklearn/grid_search.py | 9 +- sklearn/metrics/__init__.py | 4 +- sklearn/metrics/scorer.py | 235 ++++++++++++++------ sklearn/metrics/tests/test_score_objects.py | 27 ++- sklearn/tests/test_cross_validation.py | 8 +- sklearn/tests/test_grid_search.py | 4 +- 9 files changed, 213 insertions(+), 112 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2dc6ae7b1c042..2af798bf539bd 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -671,9 +671,9 @@ Model Selection Interface ------------------------- .. autosummary:: :toctree: generated/ - :template: class_with_call.rst + :template: function.rst - metrics.Scorer + metrics.make_scorer Classification metrics ---------------------- diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index f6d5d5330f908..7c756ce8fbd0b 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -943,16 +943,16 @@ Creating scoring objects from score functions If you want to use a scoring function that takes additional parameters, such as :func:`fbeta_score`, you need to generate an appropriate scoring object. The simplest way to generate a callable object for scoring is by using -:class:`Scorer`. -:class:`Scorer` converts score functions as above into callables that can be +:func:`make_scorer`. +That function converts score functions as above into callables that can be used for model evaluation. One typical use case is to wrap an existing scoring function from the library with non default value for its parameters such as the beta parameter for the :func:`fbeta_score` function:: - >>> from sklearn.metrics import fbeta_score, Scorer - >>> ftwo_scorer = Scorer(fbeta_score, beta=2) + >>> from sklearn.metrics import fbeta_score, make_scorer + >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) >>> from sklearn.grid_search import GridSearchCV >>> from sklearn.svm import LinearSVC >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=ftwo_scorer) @@ -964,10 +964,10 @@ from a simple python function:: ... diff = np.abs(ground_truth - predictions).max() ... return np.log(1 + diff) ... - >>> my_custom_scorer = Scorer(my_custom_loss_func, greater_is_better=False) + >>> my_custom_scorer = make_scorer(my_custom_loss_func, greater_is_better=False) >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=my_custom_scorer) -:class:`Scorer` takes as parameters the function you want to use, whether it is +:func:`make_scorer` takes as parameters the function you want to use, whether it is a score (``greater_is_better=True``) or a loss (``greater_is_better=False``), whether the function you provided takes predictions as input (``needs_threshold=False``) or needs confidence scores @@ -978,22 +978,18 @@ the previous example. Implementing your own scoring object ------------------------------------ You can generate even more flexible model scores by constructing your own -scoring object from scratch, without using the :class:`Scorer` helper class. 
-The requirements that a callable can be used for model selection are as -follows: +scoring object from scratch, without using the :func:`make_scorer` factory. +For a callable to be a scorer, it needs to meet the protocol specified by +the following two rules: - It can be called with parameters ``(estimator, X, y)``, where ``estimator`` it the model that should be evaluated, ``X`` is validation data and ``y`` is the ground truth target for ``X`` (in the supervised case) or ``None`` in the unsupervised case. -- The call returns a number indicating the quality of estimator. - -- The callable has a boolean attribute ``greater_is_better`` which indicates whether - high or low values correspond to a better estimator. - -Objects that meet those conditions as said to implement the sklearn Scorer -protocol. +- It returns a floating point number that quantifies the quality of + ``estimator``'s predictions on ``X`` which reference to ``y``. + Again, higher numbers are better. .. _dummy_estimators: diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index f0c290435f049..16f9e239fbf02 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -24,7 +24,7 @@ from .utils.fixes import unique from .externals.joblib import Parallel, delayed from .externals.six import string_types, with_metaclass -from .metrics import SCORERS, Scorer +from .metrics import make_scorer, SCORERS __all__ = ['Bootstrap', 'KFold', @@ -1136,7 +1136,7 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, warnings.warn("Passing function as ``score_func`` is " "deprecated and will be removed in 0.15. " "Either use strings or score objects.", stacklevel=2) - scorer = Scorer(score_func) + scorer = make_scorer(score_func) elif isinstance(scoring, string_types): scorer = SCORERS[scoring] else: @@ -1299,7 +1299,7 @@ def permutation_test_score(estimator, X, y, scoring=None, cv=None, warnings.warn("Passing function as ``score_func`` is " "deprecated and will be removed in 0.15. " "Either use strings or score objects.") - scorer = Scorer(score_func) + scorer = make_scorer(score_func) elif isinstance(scoring, string_types): scorer = SCORERS[scoring] else: diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 8e00cda4d3c82..e9e010383866a 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -28,7 +28,7 @@ from .externals import six from .utils import safe_mask, check_random_state from .utils.validation import _num_samples, check_arrays -from .metrics import SCORERS, Scorer +from .metrics import make_scorer, SCORERS __all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', @@ -465,13 +465,13 @@ def _fit(self, X, y, parameter_iterable): "deprecated and will be removed in 0.15. " "Either use strings or score objects." "The relevant new parameter is called ''scoring''. ") - scorer = Scorer(self.loss_func, greater_is_better=False) + scorer = make_scorer(self.loss_func, greater_is_better=False) elif self.score_func is not None: warnings.warn("Passing function as ``score_func`` is " "deprecated and will be removed in 0.15. " "Either use strings or score objects." 
"The relevant new parameter is called ''scoring''.") - scorer = Scorer(self.score_func) + scorer = make_scorer(self.score_func) elif isinstance(self.scoring, six.string_types): scorer = SCORERS[self.scoring] else: @@ -539,9 +539,8 @@ def _fit(self, X, y, parameter_iterable): # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties - greater_is_better = getattr(self.scorer_, 'greater_is_better', True) best = sorted(cv_scores, key=lambda x: x.mean_validation_score, - reverse=greater_is_better)[0] + reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 4fa801c174886..c812d4e37a713 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -30,7 +30,7 @@ from .metrics import zero_one from .metrics import zero_one_score -from .scorer import Scorer, SCORERS +from .scorer import make_scorer, SCORERS from . import cluster from .cluster import (adjusted_rand_score, @@ -85,5 +85,5 @@ 'silhouette_samples', 'v_measure_score', 'zero_one_loss', - 'Scorer', + 'make_scorer', 'SCORERS'] diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index e3de557cc3ade..63d5b5958645c 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -3,7 +3,7 @@ interface for model selection and evaluation using arbitrary score functions. -A Scorer object is a callable that can be passed to +A scorer object is a callable that can be passed to :class:`sklearn.grid_search.GridSearchCV` or :func:`sklearn.cross_validation.cross_val_score` as the ``scoring`` parameter, to specify how a model should be evaluated. @@ -14,8 +14,11 @@ """ # Authors: Andreas Mueller +# Lars Buitinck # License: Simplified BSD +from abc import ABCMeta, abstractmethod + import numpy as np from . import (r2_score, mean_squared_error, accuracy_score, f1_score, @@ -23,15 +26,131 @@ recall_score) from .cluster import adjusted_rand_score +from ..externals import six + + +class _BaseScorer(six.with_metaclass(ABCMeta, object)): + def __init__(self, score_func, sign, kwargs): + self._kwargs = kwargs + self._score_func = score_func + self._sign = sign + + @abstractmethod + def __call__(self, estimator, X, y): + pass + + def __repr__(self): + kwargs_string = "".join([", %s=%s" % (str(k), str(v)) + for k, v in self._kwargs.items()]) + return ("make_scorer(%s%s%s%s)" + % (self._score_func.__name__, + "" if self._sign > 0 else ", greater_is_better=False", + self._factory_args(), kwargs_string)) + + def _factory_args(self): + """Return non-default make_scorer arguments for repr.""" + return "" + + +class _PredictScorer(_BaseScorer): + def __call__(self, estimator, X, y_true): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + estimator : object + Trained estimator to use for scoring. Must have a predict_proba + method; the output of that is used to compute the score. + + X : array-like or sparse matrix + Test data that will be fed to estimator.predict. + + y_true : array-like + Gold standard target values for X. + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + y_pred = estimator.predict(X) + return self._sign * self._score_func(y_true, y_pred, **self._kwargs) + + +class _ProbaScorer(_BaseScorer): + def _score(self, clf, X, y): + """Evaluate predicted probabilities for X relative to y_true. 
+ + Parameters + ---------- + clf : object + Trained classifier to use for scoring. Must have a predict_proba + method; the output of that is used to compute the score. + + X : array-like or sparse matrix + Test data that will be fed to clf.predict_proba. + + y : array-like + Gold standard target values for X. These must be class labels, + not probabilities. + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + y_pred = clf.predict_proba(X) + return self._sign * self._score_func(y, y_pred, **self._kwargs) + def _factory_args(self): + return ", needs_proba=True" -class Scorer(object): - """Flexible scores for any estimator. - This class wraps estimator scoring functions for the use in GridSearchCV +class _ThresholdScorer(_BaseScorer): + def __call__(self, clf, X, y): + """Evaluate decision function output for X relative to y_true. + + Parameters + ---------- + clf : object + Trained classifier to use for scoring. Must have either a + decision_function method or a predict_proba method; the output of + that is used to compute the score. + + X : array-like or sparse matrix + Test data that will be fed to clf.decision_function or + clf.predict_proba. + + y : array-like + Gold standard target values for X. These must be class labels, + not decision function values. + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + if len(np.unique(y)) > 2: + raise ValueError("This classification score only " + "supports binary classification.") + try: + y_pred = clf.decision_function(X).ravel() + except (NotImplementedError, AttributeError): + y_pred = clf.predict_proba(X)[:, 1] + return self._sign * self._score_func(y, y_pred, **self._kwargs) + + def _factory_args(self): + return ", needs_threshold=True" + + +def make_scorer(score_func, greater_is_better=True, needs_proba=False, + needs_threshold=False, **kwargs): + """Make a scorer from a performance metric or loss function. + + This factory function wraps scoring functions for use in GridSearchCV and cross_val_score. It takes a score function, such as ``accuracy_score``, ``mean_squared_error``, ``adjusted_rand_index`` or ``average_precision`` - and provides a call method. + and returns a callable that scores an estimator's output. Parameters ---------- @@ -41,93 +160,71 @@ class Scorer(object): greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, - or a loss function, meaning low is good. + or a loss function, meaning low is good. In the latter case, the + scorer object will sign-flip the outcome of the score_func. + + needs_proba : boolean, default=False + Whether score_func requires predict_proba to get probability estimates + out of a classifier. - needs_threshold : bool, default=False + needs_threshold : boolean, default=False Whether score_func takes a continuous decision certainty. + This only works for binary classification using estimators that + have either a decision_function or predict_proba method. + For example ``average_precision`` or the area under the roc curve - can not be computed using predictions alone, but need the output of - ``decision_function`` or ``predict_proba``. + can not be computed using discrete predictions alone. **kwargs : additional arguments Additional parameters to be passed to score_func. + Returns + ------- + scorer : callable + Callable object that returns a scalar score; greater is better. 
+ Examples -------- - >>> from sklearn.metrics import fbeta_score, Scorer - >>> ftwo_scorer = Scorer(fbeta_score, beta=2) + >>> from sklearn.metrics import fbeta_score, make_scorer + >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) + >>> ftwo_scorer + make_scorer(fbeta_score, beta=2) >>> from sklearn.grid_search import GridSearchCV >>> from sklearn.svm import LinearSVC >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, ... scoring=ftwo_scorer) """ - def __init__(self, score_func, greater_is_better=True, - needs_threshold=False, **kwargs): - self.score_func = score_func - self.greater_is_better = greater_is_better - self.needs_threshold = needs_threshold - self.kwargs = kwargs - - def __repr__(self): - kwargs_string = "".join([", %s=%s" % (str(k), str(v)) - for k, v in self.kwargs.items()]) - return ("Scorer(score_func=%s, greater_is_better=%s, needs_threshold=" - "%s%s)" % (self.score_func.__name__, self.greater_is_better, - self.needs_threshold, kwargs_string)) - - def __call__(self, estimator, X, y): - """Score X and y using the provided estimator. - - Parameters - ---------- - estimator : object - Trained estimator to use for scoring. - If ``needs_threshold`` is True, estimator needs - to provide ``decision_function`` or ``predict_proba``. - Otherwise, estimator needs to provide ``predict``. - - X : array-like or sparse matrix - Test data that will be scored by the estimator. - - y : array-like - True prediction for X. - - Returns - ------- - score : float - Score function applied to prediction of estimator on X. - """ - if self.needs_threshold: - if len(np.unique(y)) > 2: - raise ValueError("This classification score only " - "supports binary classification.") - try: - y_pred = estimator.decision_function(X).ravel() - except (NotImplementedError, AttributeError): - y_pred = estimator.predict_proba(X)[:, 1] - return self.score_func(y, y_pred, **self.kwargs) - else: - y_pred = estimator.predict(X) - return self.score_func(y, y_pred, **self.kwargs) + sign = 1 if greater_is_better else -1 + if needs_proba and needs_threshold: + raise ValueError("Set either needs_proba or needs_threshold to True," + " but not both.") + if needs_proba: + cls = _ProbaScorer + elif needs_threshold: + cls = _ThresholdScorer + else: + cls = _PredictScorer + return cls(score_func, sign, kwargs) # Standard regression scores -r2_scorer = Scorer(r2_score) -mse_scorer = Scorer(mean_squared_error, greater_is_better=False) +r2_scorer = make_scorer(r2_score) +mse_scorer = make_scorer(mean_squared_error, greater_is_better=False) # Standard Classification Scores -accuracy_scorer = Scorer(accuracy_score) -f1_scorer = Scorer(f1_score) +accuracy_scorer = make_scorer(accuracy_score) +f1_scorer = make_scorer(f1_score) # Score functions that need decision values -auc_scorer = Scorer(auc_score, greater_is_better=True, needs_threshold=True) -average_precision_scorer = Scorer(average_precision_score, - needs_threshold=True) -precision_scorer = Scorer(precision_score) -recall_scorer = Scorer(recall_score) +auc_scorer = make_scorer(auc_score, greater_is_better=True, + needs_threshold=True) +average_precision_scorer = make_scorer(average_precision_score, + needs_threshold=True) +precision_scorer = make_scorer(precision_score) +recall_scorer = make_scorer(recall_score) # Clustering scores -ari_scorer = Scorer(adjusted_rand_score) +ari_scorer = make_scorer(adjusted_rand_score) SCORERS = dict(r2=r2_scorer, mse=mse_scorer, accuracy=accuracy_scorer, f1=f1_scorer, roc_auc=auc_scorer, diff --git 
a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 7777f15de1a7e..33e724496f7b0 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -5,7 +5,7 @@ from sklearn.metrics import f1_score, r2_score, auc_score, fbeta_score from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.metrics import SCORERS, Scorer +from sklearn.metrics import make_scorer, SCORERS from sklearn.svm import LinearSVC from sklearn.cluster import KMeans from sklearn.linear_model import Ridge, LogisticRegression @@ -15,7 +15,14 @@ from sklearn.grid_search import GridSearchCV +def test_make_scorer(): + """Sanity check on the make_scorer factory function.""" + f = lambda *args: 0 + assert_raises(ValueError, make_scorer, f, needs_threshold=True, needs_proba=True) + + def test_classification_scores(): + """Test classification scorers.""" X, y = make_blobs(random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = LinearSVC(random_state=0) @@ -25,7 +32,7 @@ def test_classification_scores(): assert_almost_equal(score1, score2) # test fbeta score that takes an argument - scorer = Scorer(fbeta_score, beta=2) + scorer = make_scorer(fbeta_score, beta=2) score1 = scorer(clf, X_test, y_test) score2 = fbeta_score(y_test, clf.predict(X_test), beta=2) assert_almost_equal(score1, score2) @@ -39,7 +46,8 @@ def test_classification_scores(): repr(fbeta_score) -def test_regression_scores(): +def test_regression_scorers(): + """Test regression scorers.""" diabetes = load_diabetes() X, y = diabetes.data, diabetes.target X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) @@ -50,7 +58,8 @@ def test_regression_scores(): assert_almost_equal(score1, score2) -def test_thresholded_scores(): +def test_thresholded_scorers(): + """Test scorers that take thresholds.""" X, y = make_blobs(random_state=0, centers=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = LogisticRegression(random_state=0) @@ -75,9 +84,9 @@ def test_thresholded_scores(): assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test) -def test_unsupervised_scores(): - # test clustering where there is some true y. - # We don't have any real unsupervised SCORERS yet +def test_unsupervised_scorers(): + """Test clustering scorers against gold standard labeling.""" + # We don't have any real unsupervised Scorers yet. X, y = make_blobs(random_state=0, centers=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) km = KMeans(n_clusters=3) @@ -88,9 +97,9 @@ def test_unsupervised_scores(): def test_raises_on_score_list(): - # test that when a list of scores is returned, we raise proper errors. 
+ """Test that when a list of scores is returned, we raise proper errors.""" X, y = make_blobs(random_state=0) - f1_scorer_no_average = Scorer(f1_score, average=None) + f1_scorer_no_average = make_scorer(f1_score, average=None) clf = DecisionTreeClassifier() assert_raises(ValueError, cross_val_score, clf, X, y, scoring=f1_scorer_no_average) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index ab9f34a20238f..ec388f16177a1 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -24,7 +24,7 @@ from sklearn.metrics import f1_score from sklearn.metrics import explained_variance_score from sklearn.metrics import fbeta_score -from sklearn.metrics import Scorer +from sklearn.metrics import make_scorer from sklearn.externals import six from sklearn.linear_model import Ridge @@ -397,9 +397,9 @@ def test_cross_val_score_with_score_func_regression(): r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5) assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) - # Mean squared error + # Mean squared error; this is a loss function, so "scores" are negative mse_scores = cval.cross_val_score(reg, X, y, cv=5, scoring="mse") - expected_mse = np.array([763.07, 553.16, 274.38, 273.26, 1681.99]) + expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]) assert_array_almost_equal(mse_scores, expected_mse, 2) # Explained variance @@ -428,7 +428,7 @@ def test_permutation_score(): assert_true(pvalue_label == pvalue) # test with custom scoring object - scorer = Scorer(fbeta_score, beta=2) + scorer = make_scorer(fbeta_score, beta=2) score_label, _, pvalue_label = cval.permutation_test_score( svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size), random_state=0) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index 074e1cc1439ca..e941485f626f7 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -31,7 +31,7 @@ from sklearn.svm import LinearSVC, SVC from sklearn.cluster import KMeans, MeanShift from sklearn.metrics import f1_score -from sklearn.metrics import Scorer +from sklearn.metrics import make_scorer from sklearn.cross_validation import KFold, StratifiedKFold @@ -331,7 +331,7 @@ def test_grid_search_sparse_scoring(): # test loss where greater is worse def f1_loss(y_true_, y_pred_): return -f1_score(y_true_, y_pred_) - F1Loss = Scorer(f1_loss, greater_is_better=False) + F1Loss = make_scorer(f1_loss, greater_is_better=False) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss) cv.fit(X_[:180], y_[:180]) y_pred3 = cv.predict(X_[180:]) From 932f0bf0ed855cdb6d9b2d4447d2f14b0e54992a Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 1 Jul 2013 17:54:39 +0200 Subject: [PATCH 2/2] ENH f_scorer that returns multiple values + support for that Added a report method to GridSearchCV to use it. 
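
A scorer may now return either a single float or a tuple whose first
element is a float: the first element is what the parameter search
maximizes, while the full value is kept per fold and printed by the new
``report`` method. A rough sketch of the intended usage (the toy scorer,
dataset and parameter grid below are illustrative only, not part of this
patch):

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import LinearSVC

    def acc_and_errors(estimator, X, y):
        # The first element drives model selection; the rest is extra
        # per-fold information surfaced by GridSearchCV.report().
        y_pred = estimator.predict(X)
        return (np.mean(y_pred == y), int(np.sum(y_pred != y)))

    X, y = make_blobs(random_state=0)
    grid = GridSearchCV(LinearSVC(), {'C': [0.1, 1, 10]},
                        scoring=acc_and_errors)
    grid.fit(X, y)
    print(grid.report())  # report(file) writes to a file-like object instead

A namedtuple with a custom ``__str__``, like the ``_FPR`` type used by the
built-in "f1" scorer in this patch, gives a nicer per-fold display than a
plain tuple.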
--- doc/modules/model_evaluation.rst | 6 +- .../grid_search_text_feature_extraction.py | 23 ++++--- sklearn/grid_search.py | 61 +++++++++++++++---- sklearn/metrics/scorer.py | 54 ++++++++++++++-- sklearn/metrics/tests/test_score_objects.py | 4 +- 5 files changed, 118 insertions(+), 30 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7c756ce8fbd0b..5354d5b713370 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -987,9 +987,9 @@ the following two rules: the ground truth target for ``X`` (in the supervised case) or ``None`` in the unsupervised case. -- It returns a floating point number that quantifies the quality of - ``estimator``'s predictions on ``X`` which reference to ``y``. - Again, higher numbers are better. +- It returns either a floating point number (the score), or a tuple, the + first element of which is a float. The additional values are used by the + ``report`` method on ``GridSearchCV`` and ``RandomizedSearchCV``. .. _dummy_estimators: diff --git a/examples/grid_search_text_feature_extraction.py b/examples/grid_search_text_feature_extraction.py index 4d3456b9acac6..05d2d474748ba 100644 --- a/examples/grid_search_text_feature_extraction.py +++ b/examples/grid_search_text_feature_extraction.py @@ -29,18 +29,16 @@ 'vect__max_features': (None, 5000, 10000, 50000)} done in 1737.030s - Best score: 0.940 + Best score: 0.923 Best parameters set: - clf__alpha: 9.9999999999999995e-07 - clf__n_iter: 50 - clf__penalty: 'elasticnet' - tfidf__use_idf: True - vect__max_n: 2 - vect__max_df: 0.75 - vect__max_features: 50000 + clf__alpha: 1e-06 + clf__penalty: 'l2' + vect__max_df: 1.0 + vect__ngram_range: (1, 2) """ + # Author: Olivier Grisel # Peter Prettenhofer # Mathieu Blondel @@ -49,6 +47,7 @@ from __future__ import print_function from pprint import pprint +import sys from time import time import logging @@ -111,7 +110,8 @@ # find the best parameters for both the feature extraction and the # classifier - grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) + grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, + scoring="f1") print("Performing grid search...") print("pipeline:", [name for name, _ in pipeline.steps]) @@ -127,3 +127,8 @@ best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name])) + + # Uncomment the following line to get a detailed (and long!) report + # about the cross-validation results, including precision and recall + # per fold for all settings. 
+ #grid_search.report(sys.stdout) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index e9e010383866a..57409cd940e81 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -11,7 +11,7 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod -from collections import Mapping, namedtuple, Sized +from collections import Mapping, namedtuple, Sequence, Sized from functools import partial, reduce from itertools import product import numbers @@ -316,8 +316,10 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, else: this_score = clf.score(X_test) - if not isinstance(this_score, numbers.Number): - raise ValueError("scoring must return a number, got %s (%s)" + if not isinstance(this_score, numbers.Number) \ + and not (isinstance(this_score, Sequence) + and isinstance(this_score[0], numbers.Number)): + raise ValueError("scoring must return a number or tuple, got %s (%s)" " instead." % (str(this_score), type(this_score))) if verbose > 2: @@ -364,10 +366,17 @@ class _CVScoreTuple (namedtuple('_CVScoreTuple', def __repr__(self): """Simple custom repr to summarize the main info""" + std = np.std([sc if isinstance(sc, numbers.Number) else sc[0] + for sc in self.cv_validation_scores]) + return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format( - self.mean_validation_score, - np.std(self.cv_validation_scores), - self.parameters) + self.mean_validation_score, std, self.parameters) + + def __str__(self): + """More extensive reporting than from repr.""" + per_fold = ("\n fold {0}: {1}".format(i, sc) + for i, sc in enumerate(self.cv_validation_scores)) + return repr(self) + "".join(per_fold) class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, @@ -392,6 +401,33 @@ def __init__(self, estimator, scoring=None, loss_func=None, self.pre_dispatch = pre_dispatch self._check_estimator() + def report(self, file=None): + """Generate a report of the scores achieved. + + Reports on the scores achieved across the folds for the various + parameter settings tried. This also prints the additional information + reported by some scorers, such as "f1", which tracks precision and + recall as well. + + Parameters + ---------- + file : file-like, optional + File to which the report is written. If None or not given, the + report is returned as a string. + """ + if not hasattr(self, "cv_scores_"): + raise AttributeError("no cv_scores_ found; run fit first") + + return_string = (file is None) + if return_string: + file = six.StringIO() + + for cvs in self.cv_scores_: + print(cvs, file=file) + + if return_string: + return file.getvalue() + def score(self, X, y=None): """Returns the score on the given test data and labels, if the search estimator has been refit. The ``score`` function of the best estimator @@ -507,7 +543,7 @@ def _fit(self, X, y, parameter_iterable): for parameters in parameter_iterable for train, test in cv) - # Out is a list of triplet: score, estimator, n_test_samples + # Out is a list of triples: score, estimator, n_test_samples n_fits = len(out) n_folds = len(cv) @@ -519,7 +555,11 @@ def _fit(self, X, y, parameter_iterable): all_scores = [] for this_score, parameters, this_n_test_samples in \ out[grid_start:grid_start + n_folds]: - all_scores.append(this_score) + full_info = this_score + if isinstance(this_score, Sequence): + # Structured score. 
+ this_score = this_score[0] + all_scores.append(full_info) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples @@ -530,10 +570,7 @@ def _fit(self, X, y, parameter_iterable): score /= float(n_folds) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? - cv_scores.append(_CVScoreTuple( - parameters, - score, - np.array(all_scores))) + cv_scores.append(_CVScoreTuple(parameters, score, all_scores)) # Store the computed scores self.cv_scores_ = cv_scores diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 63d5b5958645c..881aeb9d8182d 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -18,12 +18,13 @@ # License: Simplified BSD from abc import ABCMeta, abstractmethod +from collections import namedtuple import numpy as np -from . import (r2_score, mean_squared_error, accuracy_score, f1_score, +from . import (r2_score, mean_squared_error, accuracy_score, auc_score, average_precision_score, precision_score, - recall_score) + recall_score, precision_recall_fscore_support) from .cluster import adjusted_rand_score from ..externals import six @@ -211,9 +212,52 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, r2_scorer = make_scorer(r2_score) mse_scorer = make_scorer(mean_squared_error, greater_is_better=False) -# Standard Classification Scores +# Standard classification scores accuracy_scorer = make_scorer(accuracy_score) -f1_scorer = make_scorer(f1_score) + + +class _FPR(namedtuple("fpr", ["f_score", "precision", "recall"])): + __slots__ = () + + def __str__(self): + return ("F = {0:.4f}, precision = {1:.4f}, recall = {2:.4f}" + .format(self.f_score, self.precision, self.recall)) + + +def f_scorer(clf, X, y_true, beta=1.): + """Evaluate a classifier's predictions for X according to F1/F-beta score. + + Parameters + ---------- + clf : object + Trained classifier to evaluate. + + X : array-like or sparse matrix + Test data that will be fed to clf.predict. + + y_true : array-like + Gold standard target values for X. + + beta : float, optional + The strength of recall versus precision in the F-score. + + Returns + ------- + (fscore, precision, recall) : tuple of floats + F-score and the scores it is based on for estimator's predictions on X + relative to y_true. When this scorer is used inside GridSearchCV or + similar, the first value is used for optimization. 
+ + See also + -------- + sklearn.metrics.precision_recall_fscore_support + """ + # TODO support the various weightings for precision_recall_fscore_support + # offers + p, r, f, _ = precision_recall_fscore_support(y_true, clf.predict(X), + beta, average="weighted") + return _FPR(f, p, r) + # Score functions that need decision values auc_scorer = make_scorer(auc_score, greater_is_better=True, @@ -227,7 +271,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, ari_scorer = make_scorer(adjusted_rand_score) SCORERS = dict(r2=r2_scorer, mse=mse_scorer, accuracy=accuracy_scorer, - f1=f1_scorer, roc_auc=auc_scorer, + f1=f_scorer, roc_auc=auc_scorer, average_precision=average_precision_scorer, precision=precision_scorer, recall=recall_scorer, ari=ari_scorer) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 33e724496f7b0..ca2dbdacf4d27 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -27,7 +27,9 @@ def test_classification_scores(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = LinearSVC(random_state=0) clf.fit(X_train, y_train) - score1 = SCORERS['f1'](clf, X_test, y_test) + + # F1 returns multiple values + score1 = SCORERS['f1'](clf, X_test, y_test)[0] score2 = f1_score(y_test, clf.predict(X_test)) assert_almost_equal(score1, score2)
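
A minimal usage sketch of the simplified scorer API from PATCH 1/2: any
callable with the signature ``(estimator, X, y) -> float`` can be passed as
``scoring``, and ``make_scorer(..., greater_is_better=False)`` flips the sign
of a loss so that higher is always better. The estimator and dataset match
``test_regression_scorers`` and are purely illustrative:

    import numpy as np

    from sklearn.cross_validation import cross_val_score
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import Ridge
    from sklearn.metrics import make_scorer, mean_squared_error

    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target

    # A scorer is just a callable (estimator, X, y) -> float; higher must
    # mean better, so this loss-like quantity is returned sign-flipped.
    def neg_max_error(estimator, X, y):
        return -np.abs(estimator.predict(X) - y).max()

    print(cross_val_score(Ridge(), X, y, scoring=neg_max_error, cv=5))

    # make_scorer applies the same convention to an existing metric: with
    # greater_is_better=False it negates mean_squared_error, which is why
    # the expected "mse" values in test_cross_validation.py are negative.
    neg_mse = make_scorer(mean_squared_error, greater_is_better=False)
    print(cross_val_score(Ridge(), X, y, scoring=neg_mse, cv=5))

Because every scorer now returns a value to maximize, GridSearchCV no longer
needs a ``greater_is_better`` attribute on the scorer and can always rank
candidates with ``reverse=True``.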