diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 84e7c98e29324..0d55c9eb7da40 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -35,6 +35,7 @@
 from ._classification import zero_one_loss
 from ._classification import brier_score_loss
 from ._classification import multilabel_confusion_matrix
+from ._classification import multiclass_brier_score_loss
 
 from . import cluster
 from .cluster import adjusted_mutual_info_score
@@ -139,6 +140,7 @@
     'median_absolute_error',
     'mean_absolute_percentage_error',
     'multilabel_confusion_matrix',
+    'multiclass_brier_score_loss',
     'mutual_info_score',
     'ndcg_score',
     'normalized_mutual_info_score',
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 44a84e125727b..ec86f76f057b1 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -128,6 +128,99 @@ def _check_targets(y_true, y_pred):
     return y_type, y_true, y_pred
 
 
+def _validate_multiclass_probabilistic_prediction(y_true, y_prob,
+                                                  sample_weight, labels):
+    r"""Convert y_true and y_prob to shape [n_samples, n_classes].
+
+    1. Verify that y_true, y_prob, and sample_weight have the same first
+       dimension.
+    2. Ensure that y_true contains two or more classes, i.e. a valid
+       classification task. The classes are provided by the labels
+       argument, or inferred from y_true. When inferring, y_true is
+       assumed to be binary if it has shape (n_samples,).
+    3. Validate that y_true and y_prob have the same number of classes.
+       Convert both to shape [n_samples, n_classes].
+
+    Parameters
+    ----------
+    y_true : array-like or label indicator matrix
+        Ground truth (correct) labels for n_samples samples.
+
+    y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,)
+        Predicted probabilities, as returned by a classifier's
+        predict_proba method. If ``y_prob.shape = (n_samples,)``
+        the probabilities provided are assumed to be those of the
+        positive class. The labels in ``y_prob`` are assumed to be
+        ordered lexicographically, as done by
+        :class:`preprocessing.LabelBinarizer`.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    labels : array-like, default=None
+        If not provided, labels will be inferred from y_true. If ``labels``
+        is ``None`` and ``y_prob`` has shape (n_samples,) the labels are
+        assumed to be binary and are inferred from ``y_true``.
+
+    Returns
+    -------
+    transformed_labels : array of shape [n_samples, n_classes]
+
+    y_prob : array of shape [n_samples, n_classes]
+    """
+    y_prob = check_array(y_prob, ensure_2d=False)
+    check_consistent_length(y_prob, y_true, sample_weight)
+
+    lb = LabelBinarizer()
+
+    if labels is not None:
+        lb = lb.fit(labels)
+        # LabelBinarizer does not respect the order implied by labels, which
+        # can be misleading.
+        if not np.all(lb.classes_ == labels):
+            warnings.warn(f"Labels passed were {labels}. But this function "
+                          f"assumes labels are ordered lexicographically. "
+                          f"Ensure that labels in y_prob are ordered as "
+                          f"{lb.classes_}.", UserWarning)
+    else:
+        lb = lb.fit(y_true)
+
+    if len(lb.classes_) == 1:
+        if labels is None:
+            raise ValueError(f'y_true contains only one label: '
+                             f'{lb.classes_[0]}. Please provide the true '
+                             f'labels explicitly through the labels argument.')
+        else:
+            raise ValueError(f'The labels array needs to contain at least two '
+                             f'labels, got {lb.classes_}.')
+
+    transformed_labels = lb.transform(y_true)
+
+    if transformed_labels.shape[1] == 1:
+        transformed_labels = np.append(1 - transformed_labels,
+                                       transformed_labels, axis=1)
+
+    # If y_prob is of single dimension, assume y_true to be binary
+    if y_prob.ndim == 1:
+        y_prob = y_prob[:, np.newaxis]
+    if y_prob.shape[1] == 1:
+        y_prob = np.append(1 - y_prob, y_prob, axis=1)
+
+    # Check if dimensions are consistent.
+    transformed_labels = check_array(transformed_labels)
+    if len(lb.classes_) != y_prob.shape[1]:
+        if labels is None:
+            raise ValueError(f"y_true and y_prob contain different number of "
+                             f"classes {transformed_labels.shape[1]}, "
+                             f"{y_prob.shape[1]}. Please provide the true "
+                             f"labels explicitly through the labels argument. "
+                             f"Classes found in y_true: {lb.classes_}")
+        else:
+            raise ValueError(f'The number of classes in labels is different '
+                             f'from that in y_prob. Classes found in '
+                             f'labels: {lb.classes_}')
+
+    return transformed_labels, y_prob
+
+
 def _weighted_sum(sample_score, sample_weight, normalize=False):
     if normalize:
         return np.average(sample_score, weights=sample_weight)
@@ -2222,58 +2315,13 @@ def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None,
     C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
     p. 209.
     """
-    y_pred = check_array(y_pred, ensure_2d=False)
-    check_consistent_length(y_pred, y_true, sample_weight)
-
-    lb = LabelBinarizer()
-
-    if labels is not None:
-        lb.fit(labels)
-    else:
-        lb.fit(y_true)
-
-    if len(lb.classes_) == 1:
-        if labels is None:
-            raise ValueError('y_true contains only one label ({0}). Please '
-                             'provide the true labels explicitly through the '
-                             'labels argument.'.format(lb.classes_[0]))
-        else:
-            raise ValueError('The labels array needs to contain at least two '
-                             'labels for log_loss, '
-                             'got {0}.'.format(lb.classes_))
-
-    transformed_labels = lb.transform(y_true)
-
-    if transformed_labels.shape[1] == 1:
-        transformed_labels = np.append(1 - transformed_labels,
-                                       transformed_labels, axis=1)
+    transformed_labels, y_pred = _validate_multiclass_probabilistic_prediction(
+        y_true, y_pred, sample_weight, labels
+    )
 
     # Clipping
     y_pred = np.clip(y_pred, eps, 1 - eps)
 
-    # If y_pred is of single dimension, assume y_true to be binary
-    # and then check.
-    if y_pred.ndim == 1:
-        y_pred = y_pred[:, np.newaxis]
-    if y_pred.shape[1] == 1:
-        y_pred = np.append(1 - y_pred, y_pred, axis=1)
-
-    # Check if dimensions are consistent.
-    transformed_labels = check_array(transformed_labels)
-    if len(lb.classes_) != y_pred.shape[1]:
-        if labels is None:
-            raise ValueError("y_true and y_pred contain different number of "
-                             "classes {0}, {1}. Please provide the true "
-                             "labels explicitly through the labels argument. "
-                             "Classes found in "
-                             "y_true: {2}".format(transformed_labels.shape[1],
-                                                  y_pred.shape[1],
-                                                  lb.classes_))
-        else:
-            raise ValueError('The number of classes in labels is different '
-                             'from that in y_pred. Classes found in '
-                             'labels: {0}'.format(lb.classes_))
-
     # Renormalize
     y_pred /= y_pred.sum(axis=1)[:, np.newaxis]
     loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
@@ -2425,6 +2473,12 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
         the greater label unless `y_true` is all 0 or all -1, in which
         case `pos_label` defaults to 1.
 
+    A more general form of the Brier score is implemented in
+    :func:`multiclass_brier_score_loss`, which also applies to the
+    multi-class case. When used on a binary problem,
+    `multiclass_brier_score_loss` returns a Brier score that is exactly
+    twice the value returned by this function.
+
     Read more in the :ref:`User Guide <brier_score_loss>`.
 
     Parameters
@@ -2484,7 +2538,8 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
     if y_type != "binary":
         raise ValueError(
             f"Only binary classification is supported. The type of the target "
-            f"is {y_type}."
+            f"is {y_type}. For the multiclass case, use "
+            f"multiclass_brier_score_loss instead."
         )
 
     if y_prob.max() > 1:
@@ -2504,3 +2559,94 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
         raise
     y_true = np.array(y_true == pos_label, int)
     return np.average((y_true - y_prob) ** 2, weights=sample_weight)
+
+
+def multiclass_brier_score_loss(y_true, y_prob, *, sample_weight=None,
+                                labels=None):
+    r"""Compute the multi-class Brier score loss.
+
+    The smaller the Brier score loss, the better, hence the naming with
+    "loss". The Brier score measures the mean squared difference between
+    the predicted probability and the actual outcome.
+
+    For :math:`N` samples with :math:`C` different classes, the multi-class
+    Brier score is defined as:
+
+    .. math::
+        \frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(y_{ic} - \hat{y}_{ic})^{2}
+
+    where :math:`y_{ic}` is 1 if observation `i` belongs to class `c` and 0
+    otherwise, and :math:`\hat{y}_{ic}` is the predicted probability of
+    observation `i` for class `c`. The predicted probabilities for the
+    :math:`C` classes of observation `i` should sum to 1.
+
+    The Brier score always takes a value in the interval [0, 2]. For the
+    binary case, however, there is a more common definition of the Brier
+    score, implemented in :func:`brier_score_loss`, that is exactly half
+    the value returned by this function and therefore lies in [0, 1].
+
+    It can be decomposed as the sum of refinement loss and calibration loss.
+
+    The Brier score is appropriate for binary and categorical outcomes that
+    can be structured as true or false, but is inappropriate for ordinal
+    variables which can take on three or more values (this is because the
+    Brier score assumes that all possible outcomes are equivalently
+    "distant" from one another).
+
+    Read more in the :ref:`User Guide <brier_score_loss>`.
+
+    Parameters
+    ----------
+    y_true : array of shape (n_samples,)
+        True targets.
+
+    y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,)
+        Predicted probabilities, as returned by a classifier's
+        predict_proba method. If ``y_prob.shape = (n_samples,)``
+        the probabilities provided are assumed to be those of the
+        positive class. The labels in ``y_prob`` are assumed to be
+        ordered lexicographically, as done by
+        :class:`preprocessing.LabelBinarizer`.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    labels : array-like, default=None
+        If not provided, labels will be inferred from y_true. If ``labels``
+        is ``None`` and ``y_prob`` has shape (n_samples,) the labels are
+        assumed to be binary and are inferred from ``y_true``.
+
+    Returns
+    -------
+    score : float
+        Brier score loss.
+
+    Examples
+    --------
+    >>> from sklearn.metrics import multiclass_brier_score_loss
+    >>> multiclass_brier_score_loss([0, 1, 1, 0],
+    ...                             [0.1, 0.9, 0.8, 0.3])
+    0.074...
+    >>> multiclass_brier_score_loss(['eggs', 'ham', 'spam'], [[.8, .1, .1],
+    ...                                                       [.2, .7, .1],
+    ...                                                       [.2, .2, .6]])
+    0.146...
+
+    References
+    ----------
+    .. [1] `Wikipedia entry for the Brier score
+            <https://en.wikipedia.org/wiki/Brier_score>`_.
+    """
+    y_true = column_or_1d(y_true)
+
+    transformed_labels, y_prob = _validate_multiclass_probabilistic_prediction(
+        y_true, y_prob, sample_weight, labels
+    )
+
+    if y_prob.max() > 1:
+        raise ValueError("y_prob contains values greater than 1.")
+    if y_prob.min() < 0:
+        raise ValueError("y_prob contains values less than 0.")
+
+    return np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1),
+                      weights=sample_weight)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index c32e9c89ada47..7b70fb73b9823 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -4,6 +4,7 @@
 from itertools import chain
 from itertools import permutations
 import warnings
+import re
 
 import numpy as np
 from scipy import linalg
@@ -44,6 +45,7 @@
 from sklearn.metrics import zero_one_loss
 from sklearn.metrics import brier_score_loss
 from sklearn.metrics import multilabel_confusion_matrix
+from sklearn.metrics import multiclass_brier_score_loss
 
 from sklearn.metrics._classification import _check_targets
 from sklearn.exceptions import UndefinedMetricWarning
@@ -2257,7 +2259,7 @@ def test_log_loss():
     y_true = [2, 2]
     y_pred = [[0.2, 0.7], [0.6, 0.5]]
     y_score = np.array([[0.1, 0.9], [0.1, 0.9]])
-    error_str = (r'y_true contains only one label \(2\). Please provide '
+    error_str = (r'y_true contains only one label: 2. Please provide '
                  r'the true labels explicitly through the labels argument.')
     with pytest.raises(ValueError, match=error_str):
         log_loss(y_true, y_pred)
@@ -2297,6 +2299,21 @@ def test_log_loss_pandas_input():
         assert_almost_equal(loss, 1.0383217, decimal=6)
 
 
+def test_log_loss_warnings():
+    assert_warns_message(
+        UserWarning,
+        "Labels passed were ['spam', 'eggs', 'ham']. But this function "
+        "assumes labels are ordered lexicographically. "
+        "Ensure that labels in y_prob are ordered as "
+        "['eggs' 'ham' 'spam'].",
+        log_loss,
+        ['eggs', 'spam', 'ham'],
+        [[1, 0, 0],
+         [0, 1, 0],
+         [0, 0, 1]],
+        labels=['spam', 'eggs', 'ham'])
+
+
 def test_brier_score_loss():
     # Check brier_score_loss function
     y_true = np.array([0, 1, 1, 0, 1, 1])
@@ -2319,12 +2336,10 @@ def test_brier_score_loss():
     # ensure to raise an error for multiclass y_true
     y_true = np.array([0, 1, 2, 0])
     y_pred = np.array([0.8, 0.6, 0.4, 0.2])
-    error_message = (
-        "Only binary classification is supported. The type of the target is "
-        "multiclass"
-    )
-
-    with pytest.raises(ValueError, match=error_message):
+    error_message = ("Only binary classification is supported. The type of "
+                     "the target is multiclass. For the multiclass case, use "
+                     "multiclass_brier_score_loss instead.")
+    with pytest.raises(ValueError, match=re.escape(error_message)):
         brier_score_loss(y_true, y_pred)
 
     # calculate correctly when there's only one class in y_true
@@ -2337,6 +2352,124 @@ def test_brier_score_loss():
         brier_score_loss(['foo'], [0.4], pos_label='foo'), 0.36)
 
 
+def test_multiclass_brier_score_loss():
+    # test cases for the binary case
+    y_true = np.array([0, 1, 1, 0, 1, 1])
+    y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95])
+
+    assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred),
+                        .05083333)
+    # check brier_score_loss and multiclass_brier_score_loss are consistent
+    assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred),
+                        brier_score_loss(y_true, y_pred) * 2)
+
+    # test cases for the multi-class case
+    assert_almost_equal(
+        multiclass_brier_score_loss(['eggs', 'spam', 'ham'],
+                                    [[1, 0, 0, 0],
+                                     [0, 1, 0, 0],
+                                     [0, 1, 0, 0]],
+                                    labels=['eggs', 'ham', 'spam', 'yams']),
+        2/3)
+
+    assert_almost_equal(
+        multiclass_brier_score_loss([1, 0, 2],
+                                    [[0.2, 0.7, 0.1],
+                                     [0.6, 0.2, 0.2],
+                                     [0.6, 0.1, 0.3]]),
+        .41333333)
+
+    # check perfect predictions for 2 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 0, 1, 1],
+                                                    [0., 0., 1., 1.]),
+                        0)
+
+    # check perfect predictions for 3 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 1, 2],
+                                                    [[1., 0., 0.],
+                                                     [0., 1., 0.],
+                                                     [0., 0., 1.]]),
+                        0)
+
+    # check perfectly incorrect predictions for 2 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 0, 1, 1],
+                                                    [1., 1., 0., 0.]),
+                        2)
+
+    # check perfectly incorrect predictions for 3 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 1, 2],
+                                                    [[0., 1., 0.],
+                                                     [1., 0., 0.],
+                                                     [1., 0., 0.]]),
+                        2)
+
+
+def test_multiclass_brier_score_loss_invalid_inputs():
+    y_true = np.array([0, 1, 1, 0, 1, 1])
+    y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95])
+
+    with pytest.raises(ValueError):
+        # bad length of y_pred
+        multiclass_brier_score_loss(y_true, y_pred[1:])
+    with pytest.raises(ValueError):
+        # y_pred has a value greater than 1
+        multiclass_brier_score_loss(y_true, y_pred + 1.)
+    with pytest.raises(ValueError):
+        # y_pred has a value less than 0
+        multiclass_brier_score_loss(y_true, y_pred - 1.)
+
+    # ensure an error is raised for the wrong number of classes
+    y_true = np.array([0, 1, 2, 0])
+    y_pred = np.array([0.8, 0.6, 0.4, 0.2])
+    error_message = ("y_true and y_prob contain different number of "
+                     "classes 3, 2. Please provide the true "
+                     "labels explicitly through the labels argument. "
+                     "Classes found in "
+                     "y_true: [0 1 2]")
+    with pytest.raises(ValueError, match=re.escape(error_message)):
+        multiclass_brier_score_loss(y_true, y_pred)
+
+    y_true = ['eggs', 'spam', 'ham']
+    y_pred = [[1, 0, 0],
+              [0, 1, 0],
+              [0, 1, 0]]
+    labels = ['eggs', 'spam', 'ham', 'yams']
+    error_message = ("The number of classes in labels is different "
+                     "from that in y_prob. Classes found in "
+                     "labels: ['eggs' 'ham' 'spam' 'yams']")
+    with pytest.raises(ValueError, match=re.escape(error_message)):
+        multiclass_brier_score_loss(y_true, y_pred, labels=labels)
+
+    # raise an error when there's only one class in y_true
+    y_true = ['eggs']
+    y_pred = [.1]
+    error_message = (f'y_true contains only one label: {y_true[0]}. Please '
+                     f'provide the true labels explicitly through the '
+                     f'labels argument.')
+    with pytest.raises(ValueError, match=re.escape(error_message)):
+        multiclass_brier_score_loss(y_true, y_pred)
+
+    # error is fixed when labels is specified
+    assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred,
+                                                    labels=['eggs', 'ham']),
+                        .02)
+
+
+def test_multiclass_brier_score_loss_warnings():
+    assert_warns_message(
+        UserWarning,
+        "Labels passed were ['spam', 'eggs', 'ham']. But this function "
+        "assumes labels are ordered lexicographically. "
+        "Ensure that labels in y_prob are ordered as "
+        "['eggs' 'ham' 'spam'].",
+        multiclass_brier_score_loss,
+        ['eggs', 'spam', 'ham'],
+        [[1, 0, 0],
+         [0, 1, 0],
+         [0, 0, 1]],
+        labels=['spam', 'eggs', 'ham'])
+
+
 def test_balanced_accuracy_score_unseen():
     assert_warns_message(UserWarning, 'y_pred contains classes not in y_true',
                          balanced_accuracy_score, [0, 0, 0], [0, 0, 1])
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index 6688ddc2aa834..f0e1b37dcf244 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -483,7 +483,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs):
     "unnormalized_multilabel_confusion_matrix",
 
     "macro_f0.5_score", "macro_f2_score", "macro_precision_score",
-    "macro_recall_score", "log_loss", "hinge_loss",
+    "macro_recall_score", "hinge_loss",
     "mean_gamma_deviance", "mean_poisson_deviance",
     "mean_compound_poisson_deviance", "mean_absolute_percentage_error"
 }
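
For a quick sanity check of the new API, here is a minimal usage sketch in
Python. It assumes a scikit-learn build with this patch applied; the expected
values follow the docstring examples and the binary consistency property
documented above:

    import numpy as np
    from sklearn.metrics import brier_score_loss, multiclass_brier_score_loss

    # Binary case: y_prob holds probabilities of the positive class, as in
    # clf.predict_proba(X)[:, 1].
    y_true = np.array([0, 1, 1, 0])
    y_prob = np.array([0.1, 0.9, 0.8, 0.3])

    # The generalized score is exactly twice the classic binary Brier score.
    assert np.isclose(multiclass_brier_score_loss(y_true, y_prob),
                      2 * brier_score_loss(y_true, y_prob))

    # Multi-class case: each row of y_prob sums to 1 and the columns follow
    # the lexicographic label order ['eggs', 'ham', 'spam'].
    score = multiclass_brier_score_loss(['eggs', 'ham', 'spam'],
                                        [[.8, .1, .1],
                                         [.2, .7, .1],
                                         [.2, .2, .6]])
    print(score)  # ~0.147, within the metric's [0, 2] range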