diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 59f014b732e35..7caacd697ea1c 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2826,6 +2826,51 @@ Here are some usage examples of the :func:`d2_absolute_error_score` function::
 
 |details-end|
 
+|details-start|
+**D² log loss score**
+|details-split|
+
+The :func:`d2_log_loss_score` function implements the special case
+of D² with the log loss, see :ref:`log_loss`, i.e.:
+
+.. math::
+
+  \text{dev}(y, \hat{y}) = \text{log_loss}(y, \hat{y}).
+
+The :math:`y_{\text{null}}` for the :func:`log_loss` is the per-class
+proportion.
+
+Here are some usage examples of the :func:`d2_log_loss_score` function::
+
+  >>> from sklearn.metrics import d2_log_loss_score
+  >>> y_true = [1, 1, 2, 3]
+  >>> y_pred = [
+  ...     [0.5, 0.25, 0.25],
+  ...     [0.5, 0.25, 0.25],
+  ...     [0.5, 0.25, 0.25],
+  ...     [0.5, 0.25, 0.25],
+  ... ]
+  >>> d2_log_loss_score(y_true, y_pred)
+  0.0
+  >>> y_true = [1, 2, 3]
+  >>> y_pred = [
+  ...     [0.98, 0.01, 0.01],
+  ...     [0.01, 0.98, 0.01],
+  ...     [0.01, 0.01, 0.98],
+  ... ]
+  >>> d2_log_loss_score(y_true, y_pred)
+  0.981...
+  >>> y_true = [1, 2, 3]
+  >>> y_pred = [
+  ...     [0.1, 0.6, 0.3],
+  ...     [0.1, 0.6, 0.3],
+  ...     [0.4, 0.5, 0.1],
+  ... ]
+  >>> d2_log_loss_score(y_true, y_pred)
+  -0.552...
+
+|details-end|
+
 .. _visualization_regression_evaluation:
 
 Visual evaluation of regression models
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
index 1fe0df6f97a61..d3064851e7f87 100644
--- a/doc/whats_new/v1.5.rst
+++ b/doc/whats_new/v1.5.rst
@@ -169,7 +169,7 @@ Changelog
 ..........................
 
 - |Fix| Fixed a regression in :class:`calibration.CalibratedClassifierCV` where
-  an error was wrongly raised with string targets.
+  an error was wrongly raised with string targets.
   :pr:`28843` by :user:`Jérémie du Boisberranger `.
 
 :mod:`sklearn.cluster`
@@ -406,6 +406,10 @@ Changelog
   is deprecated and will raise an error in v1.7.
   :pr:`18555` by :user:`Kaushik Amar Das `.
 
+- |Feature| :func:`metrics.d2_log_loss_score` has been added which
+  calculates the D^2 score for the log loss.
+  :pr:`28351` by :user:`Omar Salman `.
+
 :mod:`sklearn.mixture`
 ......................
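A quick standalone check of the definition in the documentation hunk above (not part of the diff itself): D² is one minus the ratio of the model's log loss to the log loss of a null model that predicts the per-class proportions of ``y_true`` for every sample. The sketch below recomputes the second doctest example (the ``0.981...`` value) directly from that definition; variable names such as ``y_null`` are illustrative only, and ``d2_log_loss_score`` is available only once this patch (scikit-learn 1.5) is installed.

import numpy as np
from sklearn.metrics import d2_log_loss_score, log_loss

y_true = [1, 2, 3]
y_pred = [
    [0.98, 0.01, 0.01],
    [0.01, 0.98, 0.01],
    [0.01, 0.01, 0.98],
]

# Null model: every sample is assigned the empirical class proportions
# of y_true (here 1/3 for each of the three classes).
_, counts = np.unique(y_true, return_counts=True)
y_null = np.tile(counts / counts.sum(), (len(y_true), 1))

numerator = log_loss(y_true, y_pred, normalize=False)    # deviance of the model
denominator = log_loss(y_true, y_null, normalize=False)  # deviance of the null model

print(1 - numerator / denominator)        # ~0.981
print(d2_log_loss_score(y_true, y_pred))  # matches the doctest value above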
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 8a818c885043c..af25a219c79f1 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -12,6 +12,7 @@
     classification_report,
     cohen_kappa_score,
     confusion_matrix,
+    d2_log_loss_score,
     f1_score,
     fbeta_score,
     hamming_loss,
@@ -113,6 +114,7 @@
     "coverage_error",
     "d2_tweedie_score",
     "d2_absolute_error_score",
+    "d2_log_loss_score",
     "d2_pinball_score",
     "dcg_score",
     "davies_bouldin_score",
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index caa4db5479a29..04894a4d7a7e7 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -53,7 +53,11 @@
 from ..utils.extmath import _nanaverage
 from ..utils.multiclass import type_of_target, unique_labels
 from ..utils.sparsefuncs import count_nonzero
-from ..utils.validation import _check_pos_label_consistency, _num_samples
+from ..utils.validation import (
+    _check_pos_label_consistency,
+    _check_sample_weight,
+    _num_samples,
+)
 
 
 def _check_zero_division(zero_division):
@@ -3257,3 +3261,96 @@ def brier_score_loss(
         raise
     y_true = np.array(y_true == pos_label, int)
     return np.average((y_true - y_proba) ** 2, weights=sample_weight)
+
+
+@validate_params(
+    {
+        "y_true": ["array-like"],
+        "y_pred": ["array-like"],
+        "sample_weight": ["array-like", None],
+        "labels": ["array-like", None],
+    },
+    prefer_skip_nested_validation=True,
+)
+def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None):
+    """
+    :math:`D^2` score function, fraction of log loss explained.
+
+    Best possible score is 1.0 and it can be negative (because the model can be
+    arbitrarily worse). A model that always predicts the per-class proportions
+    of `y_true`, disregarding the input features, gets a D^2 score of 0.0.
+
+    Read more in the :ref:`User Guide `.
+
+    .. versionadded:: 1.5
+
+    Parameters
+    ----------
+    y_true : array-like or label indicator matrix
+        The actual labels for the n_samples samples.
+
+    y_pred : array-like of shape (n_samples, n_classes) or (n_samples,)
+        Predicted probabilities, as returned by a classifier's
+        predict_proba method. If ``y_pred.shape = (n_samples,)``
+        the probabilities provided are assumed to be that of the
+        positive class. The labels in ``y_pred`` are assumed to be
+        ordered alphabetically, as done by
+        :class:`~sklearn.preprocessing.LabelBinarizer`.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    labels : array-like, default=None
+        If not provided, labels will be inferred from y_true. If ``labels``
+        is ``None`` and ``y_pred`` has shape (n_samples,) the labels are
+        assumed to be binary and are inferred from ``y_true``.
+
+    Returns
+    -------
+    d2 : float or ndarray of floats
+        The D^2 score.
+
+    Notes
+    -----
+    This is not a symmetric function.
+
+    Like R^2, D^2 score may be negative (it need not actually be the square of
+    a quantity D).
+
+    This metric is not well-defined for a single sample and will return a NaN
+    value if n_samples is less than two.
+    """
+    y_pred = check_array(y_pred, ensure_2d=False, dtype="numeric")
+    check_consistent_length(y_pred, y_true, sample_weight)
+    if _num_samples(y_pred) < 2:
+        msg = "D^2 score is not well-defined with less than two samples."
+        warnings.warn(msg, UndefinedMetricWarning)
+        return float("nan")
+
+    # log loss of the fitted model
+    numerator = log_loss(
+        y_true=y_true,
+        y_pred=y_pred,
+        normalize=False,
+        sample_weight=sample_weight,
+        labels=labels,
+    )
+
+    # Proportion of labels in the dataset
+    weights = _check_sample_weight(sample_weight, y_true)
+
+    _, y_value_indices = np.unique(y_true, return_inverse=True)
+    counts = np.bincount(y_value_indices, weights=weights)
+    y_prob = counts / weights.sum()
+    y_pred_null = np.tile(y_prob, (len(y_true), 1))
+
+    # log loss of the null model
+    denominator = log_loss(
+        y_true=y_true,
+        y_pred=y_pred_null,
+        normalize=False,
+        sample_weight=sample_weight,
+        labels=labels,
+    )
+
+    return 1 - (numerator / denominator)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 144871c8d02ee..40b762bfa7308 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -35,7 +35,7 @@
     recall_score,
     zero_one_loss,
 )
-from sklearn.metrics._classification import _check_targets
+from sklearn.metrics._classification import _check_targets, d2_log_loss_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import LabelBinarizer, label_binarize
 from sklearn.tree import DecisionTreeClassifier
@@ -2895,3 +2895,201 @@ def test_brier_score_loss_deprecation_warning():
         y_prob=y_pred,
         y_proba=y_pred,
     )
+
+
+def test_d2_log_loss_score():
+    y_true = [0, 0, 0, 1, 1, 1]
+    y_true_string = ["no", "no", "no", "yes", "yes", "yes"]
+    y_pred = np.array(
+        [
+            [0.5, 0.5],
+            [0.9, 0.1],
+            [0.4, 0.6],
+            [0.6, 0.4],
+            [0.35, 0.65],
+            [0.01, 0.99],
+        ]
+    )
+    y_pred_null = np.array(
+        [
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true=y_true, y_pred=y_pred)
+    log_likelihood = log_loss(y_true=y_true, y_pred=y_pred, normalize=False)
+    log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null, normalize=False)
+    d2_score_true = 1 - log_likelihood / log_likelihood_null
+    assert d2_score == pytest.approx(d2_score_true)
+
+    # check that using sample weight also gives the correct d2 score
+    sample_weight = np.array([2, 1, 3, 4, 3, 1])
+    y_pred_null[:, 0] = sample_weight[:3].sum() / sample_weight.sum()
+    y_pred_null[:, 1] = sample_weight[3:].sum() / sample_weight.sum()
+    d2_score = d2_log_loss_score(
+        y_true=y_true, y_pred=y_pred, sample_weight=sample_weight
+    )
+    log_likelihood = log_loss(
+        y_true=y_true,
+        y_pred=y_pred,
+        sample_weight=sample_weight,
+        normalize=False,
+    )
+    log_likelihood_null = log_loss(
+        y_true=y_true,
+        y_pred=y_pred_null,
+        sample_weight=sample_weight,
+        normalize=False,
+    )
+    d2_score_true = 1 - log_likelihood / log_likelihood_null
+    assert d2_score == pytest.approx(d2_score_true)
+
+    # check if good predictions give a relatively higher value for the d2 score
+    y_pred = np.array(
+        [
+            [0.9, 0.1],
+            [0.8, 0.2],
+            [0.9, 0.1],
+            [0.1, 0.9],
+            [0.2, 0.8],
+            [0.1, 0.9],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true, y_pred)
+    assert 0.5 < d2_score < 1.0
+    # check that a similar value is obtained for string labels
+    d2_score_string = d2_log_loss_score(y_true_string, y_pred)
+    assert d2_score_string == pytest.approx(d2_score)
+
+    # check if poor predictions give a relatively low value for the d2 score
+    y_pred = np.array(
+        [
+            [0.5, 0.5],
+            [0.1, 0.9],
+            [0.1, 0.9],
+            [0.9, 0.1],
+            [0.75, 0.25],
+            [0.1, 0.9],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true, y_pred)
+    assert d2_score < 0
+    # check that a similar value is obtained for string labels
+    d2_score_string = d2_log_loss_score(y_true_string, y_pred)
+    assert d2_score_string == pytest.approx(d2_score)
+
+    # check if simply using the average of the classes as the predictions
+    # gives a d2 score of 0
+    y_true = [0, 0, 0, 1, 1, 1]
+    y_pred = np.array(
+        [
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+            [0.5, 0.5],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true, y_pred)
+    assert d2_score == 0
+    d2_score_string = d2_log_loss_score(y_true_string, y_pred)
+    assert d2_score_string == 0
+
+    # check if simply using the average of the classes as the predictions
+    # gives a d2 score of 0 when the positive class has a higher proportion
+    y_true = [0, 1, 1, 1]
+    y_true_string = ["no", "yes", "yes", "yes"]
+    y_pred = np.array([[0.25, 0.75], [0.25, 0.75], [0.25, 0.75], [0.25, 0.75]])
+    d2_score = d2_log_loss_score(y_true, y_pred)
+    assert d2_score == 0
+    d2_score_string = d2_log_loss_score(y_true_string, y_pred)
+    assert d2_score_string == 0
+    sample_weight = [2, 2, 2, 2]
+    d2_score_with_sample_weight = d2_log_loss_score(
+        y_true, y_pred, sample_weight=sample_weight
+    )
+    assert d2_score_with_sample_weight == 0
+
+    # check that the d2 scores seem correct when more than 2
+    # labels are specified
+    y_true = ["high", "high", "low", "neutral"]
+    sample_weight = [1.4, 0.6, 0.8, 0.2]
+
+    y_pred = np.array(
+        [
+            [0.8, 0.1, 0.1],
+            [0.8, 0.1, 0.1],
+            [0.1, 0.8, 0.1],
+            [0.1, 0.1, 0.8],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true, y_pred)
+    assert 0.5 < d2_score < 1.0
+    d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight)
+    assert 0.5 < d2_score < 1.0
+
+    y_pred = np.array(
+        [
+            [0.2, 0.5, 0.3],
+            [0.1, 0.7, 0.2],
+            [0.1, 0.1, 0.8],
+            [0.2, 0.7, 0.1],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true, y_pred)
+    assert d2_score < 0
+    d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight)
+    assert d2_score < 0
+
+
+def test_d2_log_loss_score_raises():
+    """Test that d2_log_loss_score raises an error on invalid input."""
+    y_true = [0, 1, 2]
+    y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]]
+    err = "contain different number of classes"
+    with pytest.raises(ValueError, match=err):
+        d2_log_loss_score(y_true, y_pred)
+
+    # check error if the number of classes in labels does not match the number
+    # of classes in y_pred.
+ y_true = ["a", "b", "c"] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + labels = [0, 1, 2] + err = "number of classes in labels is different" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) + + # check error if y_true and y_pred do not have equal lengths + y_true = [0, 1, 2] + y_pred = [[0.5, 0.5, 0.5], [0.6, 0.3, 0.1]] + err = "inconsistent numbers of samples" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check warning for samples < 2 + y_true = [1] + y_pred = [[0.5, 0.5]] + err = "score is not well-defined" + with pytest.warns(UndefinedMetricWarning, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label + y_true = [1, 1, 1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 5]] + err = "y_true contains only one label" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label and labels also has + # only 1 label + y_true = [1, 1, 1] + labels = [1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 5]] + err = "The labels array needs to contain at least two" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels)