From cbe28f60bfda9ede87f69200e149a79338f9be38 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 2 Feb 2024 15:21:13 +0500 Subject: [PATCH 01/12] Minor refactoring --- sklearn/metrics/_classification.py | 47 ++++++++++++++++++++ sklearn/metrics/tests/test_classification.py | 25 ++++++++++- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 9a592fbbb2c24..bd7f1b6c48499 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3242,3 +3242,50 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): raise y_true = np.array(y_true == pos_label, int) return np.average((y_true - y_prob) ** 2, weights=sample_weight) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "eps": [StrOptions({"auto"}), Interval(Real, 0, 1, closed="both")], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels=None): + y_pred = check_array( + y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] + ) + check_consistent_length(y_pred, y_true, sample_weight) + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + # log likelihood of the fitted model + numerator = -log_loss( + y_true=y_true, + y_pred=y_pred, + eps=eps, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + # Proportion of positive class labels in the dataset + p_null = np.mean(y_true) + y_pred_null = np.full_like(y_pred, p_null) + + # log likelihood of the null model + denominator = -log_loss( + y_true=y_true, + y_pred=y_pred_null, + eps=eps, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + return 1 - (numerator / denominator) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index abf1aae487599..037ef1a465b8d 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -35,7 +35,7 @@ recall_score, zero_one_loss, ) -from sklearn.metrics._classification import _check_targets +from sklearn.metrics._classification import _check_targets, d2_log_loss_score from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelBinarizer, label_binarize from sklearn.tree import DecisionTreeClassifier @@ -2864,3 +2864,26 @@ def test_classification_metric_division_by_zero_nan_validaton(scoring): X, y = datasets.make_classification(random_state=0) classifier = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y) cross_val_score(classifier, X, y, scoring=scoring, n_jobs=2, error_score="raise") + + +def test_d2_log_loss(): + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [[0.9, 0.1], [0.8, 0.2], [0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.1, 0.9]] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [[0.5, 0.5], [0.1, 0.9], [0.1, 0.9], [0.9, 0.1], [0.75, 0.25], [0.1, 0.9]] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 From 
00e015dab36b7ea810bbc227f5dd57f5225e661b Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:18:00 +0500 Subject: [PATCH 02/12] Add functionality to handle multiple classes, add further tests and add changelog --- doc/whats_new/v1.5.rst | 4 + sklearn/metrics/__init__.py | 2 + sklearn/metrics/_classification.py | 71 +++++++- sklearn/metrics/tests/test_classification.py | 175 ++++++++++++++++++- 4 files changed, 245 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 374e817b5f4c8..10bb621f7226e 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -112,6 +112,10 @@ Changelog :class:`~calibration.CalibrationDisplay`. :pr:`28051` by :user:`Pierre de Fréminville `. +- |Feature| :func:`metrics.d2_log_loss_score` has been added and this + calculates the D^2 score for the log loss. + :pr:`28351` by :user:`Omar Salman `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 713c5fe651dbb..d4ce0114d9af3 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -13,6 +13,7 @@ classification_report, cohen_kappa_score, confusion_matrix, + d2_log_loss_score, f1_score, fbeta_score, hamming_loss, @@ -114,6 +115,7 @@ "coverage_error", "d2_tweedie_score", "d2_absolute_error_score", + "d2_log_loss_score", "d2_pinball_score", "dcg_score", "davies_bouldin_score", diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index bd7f1b6c48499..22503d75ee6a9 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3255,6 +3255,70 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): prefer_skip_nested_validation=True, ) def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels=None): + """ + :math:`D^2` regression score function, fraction of Tweedie deviance explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical mean of `y_true` as + constant prediction, disregarding the input features, gets a D^2 score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + eps : float or "auto", default="auto" + Log loss is undefined for p=0 or p=1, so probabilities are + clipped to `max(eps, min(1 - eps, p))`. The default will depend on the + data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.2 + The default value changed from `1e-15` to `"auto"` that is + equivalent to `np.finfo(y_pred.dtype).eps`. + + .. deprecated:: 1.3 + `eps` is deprecated in 1.3 and will be removed in 1.5. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. 
If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + .. versionadded:: 0.18 + + Returns + ------- + z : float or ndarray of floats + The D^2 score. + + Notes + ----- + This is not a symmetric function. + + Like R^2, D^2 score may be negative (it need not actually be the square of + a quantity D). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + """ y_pred = check_array( y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] ) @@ -3274,9 +3338,10 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= labels=labels, ) - # Proportion of positive class labels in the dataset - p_null = np.mean(y_true) - y_pred_null = np.full_like(y_pred, p_null) + # Proportion of labels in the dataset + y_values, counts = np.unique(y_true, return_counts=True) + y_prob = counts / len(y_true) + y_pred_null = np.tile(y_prob, (len(y_true), 1)) # log likelihood of the null model denominator = -log_loss( diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 037ef1a465b8d..9e4770642dc1a 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2866,24 +2866,191 @@ def test_classification_metric_division_by_zero_nan_validaton(scoring): cross_val_score(classifier, X, y, scoring=scoring, n_jobs=2, error_score="raise") -def test_d2_log_loss(): +def test_d2_log_loss_score(): + # compare the d2 score value computed using the bernoulli log pmf + # with the d2 score computed using the function. The values in y_true + # are defined such that "no" corresponds to 0 and "yes" to 1. + y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = np.array( + [ + [0.5, 0.5], + [0.9, 0.1], + [0.4, 0.6], + [0.6, 0.4], + [0.35, 0.65], + [0.01, 0.99], + ] + ) + y_pred_null = np.array( + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + log_likelihood = np.mean(bernoulli.logpmf(np.array(y_true) == "yes", y_pred[:, 1])) + log_likelihood_null = np.mean( + bernoulli.logpmf(np.array(y_true) == "yes", y_pred_null[:, 1]) + ) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert_almost_equal(d2_score, d2_score_true) + + # check if good predictions give a relatively higher value for the d2 score y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( - [[0.9, 0.1], [0.8, 0.2], [0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.1, 0.9]] + [ + [0.9, 0.1], + [0.8, 0.2], + [0.9, 0.1], + [0.1, 0.9], + [0.2, 0.8], + [0.1, 0.9], + ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert 0.5 < d2_score < 1.0 + # check if poor predictions gives a relatively low value for the d2 score. 
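+    # (with balanced classes the null model predicts 0.5 for every sample, so
+    # predictions that are confidently wrong on most samples incur a higher
+    # log loss than the null model and push the D^2 score below zero)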
y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( - [[0.5, 0.5], [0.1, 0.9], [0.1, 0.9], [0.9, 0.1], [0.75, 0.25], [0.1, 0.9]] + [ + [0.5, 0.5], + [0.1, 0.9], + [0.1, 0.9], + [0.9, 0.1], + [0.75, 0.25], + [0.1, 0.9], + ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( - [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 when the positive class has a higher proportion. + y_true = [0, 1, 1, 1] + y_pred = np.array([[0.25, 0.75], [0.25, 0.75], [0.25, 0.75], [0.25, 0.75]]) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 where the y_true values are characters. + y_true = ["b", "c", "a", "d"] + y_pred = np.array( + [ + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 where the y_true values are characters and "a" + # has a relatively higher proportion. + y_true = ["b", "c", "a", "a"] + y_pred = np.array( + [ + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 + + # check if good predictions for character values gives a relatively + # higher d2 score. + y_true = ["b", "c", "a", "a"] + y_pred = np.array( + [ + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + + # check if poor predictions for character values gives a relatively + # low value for the d2 score. + y_true = ["b", "c", "a", "a"] + y_pred = np.array( + [ + [0.6, 0.3, 0.1], + [0.5, 0.2, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + + # check error if the number of classes are not equal. + y_true = [0, 1, 2] + y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] + err = "contain different number of classes" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error if the number of classes in labels do not match the number + # of classes in y_pred. 
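+    # (this check happens inside log_loss, to which d2_log_loss_score
+    # forwards the labels argument)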
+ y_true = ["a", "b", "c"] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + labels = [0, 1, 2] + err = "number of classes in labels is different" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) + + # check error if y_true and y_pred do not have equal lengths + y_true = [0, 1, 2] + y_pred = [[0.5, 0.5, 0.5], [0.6, 0.3, 0.1]] + err = "inconsistent numbers of samples" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check warning for samples < 2 + y_true = [1] + y_pred = [[0.5, 0.5]] + err = "score is not well-defined" + with pytest.warns(UndefinedMetricWarning, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label + y_true = [1, 1, 1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 5]] + err = "y_true contains only one label" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label and labels also has + # only 1 label + y_true = [1, 1, 1] + labels = [1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 5]] + err = "The labels array needs to contain at least two" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) From d3dc7a4eade21bbc156cec9dc0edad533a254346 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:43:19 +0500 Subject: [PATCH 03/12] Fix docstring --- sklearn/metrics/_classification.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 22503d75ee6a9..29227ac97e26c 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3317,7 +3317,6 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= This metric is not well-defined for single samples and will return a NaN value if n_samples is less than two. 
- """ y_pred = check_array( y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] @@ -3328,8 +3327,8 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= warnings.warn(msg, UndefinedMetricWarning) return float("nan") - # log likelihood of the fitted model - numerator = -log_loss( + # log loss of the fitted model + numerator = log_loss( y_true=y_true, y_pred=y_pred, eps=eps, @@ -3343,8 +3342,8 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= y_prob = counts / len(y_true) y_pred_null = np.tile(y_prob, (len(y_true), 1)) - # log likelihood of the null model - denominator = -log_loss( + # log loss of the null model + denominator = log_loss( y_true=y_true, y_pred=y_pred_null, eps=eps, From d9126803130ce2d00fff1fd0a211161f9862a845 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:45:45 +0500 Subject: [PATCH 04/12] Minor adjustment --- sklearn/metrics/_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 29227ac97e26c..00a1952d56cb0 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3254,7 +3254,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): }, prefer_skip_nested_validation=True, ) -def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels=None): +def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels=None): """ :math:`D^2` regression score function, fraction of Tweedie deviance explained. @@ -3279,6 +3279,9 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= ordered alphabetically, as done by :class:`~sklearn.preprocessing.LabelBinarizer`. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + eps : float or "auto", default="auto" Log loss is undefined for p=0 or p=1, so probabilities are clipped to `max(eps, min(1 - eps, p))`. The default will depend on the @@ -3293,9 +3296,6 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= .. deprecated:: 1.3 `eps` is deprecated in 1.3 and will be removed in 1.5. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. - labels : array-like, default=None If not provided, labels will be inferred from y_true. If ``labels`` is ``None`` and ``y_pred`` has shape (n_samples,) the labels are From fb4ad0cd7c0bd1f2f554a0d88166f9eb2d8ed5b1 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:59:58 +0500 Subject: [PATCH 05/12] Fix the title of the doc --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 00a1952d56cb0..e36f078b6373e 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3256,7 +3256,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): ) def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels=None): """ - :math:`D^2` regression score function, fraction of Tweedie deviance explained. + :math:`D^2` score function, fraction of log loss explained. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
A model that always uses the empirical mean of `y_true` as From 78c00bc356e51833b1bb731e7f9393ec40e7a43a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 12 Feb 2024 14:43:07 +0500 Subject: [PATCH 06/12] Add the documentation for d2 log loss --- doc/modules/model_evaluation.rst | 46 ++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index b60407bf1a12a..5d849f5332e81 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2761,6 +2761,52 @@ Here are some usage examples of the :func:`d2_absolute_error_score` function:: >>> d2_absolute_error_score(y_true, y_pred) 0.0 +D² log loss score +^^^^^^^^^^^^^^^^^^^^^^^ +The :func:`d2_log_loss_score` function implements the special case +of D² with the log loss, see :ref:`log_loss`, i.e.: + +.. math:: + + \text{dev}(y, \hat{y}) = \text{log_loss}(y, \hat{y}). + +Here are some usage examples of the :func:`d2_log_loss_score` function:: + + >>> from sklearn.metrics import d2_log_loss_score + >>> y_true = [1, 1, 2, 3] + >>> y_pred = [ + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [ + ... [0.98, 0.01, 0.01], + ... [0.01, 0.98, 0.01], + ... [0.01, 0.01, 0.98], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + 0.981... + >>> y_true = [1, 2, 3] + >>> y_pred = [ + ... [0.1, 0.6, 0.3], + ... [0.1, 0.6, 0.3], + ... [0.4, 0.5, 0.1], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + -0.552... + >>> y_true = [0, 0, 1, 1] + >>> y_pred = [0.5, 0.5, 0.5, 0.5] + >>> d2_log_loss_score(y_true, y_pred) + 0.0 + >>> y_true = [0, 0, 1, 1] + >>> y_pred = [0.8, 0.8, 0.2, 0.2] + >>> d2_log_loss_score(y_true, y_pred) + -1.321... + .. _visualization_regression_evaluation: Visual evaluation of regression models From 7d68e344c7b4b880859620b469b362249a65b9ae Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Sat, 30 Mar 2024 12:58:08 +0500 Subject: [PATCH 07/12] Address suggestions provided on PR --- sklearn/metrics/_classification.py | 34 ++++---------- sklearn/metrics/tests/test_classification.py | 48 ++++++++++++++++++++ 2 files changed, 58 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 772fb2534b47e..2b3215f3bee5c 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3297,7 +3297,7 @@ def brier_score_loss( }, prefer_skip_nested_validation=True, ) -def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels=None): +def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): """ :math:`D^2` score function, fraction of log loss explained. @@ -3307,12 +3307,10 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels= Read more in the :ref:`User Guide `. - .. versionadded:: 1.5 - Parameters ---------- y_true : array-like or label indicator matrix - Ground truth (correct) labels for n_samples samples. + The actuals labels for the n_samples samples. y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) Predicted probabilities, as returned by a classifier's @@ -3325,27 +3323,11 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels= sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
- eps : float or "auto", default="auto" - Log loss is undefined for p=0 or p=1, so probabilities are - clipped to `max(eps, min(1 - eps, p))`. The default will depend on the - data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`. - - .. versionadded:: 1.2 - - .. versionchanged:: 1.2 - The default value changed from `1e-15` to `"auto"` that is - equivalent to `np.finfo(y_pred.dtype).eps`. - - .. deprecated:: 1.3 - `eps` is deprecated in 1.3 and will be removed in 1.5. - labels : array-like, default=None If not provided, labels will be inferred from y_true. If ``labels`` is ``None`` and ``y_pred`` has shape (n_samples,) the labels are assumed to be binary and are inferred from ``y_true``. - .. versionadded:: 0.18 - Returns ------- z : float or ndarray of floats @@ -3374,22 +3356,26 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels= numerator = log_loss( y_true=y_true, y_pred=y_pred, - eps=eps, normalize=False, sample_weight=sample_weight, labels=labels, ) # Proportion of labels in the dataset - y_values, counts = np.unique(y_true, return_counts=True) - y_prob = counts / len(y_true) + if sample_weight is not None: + weights = np.asarray(sample_weight) + else: + weights = np.ones(shape=len(y_true), dtype=np.int64) + + _, y_value_indices = np.unique(y_true, return_inverse=True) + counts = np.bincount(y_value_indices, weights=weights) + y_prob = counts / weights.sum() y_pred_null = np.tile(y_prob, (len(y_true), 1)) # log loss of the null model denominator = log_loss( y_true=y_true, y_pred=y_pred_null, - eps=eps, normalize=False, sample_weight=sample_weight, labels=labels, diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index f346314616a42..b225826e90cb9 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -3100,3 +3100,51 @@ def test_d2_log_loss_score(): err = "The labels array needs to contain at least two" with pytest.raises(ValueError, match=err): d2_log_loss_score(y_true, y_pred, labels=labels) + + # Some tests with sample weights + + # check if poor predictions for character values gives a relatively + # low value for the d2 score when sample weights are also given. + y_true = ["h", "d", "c", "c"] + sample_weight = [1.0, 0.5, 0.5, 1.0] + y_pred = np.array( + [ + [0.6, 0.3, 0.1], + [0.5, 0.2, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert d2_score < 0 + + # check if good predictions for character values gives a relatively + # higher d2 score when sample weights are also given. + y_true = ["orange", "strawberry", "apple", "apple"] + sample_weight = [3, 2, 1, 4] + y_pred = np.array( + [ + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert 0.5 < d2_score < 1.0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 where the y_true values are characters and "okay" + # has a relatively higher proportion, when sample weights are also given. 
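+    # (the labels used below are "low"/"neutral"/"high"; with equal sample
+    # weights the weighted class proportions, and hence the null model, are
+    # the same as in the unweighted case)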
+    y_true = ["low", "neutral", "high", "high"]
+    sample_weight = [2, 2, 2, 2]
+    y_pred = np.array(
+        [
+            [0.5, 0.25, 0.25],
+            [0.5, 0.25, 0.25],
+            [0.5, 0.25, 0.25],
+            [0.5, 0.25, 0.25],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight)
+    assert d2_score == 0
\ No newline at end of file

From 04f7a38e5394b7716c6582694aa4bb5c6c5c2505 Mon Sep 17 00:00:00 2001
From: Omar Salman
Date: Sat, 30 Mar 2024 13:02:36 +0500
Subject: [PATCH 08/12] Fix linting

---
 sklearn/metrics/tests/test_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index b225826e90cb9..591aa6d8703ec 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -3147,4 +3147,4 @@ def test_d2_log_loss_score():
         ]
     )
     d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight)
-    assert d2_score == 0
\ No newline at end of file
+    assert d2_score == 0

From e1f6ae5d9cf75852278313793ea7a714e4d83fa2 Mon Sep 17 00:00:00 2001
From: Omar Salman
Date: Fri, 12 Apr 2024 14:37:57 +0500
Subject: [PATCH 09/12] Address PR suggestions

---
 sklearn/metrics/_classification.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 67dc4d3325354..479ab5d4231e2 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -3290,7 +3290,6 @@ def brier_score_loss(
     {
         "y_true": ["array-like"],
         "y_pred": ["array-like"],
-        "eps": [StrOptions({"auto"}), Interval(Real, 0, 1, closed="both")],
         "sample_weight": ["array-like", None],
         "labels": ["array-like", None],
     },
@@ -3339,12 +3338,10 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None):
     Like R^2, D^2 score may be negative (it need not actually be the square of
     a quantity D).

-    This metric is not well-defined for single samples and will return a NaN
+    This metric is not well-defined for a single sample and will return a NaN
     value if n_samples is less than two.
     """
-    y_pred = check_array(
-        y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
-    )
+    y_pred = check_array(y_pred, ensure_2d=False, dtype="numeric")
     check_consistent_length(y_pred, y_true, sample_weight)
     if _num_samples(y_pred) < 2:
         msg = "D^2 score is not well-defined with less than two samples."

From bfc32ebad404b4f0a9f34df6723af02fda6ae7de Mon Sep 17 00:00:00 2001
From: Omar Salman
Date: Mon, 22 Apr 2024 14:55:30 +0500
Subject: [PATCH 10/12] Address PR suggestions: improve organization of tests

---
 doc/modules/model_evaluation.rst             |   8 --
 doc/whats_new/v1.5.rst                       |   2 +-
 sklearn/metrics/_classification.py           |   2 +-
 sklearn/metrics/tests/test_classification.py | 127 ++++++++-----------
 4 files changed, 53 insertions(+), 86 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 5e37c68dfc2cf..c56b1bc01170b 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2865,14 +2865,6 @@ Here are some usage examples of the :func:`d2_log_loss_score` function::
     ...     ]
     >>> d2_log_loss_score(y_true, y_pred)
     -0.552...
-    >>> y_true = [0, 0, 1, 1]
-    >>> y_pred = [0.5, 0.5, 0.5, 0.5]
-    >>> d2_log_loss_score(y_true, y_pred)
-    0.0
-    >>> y_true = [0, 0, 1, 1]
-    >>> y_pred = [0.8, 0.8, 0.2, 0.2]
-    >>> d2_log_loss_score(y_true, y_pred)
-    -1.321...

|details-end| diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index a7edeec12d3e8..cbd49e98adaae 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -354,7 +354,7 @@ Changelog is deprecated and will raise an error in v1.7. :pr:`18555` by :user:`Kaushik Amar Das `. -- |Feature| :func:`metrics.d2_log_loss_score` has been added and this +- |Feature| :func:`metrics.d2_log_loss_score` has been added which calculates the D^2 score for the log loss. :pr:`28351` by :user:`Omar Salman `. diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index ac7cf6cd01371..883111b09785f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3328,7 +3328,7 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): Returns ------- - z : float or ndarray of floats + d2 : float or ndarray of floats The D^2 score. Notes diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index adef0b4eeb845..1f8647c5419f8 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2908,10 +2908,7 @@ def test_brier_score_loss_deprecation_warning(): def test_d2_log_loss_score(): - # compare the d2 score value computed using the bernoulli log pmf - # with the d2 score computed using the function. The values in y_true - # are defined such that "no" corresponds to 0 and "yes" to 1. - y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( [ [0.5, 0.5], @@ -2933,10 +2930,8 @@ def test_d2_log_loss_score(): ] ) d2_score = d2_log_loss_score(y_true, y_pred) - log_likelihood = np.mean(bernoulli.logpmf(np.array(y_true) == "yes", y_pred[:, 1])) - log_likelihood_null = np.mean( - bernoulli.logpmf(np.array(y_true) == "yes", y_pred_null[:, 1]) - ) + log_likelihood = log_loss(y_true=y_true, y_pred=y_pred) + log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null) d2_score_true = 1 - log_likelihood / log_likelihood_null assert_almost_equal(d2_score, d2_score_true) @@ -2993,24 +2988,26 @@ def test_d2_log_loss_score(): d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 + +def test_d2_log_loss_score_with_y_true_as_strings(): + y_true = ["high", "high", "low", "neutral"] + # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 where the y_true values are characters. - y_true = ["b", "c", "a", "d"] + # gives a d2 score of 0. y_pred = np.array( [ - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 where the y_true values are characters and "a" - # has a relatively higher proportion. - y_true = ["b", "c", "a", "a"] + # gives a d2 score of 0 when sample weights are also given. + sample_weight = [2, 2, 2, 2] y_pred = np.array( [ [0.5, 0.25, 0.25], @@ -3019,37 +3016,63 @@ def test_d2_log_loss_score(): [0.5, 0.25, 0.25], ] ) - d2_score = d2_log_loss_score(y_true, y_pred) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert d2_score == 0 - # check if good predictions for character values gives a relatively - # higher d2 score. 
- y_true = ["b", "c", "a", "a"] + # check if good predictions give a relatively high d2 score. y_pred = np.array( [ + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + + # check if good predictions give a relatively high d2 score + # when sample weights are also given. + sample_weight = [3, 2, 1, 4] + y_pred = np.array( + [ [0.8, 0.1, 0.1], [0.8, 0.1, 0.1], + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], ] ) - d2_score = d2_log_loss_score(y_true, y_pred) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert 0.5 < d2_score < 1.0 - # check if poor predictions for character values gives a relatively - # low value for the d2 score. - y_true = ["b", "c", "a", "a"] + # check if poor predictions give a relatively low d2 score. y_pred = np.array( [ - [0.6, 0.3, 0.1], - [0.5, 0.2, 0.3], - [0.3, 0.4, 0.3], + [0.3, 0.6, 0.1], + [0.3, 0.2, 0.5], + [0.4, 0.3, 0.3], [0.4, 0.5, 0.1], ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 + # check if poor predictions give a relatively low d2 score + # when sample weights are also given. + sample_weight = [1.0, 0.5, 0.5, 1.0] + y_pred = np.array( + [ + [0.3, 0.6, 0.1], + [0.3, 0.2, 0.5], + [0.4, 0.3, 0.3], + [0.4, 0.5, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert d2_score < 0 + + +def test_d2_log_loss_score_errors(): # check error if the number of classes are not equal. y_true = [0, 1, 2] y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] @@ -3095,51 +3118,3 @@ def test_d2_log_loss_score(): err = "The labels array needs to contain at least two" with pytest.raises(ValueError, match=err): d2_log_loss_score(y_true, y_pred, labels=labels) - - # Some tests with sample weights - - # check if poor predictions for character values gives a relatively - # low value for the d2 score when sample weights are also given. - y_true = ["h", "d", "c", "c"] - sample_weight = [1.0, 0.5, 0.5, 1.0] - y_pred = np.array( - [ - [0.6, 0.3, 0.1], - [0.5, 0.2, 0.3], - [0.3, 0.4, 0.3], - [0.4, 0.5, 0.1], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert d2_score < 0 - - # check if good predictions for character values gives a relatively - # higher d2 score when sample weights are also given. - y_true = ["orange", "strawberry", "apple", "apple"] - sample_weight = [3, 2, 1, 4] - y_pred = np.array( - [ - [0.1, 0.8, 0.1], - [0.1, 0.1, 0.8], - [0.8, 0.1, 0.1], - [0.8, 0.1, 0.1], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert 0.5 < d2_score < 1.0 - - # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 where the y_true values are characters and "okay" - # has a relatively higher proportion, when sample weights are also given. 
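+    # (weights rescale each sample's contribution to both the model's and the
+    # null model's log loss, so systematically poor predictions still give a
+    # negative D^2)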
- y_true = ["low", "neutral", "high", "high"] - sample_weight = [2, 2, 2, 2] - y_pred = np.array( - [ - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert d2_score == 0 From c17809e940e406530854b79c3887fb521d5aed23 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 22 Apr 2024 17:37:45 +0500 Subject: [PATCH 11/12] Refine tests --- sklearn/metrics/tests/test_classification.py | 123 ++++++++----------- 1 file changed, 54 insertions(+), 69 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 1f8647c5419f8..ba0e7f9c81306 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2909,6 +2909,7 @@ def test_brier_score_loss_deprecation_warning(): def test_d2_log_loss_score(): y_true = [0, 0, 0, 1, 1, 1] + y_true_string = ["no", "no", "no", "yes", "yes", "yes"] y_pred = np.array( [ [0.5, 0.5], @@ -2929,14 +2930,35 @@ def test_d2_log_loss_score(): [0.5, 0.5], ] ) - d2_score = d2_log_loss_score(y_true, y_pred) - log_likelihood = log_loss(y_true=y_true, y_pred=y_pred) - log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null) + d2_score = d2_log_loss_score(y_true=y_true, y_pred=y_pred) + log_likelihood = log_loss(y_true=y_true, y_pred=y_pred, normalize=False) + log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null, normalize=False) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert d2_score == pytest.approx(d2_score_true) + + # check that using sample weight also gives the correct d2 score + sample_weight = np.array([2, 1, 3, 4, 3, 1]) + y_pred_null[:, 0] = sample_weight[:3].sum() / sample_weight.sum() + y_pred_null[:, 1] = sample_weight[3:].sum() / sample_weight.sum() + d2_score = d2_log_loss_score( + y_true=y_true, y_pred=y_pred, sample_weight=sample_weight + ) + log_likelihood = log_loss( + y_true=y_true, + y_pred=y_pred, + sample_weight=sample_weight, + normalize=False, + ) + log_likelihood_null = log_loss( + y_true=y_true, + y_pred=y_pred_null, + sample_weight=sample_weight, + normalize=False, + ) d2_score_true = 1 - log_likelihood / log_likelihood_null - assert_almost_equal(d2_score, d2_score_true) + assert d2_score == pytest.approx(d2_score_true) # check if good predictions give a relatively higher value for the d2 score - y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( [ [0.9, 0.1], @@ -2949,9 +2971,11 @@ def test_d2_log_loss_score(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert 0.5 < d2_score < 1.0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) - # check if poor predictions gives a relatively low value for the d2 score. 
- y_true = [0, 0, 0, 1, 1, 1] + # check if poor predictions gives a relatively low value for the d2 score y_pred = np.array( [ [0.5, 0.5], @@ -2964,6 +2988,9 @@ def test_d2_log_loss_score(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) # check if simply using the average of the classes as the predictions # gives a d2 score of 0 @@ -2980,46 +3007,29 @@ def test_d2_log_loss_score(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 when the positive class has a higher proportion. + # gives a d2 score of 0 when the positive class has a higher proportion y_true = [0, 1, 1, 1] + y_true_string = ["no", "yes", "yes", "yes"] y_pred = np.array([[0.25, 0.75], [0.25, 0.75], [0.25, 0.75], [0.25, 0.75]]) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 - - -def test_d2_log_loss_score_with_y_true_as_strings(): - y_true = ["high", "high", "low", "neutral"] - - # check if simply using the average of the classes as the predictions - # gives a d2 score of 0. - y_pred = np.array( - [ - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred) - assert d2_score == 0 - - # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 when sample weights are also given. + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 sample_weight = [2, 2, 2, 2] - y_pred = np.array( - [ - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - ] + d2_score_with_sample_weight = d2_log_loss_score( + y_true, y_pred, sample_weight=sample_weight ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert d2_score == 0 + assert d2_score_with_sample_weight == 0 + + # check that the d2 scores seem correct when more than 2 + # labels are specified + y_true = ["high", "high", "low", "neutral"] + sample_weight = [1.4, 0.6, 0.8, 0.2] - # check if good predictions give a relatively high d2 score. y_pred = np.array( [ [0.8, 0.1, 0.1], @@ -3030,50 +3040,25 @@ def test_d2_log_loss_score_with_y_true_as_strings(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert 0.5 < d2_score < 1.0 - - # check if good predictions give a relatively high d2 score - # when sample weights are also given. - sample_weight = [3, 2, 1, 4] - y_pred = np.array( - [ - [0.8, 0.1, 0.1], - [0.8, 0.1, 0.1], - [0.1, 0.8, 0.1], - [0.1, 0.1, 0.8], - ] - ) d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert 0.5 < d2_score < 1.0 - # check if poor predictions give a relatively low d2 score. y_pred = np.array( [ - [0.3, 0.6, 0.1], - [0.3, 0.2, 0.5], - [0.4, 0.3, 0.3], - [0.4, 0.5, 0.1], + [0.2, 0.5, 0.3], + [0.1, 0.7, 0.2], + [0.1, 0.1, 0.8], + [0.2, 0.7, 0.1], ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 - - # check if poor predictions give a relatively low d2 score - # when sample weights are also given. 
- sample_weight = [1.0, 0.5, 0.5, 1.0] - y_pred = np.array( - [ - [0.3, 0.6, 0.1], - [0.3, 0.2, 0.5], - [0.4, 0.3, 0.3], - [0.4, 0.5, 0.1], - ] - ) d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert d2_score < 0 -def test_d2_log_loss_score_errors(): - # check error if the number of classes are not equal. +def test_d2_log_loss_score_raises(): + """Test that d2_log_loss raises error on invalid input.""" y_true = [0, 1, 2] y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] err = "contain different number of classes" From 24cf7b0a9d9ecb4118275599ae8cceaa5a67be92 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 2 May 2024 18:14:00 +0500 Subject: [PATCH 12/12] Updates: suggestions provided in review --- doc/modules/model_evaluation.rst | 3 +++ sklearn/metrics/_classification.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c56b1bc01170b..7caacd697ea1c 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2837,6 +2837,9 @@ of D² with the log loss, see :ref:`log_loss`, i.e.: \text{dev}(y, \hat{y}) = \text{log_loss}(y, \hat{y}). +The :math:`y_{\text{null}}` for the :func:`log_loss` is the per-class +proportion. + Here are some usage examples of the :func:`d2_log_loss_score` function:: >>> from sklearn.metrics import d2_log_loss_score diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 883111b09785f..9560e53b3ffa4 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -53,7 +53,11 @@ from ..utils.extmath import _nanaverage from ..utils.multiclass import type_of_target, unique_labels from ..utils.sparsefuncs import count_nonzero -from ..utils.validation import _check_pos_label_consistency, _num_samples +from ..utils.validation import ( + _check_pos_label_consistency, + _check_sample_weight, + _num_samples, +) def _check_zero_division(zero_division): @@ -3305,12 +3309,14 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): Read more in the :ref:`User Guide `. + .. versionadded:: 1.5 + Parameters ---------- y_true : array-like or label indicator matrix The actuals labels for the n_samples samples. - y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + y_pred : array-like of shape (n_samples, n_classes) or (n_samples,) Predicted probabilities, as returned by a classifier's predict_proba method. If ``y_pred.shape = (n_samples,)`` the probabilities provided are assumed to be that of the @@ -3358,10 +3364,7 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): ) # Proportion of labels in the dataset - if sample_weight is not None: - weights = np.asarray(sample_weight) - else: - weights = np.ones(shape=len(y_true), dtype=np.int64) + weights = _check_sample_weight(sample_weight, y_true) _, y_value_indices = np.unique(y_true, return_inverse=True) counts = np.bincount(y_value_indices, weights=weights)
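After [PATCH 12/12] the series settles on the following computation: the score is one minus the ratio of the model's unnormalized log loss to the unnormalized log loss of a null model that always predicts the (sample-weight weighted) class proportions of `y_true`; in effect, McFadden's pseudo-R^2 evaluated on predicted probabilities. The sketch below re-derives the number outside the library. It is an editor's illustration, and `d2_log_loss_sketch` is an invented name, not a scikit-learn API.

import numpy as np
from sklearn.metrics import log_loss

def d2_log_loss_sketch(y_true, y_pred, sample_weight=None):
    # Illustrative re-implementation of the d2_log_loss_score added above.
    y_true = np.asarray(y_true)
    weights = (
        np.ones(len(y_true)) if sample_weight is None else np.asarray(sample_weight)
    )

    # Log loss of the model under evaluation (summed, not averaged).
    numerator = log_loss(y_true, y_pred, normalize=False, sample_weight=weights)

    # Null model: every sample gets the weighted class proportions of y_true.
    _, class_idx = np.unique(y_true, return_inverse=True)
    proportions = np.bincount(class_idx, weights=weights) / weights.sum()
    y_pred_null = np.tile(proportions, (len(y_true), 1))
    denominator = log_loss(
        y_true, y_pred_null, normalize=False, sample_weight=weights
    )

    return 1 - numerator / denominator

y_true = [0, 0, 0, 1, 1, 1]
y_pred = [[0.9, 0.1], [0.8, 0.2], [0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.1, 0.9]]
print(d2_log_loss_sketch(y_true, y_pred))  # about 0.79, inside the 0.5 < d2 < 1.0 band the tests assert

With scikit-learn 1.5 or later, the same inputs passed to `metrics.d2_log_loss_score` should reproduce this value, since both paths go through the same `log_loss` calls.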
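One practical consequence, again as an illustration rather than part of the PR: `DummyClassifier(strategy="prior")` predicts exactly the class proportions that the denominator uses, so scored on its own training data it lands at approximately 0, while any classifier that has learned something from the features scores above it.

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import d2_log_loss_score  # requires scikit-learn >= 1.5

X, y = make_classification(random_state=0)

# The prior dummy reproduces the metric's null model, so D^2 is ~0 here.
dummy = DummyClassifier(strategy="prior").fit(X, y)
print(d2_log_loss_score(y, dummy.predict_proba(X)))

# An informative model should land well above the null baseline.
model = LogisticRegression(max_iter=1000).fit(X, y)
print(d2_log_loss_score(y, model.predict_proba(X)))

This mirrors how `r2_score` behaves for regressors: 0 marks the constant baseline, 1 a perfect fit, and negative values a model doing worse than the baseline.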