From cbe28f60bfda9ede87f69200e149a79338f9be38 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 2 Feb 2024 15:21:13 +0500 Subject: [PATCH 01/12] Minor refactoring --- sklearn/metrics/_classification.py | 47 ++++++++++++++++++++ sklearn/metrics/tests/test_classification.py | 25 ++++++++++- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 9a592fbbb2c24..bd7f1b6c48499 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3242,3 +3242,50 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): raise y_true = np.array(y_true == pos_label, int) return np.average((y_true - y_prob) ** 2, weights=sample_weight) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "eps": [StrOptions({"auto"}), Interval(Real, 0, 1, closed="both")], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels=None): + y_pred = check_array( + y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] + ) + check_consistent_length(y_pred, y_true, sample_weight) + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + # log likelihood of the fitted model + numerator = -log_loss( + y_true=y_true, + y_pred=y_pred, + eps=eps, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + # Proportion of positive class labels in the dataset + p_null = np.mean(y_true) + y_pred_null = np.full_like(y_pred, p_null) + + # log likelihood of the null model + denominator = -log_loss( + y_true=y_true, + y_pred=y_pred_null, + eps=eps, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + return 1 - (numerator / denominator) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index abf1aae487599..037ef1a465b8d 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -35,7 +35,7 @@ recall_score, zero_one_loss, ) -from sklearn.metrics._classification import _check_targets +from sklearn.metrics._classification import _check_targets, d2_log_loss_score from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelBinarizer, label_binarize from sklearn.tree import DecisionTreeClassifier @@ -2864,3 +2864,26 @@ def test_classification_metric_division_by_zero_nan_validaton(scoring): X, y = datasets.make_classification(random_state=0) classifier = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y) cross_val_score(classifier, X, y, scoring=scoring, n_jobs=2, error_score="raise") + + +def test_d2_log_loss(): + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [[0.9, 0.1], [0.8, 0.2], [0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.1, 0.9]] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [[0.5, 0.5], [0.1, 0.9], [0.1, 0.9], [0.9, 0.1], [0.75, 0.25], [0.1, 0.9]] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 From 
00e015dab36b7ea810bbc227f5dd57f5225e661b Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:18:00 +0500 Subject: [PATCH 02/12] Add functionality to handle multiple classes, add further tests and add changelog --- doc/whats_new/v1.5.rst | 4 + sklearn/metrics/__init__.py | 2 + sklearn/metrics/_classification.py | 71 +++++++- sklearn/metrics/tests/test_classification.py | 175 ++++++++++++++++++- 4 files changed, 245 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 374e817b5f4c8..10bb621f7226e 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -112,6 +112,10 @@ Changelog :class:`~calibration.CalibrationDisplay`. :pr:`28051` by :user:`Pierre de Fréminville `. +- |Feature| :func:`metrics.d2_log_loss_score` has been added and this + calculates the D^2 score for the log loss. + :pr:`28351` by :user:`Omar Salman `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 713c5fe651dbb..d4ce0114d9af3 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -13,6 +13,7 @@ classification_report, cohen_kappa_score, confusion_matrix, + d2_log_loss_score, f1_score, fbeta_score, hamming_loss, @@ -114,6 +115,7 @@ "coverage_error", "d2_tweedie_score", "d2_absolute_error_score", + "d2_log_loss_score", "d2_pinball_score", "dcg_score", "davies_bouldin_score", diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index bd7f1b6c48499..22503d75ee6a9 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3255,6 +3255,70 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): prefer_skip_nested_validation=True, ) def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels=None): + """ + :math:`D^2` regression score function, fraction of Tweedie deviance explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical mean of `y_true` as + constant prediction, disregarding the input features, gets a D^2 score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + eps : float or "auto", default="auto" + Log loss is undefined for p=0 or p=1, so probabilities are + clipped to `max(eps, min(1 - eps, p))`. The default will depend on the + data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.2 + The default value changed from `1e-15` to `"auto"` that is + equivalent to `np.finfo(y_pred.dtype).eps`. + + .. deprecated:: 1.3 + `eps` is deprecated in 1.3 and will be removed in 1.5. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. 
If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + .. versionadded:: 0.18 + + Returns + ------- + z : float or ndarray of floats + The D^2 score. + + Notes + ----- + This is not a symmetric function. + + Like R^2, D^2 score may be negative (it need not actually be the square of + a quantity D). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + """ y_pred = check_array( y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] ) @@ -3274,9 +3338,10 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= labels=labels, ) - # Proportion of positive class labels in the dataset - p_null = np.mean(y_true) - y_pred_null = np.full_like(y_pred, p_null) + # Proportion of labels in the dataset + y_values, counts = np.unique(y_true, return_counts=True) + y_prob = counts / len(y_true) + y_pred_null = np.tile(y_prob, (len(y_true), 1)) # log likelihood of the null model denominator = -log_loss( diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 037ef1a465b8d..9e4770642dc1a 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2866,24 +2866,191 @@ def test_classification_metric_division_by_zero_nan_validaton(scoring): cross_val_score(classifier, X, y, scoring=scoring, n_jobs=2, error_score="raise") -def test_d2_log_loss(): +def test_d2_log_loss_score(): + # compare the d2 score value computed using the bernoulli log pmf + # with the d2 score computed using the function. The values in y_true + # are defined such that "no" corresponds to 0 and "yes" to 1. + y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = np.array( + [ + [0.5, 0.5], + [0.9, 0.1], + [0.4, 0.6], + [0.6, 0.4], + [0.35, 0.65], + [0.01, 0.99], + ] + ) + y_pred_null = np.array( + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + log_likelihood = np.mean(bernoulli.logpmf(np.array(y_true) == "yes", y_pred[:, 1])) + log_likelihood_null = np.mean( + bernoulli.logpmf(np.array(y_true) == "yes", y_pred_null[:, 1]) + ) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert_almost_equal(d2_score, d2_score_true) + + # check if good predictions give a relatively higher value for the d2 score y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( - [[0.9, 0.1], [0.8, 0.2], [0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.1, 0.9]] + [ + [0.9, 0.1], + [0.8, 0.2], + [0.9, 0.1], + [0.1, 0.9], + [0.2, 0.8], + [0.1, 0.9], + ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert 0.5 < d2_score < 1.0 + # check if poor predictions gives a relatively low value for the d2 score. 
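+    # (with balanced classes the null model predicts 0.5 for every sample, so
+    # predictions that are confidently wrong on most samples incur a higher
+    # log loss than the null model and push the D^2 score below zero)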
y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( - [[0.5, 0.5], [0.1, 0.9], [0.1, 0.9], [0.9, 0.1], [0.75, 0.25], [0.1, 0.9]] + [ + [0.5, 0.5], + [0.1, 0.9], + [0.1, 0.9], + [0.9, 0.1], + [0.75, 0.25], + [0.1, 0.9], + ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( - [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 when the positive class has a higher proportion. + y_true = [0, 1, 1, 1] + y_pred = np.array([[0.25, 0.75], [0.25, 0.75], [0.25, 0.75], [0.25, 0.75]]) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 where the y_true values are characters. + y_true = ["b", "c", "a", "d"] + y_pred = np.array( + [ + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 where the y_true values are characters and "a" + # has a relatively higher proportion. + y_true = ["b", "c", "a", "a"] + y_pred = np.array( + [ + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 + + # check if good predictions for character values gives a relatively + # higher d2 score. + y_true = ["b", "c", "a", "a"] + y_pred = np.array( + [ + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + + # check if poor predictions for character values gives a relatively + # low value for the d2 score. + y_true = ["b", "c", "a", "a"] + y_pred = np.array( + [ + [0.6, 0.3, 0.1], + [0.5, 0.2, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + + # check error if the number of classes are not equal. + y_true = [0, 1, 2] + y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] + err = "contain different number of classes" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error if the number of classes in labels do not match the number + # of classes in y_pred. 
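+    # (this check happens inside log_loss, to which d2_log_loss_score
+    # forwards the labels argument)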
+ y_true = ["a", "b", "c"] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + labels = [0, 1, 2] + err = "number of classes in labels is different" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) + + # check error if y_true and y_pred do not have equal lengths + y_true = [0, 1, 2] + y_pred = [[0.5, 0.5, 0.5], [0.6, 0.3, 0.1]] + err = "inconsistent numbers of samples" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check warning for samples < 2 + y_true = [1] + y_pred = [[0.5, 0.5]] + err = "score is not well-defined" + with pytest.warns(UndefinedMetricWarning, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label + y_true = [1, 1, 1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 5]] + err = "y_true contains only one label" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label and labels also has + # only 1 label + y_true = [1, 1, 1] + labels = [1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 5]] + err = "The labels array needs to contain at least two" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) From d3dc7a4eade21bbc156cec9dc0edad533a254346 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:43:19 +0500 Subject: [PATCH 03/12] Fix docstring --- sklearn/metrics/_classification.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 22503d75ee6a9..29227ac97e26c 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3317,7 +3317,6 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= This metric is not well-defined for single samples and will return a NaN value if n_samples is less than two. 
- """ y_pred = check_array( y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] @@ -3328,8 +3327,8 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= warnings.warn(msg, UndefinedMetricWarning) return float("nan") - # log likelihood of the fitted model - numerator = -log_loss( + # log loss of the fitted model + numerator = log_loss( y_true=y_true, y_pred=y_pred, eps=eps, @@ -3343,8 +3342,8 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= y_prob = counts / len(y_true) y_pred_null = np.tile(y_prob, (len(y_true), 1)) - # log likelihood of the null model - denominator = -log_loss( + # log loss of the null model + denominator = log_loss( y_true=y_true, y_pred=y_pred_null, eps=eps, From d9126803130ce2d00fff1fd0a211161f9862a845 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:45:45 +0500 Subject: [PATCH 04/12] Minor adjustment --- sklearn/metrics/_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 29227ac97e26c..00a1952d56cb0 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3254,7 +3254,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): }, prefer_skip_nested_validation=True, ) -def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels=None): +def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels=None): """ :math:`D^2` regression score function, fraction of Tweedie deviance explained. @@ -3279,6 +3279,9 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= ordered alphabetically, as done by :class:`~sklearn.preprocessing.LabelBinarizer`. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + eps : float or "auto", default="auto" Log loss is undefined for p=0 or p=1, so probabilities are clipped to `max(eps, min(1 - eps, p))`. The default will depend on the @@ -3293,9 +3296,6 @@ def d2_log_loss_score(y_true, y_pred, *, eps="auto", sample_weight=None, labels= .. deprecated:: 1.3 `eps` is deprecated in 1.3 and will be removed in 1.5. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. - labels : array-like, default=None If not provided, labels will be inferred from y_true. If ``labels`` is ``None`` and ``y_pred`` has shape (n_samples,) the labels are From fb4ad0cd7c0bd1f2f554a0d88166f9eb2d8ed5b1 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Fri, 9 Feb 2024 18:59:58 +0500 Subject: [PATCH 05/12] Fix the title of the doc --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 00a1952d56cb0..e36f078b6373e 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3256,7 +3256,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): ) def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels=None): """ - :math:`D^2` regression score function, fraction of Tweedie deviance explained. + :math:`D^2` score function, fraction of log loss explained. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
A model that always uses the empirical mean of `y_true` as From 78c00bc356e51833b1bb731e7f9393ec40e7a43a Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 12 Feb 2024 14:43:07 +0500 Subject: [PATCH 06/12] Add the documentation for d2 log loss --- doc/modules/model_evaluation.rst | 46 ++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index b60407bf1a12a..5d849f5332e81 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2761,6 +2761,52 @@ Here are some usage examples of the :func:`d2_absolute_error_score` function:: >>> d2_absolute_error_score(y_true, y_pred) 0.0 +D² log loss score +^^^^^^^^^^^^^^^^^^^^^^^ +The :func:`d2_log_loss_score` function implements the special case +of D² with the log loss, see :ref:`log_loss`, i.e.: + +.. math:: + + \text{dev}(y, \hat{y}) = \text{log_loss}(y, \hat{y}). + +Here are some usage examples of the :func:`d2_log_loss_score` function:: + + >>> from sklearn.metrics import d2_log_loss_score + >>> y_true = [1, 1, 2, 3] + >>> y_pred = [ + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [ + ... [0.98, 0.01, 0.01], + ... [0.01, 0.98, 0.01], + ... [0.01, 0.01, 0.98], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + 0.981... + >>> y_true = [1, 2, 3] + >>> y_pred = [ + ... [0.1, 0.6, 0.3], + ... [0.1, 0.6, 0.3], + ... [0.4, 0.5, 0.1], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + -0.552... + >>> y_true = [0, 0, 1, 1] + >>> y_pred = [0.5, 0.5, 0.5, 0.5] + >>> d2_log_loss_score(y_true, y_pred) + 0.0 + >>> y_true = [0, 0, 1, 1] + >>> y_pred = [0.8, 0.8, 0.2, 0.2] + >>> d2_log_loss_score(y_true, y_pred) + -1.321... + .. _visualization_regression_evaluation: Visual evaluation of regression models From 7d68e344c7b4b880859620b469b362249a65b9ae Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Sat, 30 Mar 2024 12:58:08 +0500 Subject: [PATCH 07/12] Address suggestions provided on PR --- sklearn/metrics/_classification.py | 34 ++++---------- sklearn/metrics/tests/test_classification.py | 48 ++++++++++++++++++++ 2 files changed, 58 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 772fb2534b47e..2b3215f3bee5c 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3297,7 +3297,7 @@ def brier_score_loss( }, prefer_skip_nested_validation=True, ) -def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels=None): +def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): """ :math:`D^2` score function, fraction of log loss explained. @@ -3307,12 +3307,10 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels= Read more in the :ref:`User Guide `. - .. versionadded:: 1.5 - Parameters ---------- y_true : array-like or label indicator matrix - Ground truth (correct) labels for n_samples samples. + The actuals labels for the n_samples samples. y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) Predicted probabilities, as returned by a classifier's @@ -3325,27 +3323,11 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels= sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
- eps : float or "auto", default="auto" - Log loss is undefined for p=0 or p=1, so probabilities are - clipped to `max(eps, min(1 - eps, p))`. The default will depend on the - data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`. - - .. versionadded:: 1.2 - - .. versionchanged:: 1.2 - The default value changed from `1e-15` to `"auto"` that is - equivalent to `np.finfo(y_pred.dtype).eps`. - - .. deprecated:: 1.3 - `eps` is deprecated in 1.3 and will be removed in 1.5. - labels : array-like, default=None If not provided, labels will be inferred from y_true. If ``labels`` is ``None`` and ``y_pred`` has shape (n_samples,) the labels are assumed to be binary and are inferred from ``y_true``. - .. versionadded:: 0.18 - Returns ------- z : float or ndarray of floats @@ -3374,22 +3356,26 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, eps="auto", labels= numerator = log_loss( y_true=y_true, y_pred=y_pred, - eps=eps, normalize=False, sample_weight=sample_weight, labels=labels, ) # Proportion of labels in the dataset - y_values, counts = np.unique(y_true, return_counts=True) - y_prob = counts / len(y_true) + if sample_weight is not None: + weights = np.asarray(sample_weight) + else: + weights = np.ones(shape=len(y_true), dtype=np.int64) + + _, y_value_indices = np.unique(y_true, return_inverse=True) + counts = np.bincount(y_value_indices, weights=weights) + y_prob = counts / weights.sum() y_pred_null = np.tile(y_prob, (len(y_true), 1)) # log loss of the null model denominator = log_loss( y_true=y_true, y_pred=y_pred_null, - eps=eps, normalize=False, sample_weight=sample_weight, labels=labels, diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index f346314616a42..b225826e90cb9 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -3100,3 +3100,51 @@ def test_d2_log_loss_score(): err = "The labels array needs to contain at least two" with pytest.raises(ValueError, match=err): d2_log_loss_score(y_true, y_pred, labels=labels) + + # Some tests with sample weights + + # check if poor predictions for character values gives a relatively + # low value for the d2 score when sample weights are also given. + y_true = ["h", "d", "c", "c"] + sample_weight = [1.0, 0.5, 0.5, 1.0] + y_pred = np.array( + [ + [0.6, 0.3, 0.1], + [0.5, 0.2, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert d2_score < 0 + + # check if good predictions for character values gives a relatively + # higher d2 score when sample weights are also given. + y_true = ["orange", "strawberry", "apple", "apple"] + sample_weight = [3, 2, 1, 4] + y_pred = np.array( + [ + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert 0.5 < d2_score < 1.0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 where the y_true values are characters and "okay" + # has a relatively higher proportion, when sample weights are also given. 
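+    # (the labels used below are "low"/"neutral"/"high"; with equal sample
+    # weights the weighted class proportions, and hence the null model, are
+    # the same as in the unweighted case)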
+    y_true = ["low", "neutral", "high", "high"]
+    sample_weight = [2, 2, 2, 2]
+    y_pred = np.array(
+        [
+            [0.5, 0.25, 0.25],
+            [0.5, 0.25, 0.25],
+            [0.5, 0.25, 0.25],
+            [0.5, 0.25, 0.25],
+        ]
+    )
+    d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight)
+    assert d2_score == 0
\ No newline at end of file

From 04f7a38e5394b7716c6582694aa4bb5c6c5c2505 Mon Sep 17 00:00:00 2001
From: Omar Salman
Date: Sat, 30 Mar 2024 13:02:36 +0500
Subject: [PATCH 08/12] Fix linting

---
 sklearn/metrics/tests/test_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index b225826e90cb9..591aa6d8703ec 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -3147,4 +3147,4 @@ def test_d2_log_loss_score():
         ]
     )
     d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight)
-    assert d2_score == 0
\ No newline at end of file
+    assert d2_score == 0

From e1f6ae5d9cf75852278313793ea7a714e4d83fa2 Mon Sep 17 00:00:00 2001
From: Omar Salman
Date: Fri, 12 Apr 2024 14:37:57 +0500
Subject: [PATCH 09/12] Address PR suggestions

---
 sklearn/metrics/_classification.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 67dc4d3325354..479ab5d4231e2 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -3290,7 +3290,6 @@ def brier_score_loss(
     {
         "y_true": ["array-like"],
         "y_pred": ["array-like"],
-        "eps": [StrOptions({"auto"}), Interval(Real, 0, 1, closed="both")],
         "sample_weight": ["array-like", None],
         "labels": ["array-like", None],
     },
@@ -3339,12 +3338,10 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None):
     Like R^2, D^2 score may be negative (it need not actually be the square of
     a quantity D).

-    This metric is not well-defined for single samples and will return a NaN
+    This metric is not well-defined for a single sample and will return a NaN
     value if n_samples is less than two.
     """
-    y_pred = check_array(
-        y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
-    )
+    y_pred = check_array(y_pred, ensure_2d=False, dtype="numeric")
     check_consistent_length(y_pred, y_true, sample_weight)
     if _num_samples(y_pred) < 2:
         msg = "D^2 score is not well-defined with less than two samples."

From bfc32ebad404b4f0a9f34df6723af02fda6ae7de Mon Sep 17 00:00:00 2001
From: Omar Salman
Date: Mon, 22 Apr 2024 14:55:30 +0500
Subject: [PATCH 10/12] Address PR suggestions: improve organization of tests

---
 doc/modules/model_evaluation.rst             |   8 --
 doc/whats_new/v1.5.rst                       |   2 +-
 sklearn/metrics/_classification.py           |   2 +-
 sklearn/metrics/tests/test_classification.py | 127 ++++++++-----------
 4 files changed, 53 insertions(+), 86 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 5e37c68dfc2cf..c56b1bc01170b 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -2865,14 +2865,6 @@ Here are some usage examples of the :func:`d2_log_loss_score` function::
     ...     ]
     >>> d2_log_loss_score(y_true, y_pred)
     -0.552...
-    >>> y_true = [0, 0, 1, 1]
-    >>> y_pred = [0.5, 0.5, 0.5, 0.5]
-    >>> d2_log_loss_score(y_true, y_pred)
-    0.0
-    >>> y_true = [0, 0, 1, 1]
-    >>> y_pred = [0.8, 0.8, 0.2, 0.2]
-    >>> d2_log_loss_score(y_true, y_pred)
-    -1.321...

|details-end| diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index a7edeec12d3e8..cbd49e98adaae 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -354,7 +354,7 @@ Changelog is deprecated and will raise an error in v1.7. :pr:`18555` by :user:`Kaushik Amar Das `. -- |Feature| :func:`metrics.d2_log_loss_score` has been added and this +- |Feature| :func:`metrics.d2_log_loss_score` has been added which calculates the D^2 score for the log loss. :pr:`28351` by :user:`Omar Salman `. diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index ac7cf6cd01371..883111b09785f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -3328,7 +3328,7 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): Returns ------- - z : float or ndarray of floats + d2 : float or ndarray of floats The D^2 score. Notes diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index adef0b4eeb845..1f8647c5419f8 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2908,10 +2908,7 @@ def test_brier_score_loss_deprecation_warning(): def test_d2_log_loss_score(): - # compare the d2 score value computed using the bernoulli log pmf - # with the d2 score computed using the function. The values in y_true - # are defined such that "no" corresponds to 0 and "yes" to 1. - y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( [ [0.5, 0.5], @@ -2933,10 +2930,8 @@ def test_d2_log_loss_score(): ] ) d2_score = d2_log_loss_score(y_true, y_pred) - log_likelihood = np.mean(bernoulli.logpmf(np.array(y_true) == "yes", y_pred[:, 1])) - log_likelihood_null = np.mean( - bernoulli.logpmf(np.array(y_true) == "yes", y_pred_null[:, 1]) - ) + log_likelihood = log_loss(y_true=y_true, y_pred=y_pred) + log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null) d2_score_true = 1 - log_likelihood / log_likelihood_null assert_almost_equal(d2_score, d2_score_true) @@ -2993,24 +2988,26 @@ def test_d2_log_loss_score(): d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 + +def test_d2_log_loss_score_with_y_true_as_strings(): + y_true = ["high", "high", "low", "neutral"] + # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 where the y_true values are characters. - y_true = ["b", "c", "a", "d"] + # gives a d2 score of 0. y_pred = np.array( [ - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], + [0.5, 0.25, 0.25], ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 where the y_true values are characters and "a" - # has a relatively higher proportion. - y_true = ["b", "c", "a", "a"] + # gives a d2 score of 0 when sample weights are also given. + sample_weight = [2, 2, 2, 2] y_pred = np.array( [ [0.5, 0.25, 0.25], @@ -3019,37 +3016,63 @@ def test_d2_log_loss_score(): [0.5, 0.25, 0.25], ] ) - d2_score = d2_log_loss_score(y_true, y_pred) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert d2_score == 0 - # check if good predictions for character values gives a relatively - # higher d2 score. 
- y_true = ["b", "c", "a", "a"] + # check if good predictions give a relatively high d2 score. y_pred = np.array( [ + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + + # check if good predictions give a relatively high d2 score + # when sample weights are also given. + sample_weight = [3, 2, 1, 4] + y_pred = np.array( + [ [0.8, 0.1, 0.1], [0.8, 0.1, 0.1], + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], ] ) - d2_score = d2_log_loss_score(y_true, y_pred) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert 0.5 < d2_score < 1.0 - # check if poor predictions for character values gives a relatively - # low value for the d2 score. - y_true = ["b", "c", "a", "a"] + # check if poor predictions give a relatively low d2 score. y_pred = np.array( [ - [0.6, 0.3, 0.1], - [0.5, 0.2, 0.3], - [0.3, 0.4, 0.3], + [0.3, 0.6, 0.1], + [0.3, 0.2, 0.5], + [0.4, 0.3, 0.3], [0.4, 0.5, 0.1], ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 + # check if poor predictions give a relatively low d2 score + # when sample weights are also given. + sample_weight = [1.0, 0.5, 0.5, 1.0] + y_pred = np.array( + [ + [0.3, 0.6, 0.1], + [0.3, 0.2, 0.5], + [0.4, 0.3, 0.3], + [0.4, 0.5, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert d2_score < 0 + + +def test_d2_log_loss_score_errors(): # check error if the number of classes are not equal. y_true = [0, 1, 2] y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] @@ -3095,51 +3118,3 @@ def test_d2_log_loss_score(): err = "The labels array needs to contain at least two" with pytest.raises(ValueError, match=err): d2_log_loss_score(y_true, y_pred, labels=labels) - - # Some tests with sample weights - - # check if poor predictions for character values gives a relatively - # low value for the d2 score when sample weights are also given. - y_true = ["h", "d", "c", "c"] - sample_weight = [1.0, 0.5, 0.5, 1.0] - y_pred = np.array( - [ - [0.6, 0.3, 0.1], - [0.5, 0.2, 0.3], - [0.3, 0.4, 0.3], - [0.4, 0.5, 0.1], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert d2_score < 0 - - # check if good predictions for character values gives a relatively - # higher d2 score when sample weights are also given. - y_true = ["orange", "strawberry", "apple", "apple"] - sample_weight = [3, 2, 1, 4] - y_pred = np.array( - [ - [0.1, 0.8, 0.1], - [0.1, 0.1, 0.8], - [0.8, 0.1, 0.1], - [0.8, 0.1, 0.1], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert 0.5 < d2_score < 1.0 - - # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 where the y_true values are characters and "okay" - # has a relatively higher proportion, when sample weights are also given. 
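+    # (weights rescale each sample's contribution to both the model's and the
+    # null model's log loss, so systematically poor predictions still give a
+    # negative D^2)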
- y_true = ["low", "neutral", "high", "high"] - sample_weight = [2, 2, 2, 2] - y_pred = np.array( - [ - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert d2_score == 0 From c17809e940e406530854b79c3887fb521d5aed23 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 22 Apr 2024 17:37:45 +0500 Subject: [PATCH 11/12] Refine tests --- sklearn/metrics/tests/test_classification.py | 123 ++++++++----------- 1 file changed, 54 insertions(+), 69 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 1f8647c5419f8..ba0e7f9c81306 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2909,6 +2909,7 @@ def test_brier_score_loss_deprecation_warning(): def test_d2_log_loss_score(): y_true = [0, 0, 0, 1, 1, 1] + y_true_string = ["no", "no", "no", "yes", "yes", "yes"] y_pred = np.array( [ [0.5, 0.5], @@ -2929,14 +2930,35 @@ def test_d2_log_loss_score(): [0.5, 0.5], ] ) - d2_score = d2_log_loss_score(y_true, y_pred) - log_likelihood = log_loss(y_true=y_true, y_pred=y_pred) - log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null) + d2_score = d2_log_loss_score(y_true=y_true, y_pred=y_pred) + log_likelihood = log_loss(y_true=y_true, y_pred=y_pred, normalize=False) + log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null, normalize=False) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert d2_score == pytest.approx(d2_score_true) + + # check that using sample weight also gives the correct d2 score + sample_weight = np.array([2, 1, 3, 4, 3, 1]) + y_pred_null[:, 0] = sample_weight[:3].sum() / sample_weight.sum() + y_pred_null[:, 1] = sample_weight[3:].sum() / sample_weight.sum() + d2_score = d2_log_loss_score( + y_true=y_true, y_pred=y_pred, sample_weight=sample_weight + ) + log_likelihood = log_loss( + y_true=y_true, + y_pred=y_pred, + sample_weight=sample_weight, + normalize=False, + ) + log_likelihood_null = log_loss( + y_true=y_true, + y_pred=y_pred_null, + sample_weight=sample_weight, + normalize=False, + ) d2_score_true = 1 - log_likelihood / log_likelihood_null - assert_almost_equal(d2_score, d2_score_true) + assert d2_score == pytest.approx(d2_score_true) # check if good predictions give a relatively higher value for the d2 score - y_true = [0, 0, 0, 1, 1, 1] y_pred = np.array( [ [0.9, 0.1], @@ -2949,9 +2971,11 @@ def test_d2_log_loss_score(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert 0.5 < d2_score < 1.0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) - # check if poor predictions gives a relatively low value for the d2 score. 
- y_true = [0, 0, 0, 1, 1, 1] + # check if poor predictions gives a relatively low value for the d2 score y_pred = np.array( [ [0.5, 0.5], @@ -2964,6 +2988,9 @@ def test_d2_log_loss_score(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) # check if simply using the average of the classes as the predictions # gives a d2 score of 0 @@ -2980,46 +3007,29 @@ def test_d2_log_loss_score(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 when the positive class has a higher proportion. + # gives a d2 score of 0 when the positive class has a higher proportion y_true = [0, 1, 1, 1] + y_true_string = ["no", "yes", "yes", "yes"] y_pred = np.array([[0.25, 0.75], [0.25, 0.75], [0.25, 0.75], [0.25, 0.75]]) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score == 0 - - -def test_d2_log_loss_score_with_y_true_as_strings(): - y_true = ["high", "high", "low", "neutral"] - - # check if simply using the average of the classes as the predictions - # gives a d2 score of 0. - y_pred = np.array( - [ - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - ] - ) - d2_score = d2_log_loss_score(y_true, y_pred) - assert d2_score == 0 - - # check if simply using the average of the classes as the predictions - # gives a d2 score of 0 when sample weights are also given. + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 sample_weight = [2, 2, 2, 2] - y_pred = np.array( - [ - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - [0.5, 0.25, 0.25], - ] + d2_score_with_sample_weight = d2_log_loss_score( + y_true, y_pred, sample_weight=sample_weight ) - d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) - assert d2_score == 0 + assert d2_score_with_sample_weight == 0 + + # check that the d2 scores seem correct when more than 2 + # labels are specified + y_true = ["high", "high", "low", "neutral"] + sample_weight = [1.4, 0.6, 0.8, 0.2] - # check if good predictions give a relatively high d2 score. y_pred = np.array( [ [0.8, 0.1, 0.1], @@ -3030,50 +3040,25 @@ def test_d2_log_loss_score_with_y_true_as_strings(): ) d2_score = d2_log_loss_score(y_true, y_pred) assert 0.5 < d2_score < 1.0 - - # check if good predictions give a relatively high d2 score - # when sample weights are also given. - sample_weight = [3, 2, 1, 4] - y_pred = np.array( - [ - [0.8, 0.1, 0.1], - [0.8, 0.1, 0.1], - [0.1, 0.8, 0.1], - [0.1, 0.1, 0.8], - ] - ) d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert 0.5 < d2_score < 1.0 - # check if poor predictions give a relatively low d2 score. y_pred = np.array( [ - [0.3, 0.6, 0.1], - [0.3, 0.2, 0.5], - [0.4, 0.3, 0.3], - [0.4, 0.5, 0.1], + [0.2, 0.5, 0.3], + [0.1, 0.7, 0.2], + [0.1, 0.1, 0.8], + [0.2, 0.7, 0.1], ] ) d2_score = d2_log_loss_score(y_true, y_pred) assert d2_score < 0 - - # check if poor predictions give a relatively low d2 score - # when sample weights are also given. 
- sample_weight = [1.0, 0.5, 0.5, 1.0] - y_pred = np.array( - [ - [0.3, 0.6, 0.1], - [0.3, 0.2, 0.5], - [0.4, 0.3, 0.3], - [0.4, 0.5, 0.1], - ] - ) d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) assert d2_score < 0 -def test_d2_log_loss_score_errors(): - # check error if the number of classes are not equal. +def test_d2_log_loss_score_raises(): + """Test that d2_log_loss raises error on invalid input.""" y_true = [0, 1, 2] y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] err = "contain different number of classes" From 24cf7b0a9d9ecb4118275599ae8cceaa5a67be92 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Thu, 2 May 2024 18:14:00 +0500 Subject: [PATCH 12/12] Updates: suggestions provided in review --- doc/modules/model_evaluation.rst | 3 +++ sklearn/metrics/_classification.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c56b1bc01170b..7caacd697ea1c 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2837,6 +2837,9 @@ of D² with the log loss, see :ref:`log_loss`, i.e.: \text{dev}(y, \hat{y}) = \text{log_loss}(y, \hat{y}). +The :math:`y_{\text{null}}` for the :func:`log_loss` is the per-class +proportion. + Here are some usage examples of the :func:`d2_log_loss_score` function:: >>> from sklearn.metrics import d2_log_loss_score diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 883111b09785f..9560e53b3ffa4 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -53,7 +53,11 @@ from ..utils.extmath import _nanaverage from ..utils.multiclass import type_of_target, unique_labels from ..utils.sparsefuncs import count_nonzero -from ..utils.validation import _check_pos_label_consistency, _num_samples +from ..utils.validation import ( + _check_pos_label_consistency, + _check_sample_weight, + _num_samples, +) def _check_zero_division(zero_division): @@ -3305,12 +3309,14 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): Read more in the :ref:`User Guide `. + .. versionadded:: 1.5 + Parameters ---------- y_true : array-like or label indicator matrix The actuals labels for the n_samples samples. - y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + y_pred : array-like of shape (n_samples, n_classes) or (n_samples,) Predicted probabilities, as returned by a classifier's predict_proba method. If ``y_pred.shape = (n_samples,)`` the probabilities provided are assumed to be that of the @@ -3358,10 +3364,7 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): ) # Proportion of labels in the dataset - if sample_weight is not None: - weights = np.asarray(sample_weight) - else: - weights = np.ones(shape=len(y_true), dtype=np.int64) + weights = _check_sample_weight(sample_weight, y_true) _, y_value_indices = np.unique(y_true, return_inverse=True) counts = np.bincount(y_value_indices, weights=weights)
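After [PATCH 12/12] the series settles on the following computation: the score is one minus the ratio of the model's unnormalized log loss to the unnormalized log loss of a null model that always predicts the (sample-weight weighted) class proportions of `y_true`; in effect, McFadden's pseudo-R^2 evaluated on predicted probabilities. The sketch below re-derives the number outside the library. It is an editor's illustration, and `d2_log_loss_sketch` is an invented name, not a scikit-learn API.

import numpy as np
from sklearn.metrics import log_loss

def d2_log_loss_sketch(y_true, y_pred, sample_weight=None):
    # Illustrative re-implementation of the d2_log_loss_score added above.
    y_true = np.asarray(y_true)
    weights = (
        np.ones(len(y_true)) if sample_weight is None else np.asarray(sample_weight)
    )

    # Log loss of the model under evaluation (summed, not averaged).
    numerator = log_loss(y_true, y_pred, normalize=False, sample_weight=weights)

    # Null model: every sample gets the weighted class proportions of y_true.
    _, class_idx = np.unique(y_true, return_inverse=True)
    proportions = np.bincount(class_idx, weights=weights) / weights.sum()
    y_pred_null = np.tile(proportions, (len(y_true), 1))
    denominator = log_loss(
        y_true, y_pred_null, normalize=False, sample_weight=weights
    )

    return 1 - numerator / denominator

y_true = [0, 0, 0, 1, 1, 1]
y_pred = [[0.9, 0.1], [0.8, 0.2], [0.9, 0.1], [0.1, 0.9], [0.2, 0.8], [0.1, 0.9]]
print(d2_log_loss_sketch(y_true, y_pred))  # about 0.79, inside the 0.5 < d2 < 1.0 band the tests assert

With scikit-learn 1.5 or later, the same inputs passed to `metrics.d2_log_loss_score` should reproduce this value, since both paths go through the same `log_loss` calls.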
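One practical consequence, again as an illustration rather than part of the PR: `DummyClassifier(strategy="prior")` predicts exactly the class proportions that the denominator uses, so scored on its own training data it lands at approximately 0, while any classifier that has learned something from the features scores above it.

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import d2_log_loss_score  # requires scikit-learn >= 1.5

X, y = make_classification(random_state=0)

# The prior dummy reproduces the metric's null model, so D^2 is ~0 here.
dummy = DummyClassifier(strategy="prior").fit(X, y)
print(d2_log_loss_score(y, dummy.predict_proba(X)))

# An informative model should land well above the null baseline.
model = LogisticRegression(max_iter=1000).fit(X, y)
print(d2_log_loss_score(y, model.predict_proba(X)))

This mirrors how `r2_score` behaves for regressors: 0 marks the constant baseline, 1 a perfect fit, and negative values a model doing worse than the baseline.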