From f6307183822fa5ae4dd9565ec928ef9d6acf810e Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 25 Oct 2020 20:47:08 -0400 Subject: [PATCH 01/24] add multi-class support --- sklearn/metrics/_classification.py | 98 ++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 25 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index e845e02808872..55a0e9a003d22 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2402,7 +2402,8 @@ def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): @_deprecate_positional_args -def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): +def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, + labels=None): """Compute the Brier score loss. The smaller the Brier score loss, the better, hence the naming with "loss". @@ -2430,8 +2431,13 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): y_true : array of shape (n_samples,) True targets. - y_prob : array of shape (n_samples,) - Probabilities of the positive class. + y_prob : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`preprocessing.LabelBinarizer`, unless pos_label, or labels is specified. sample_weight : array-like of shape (n_samples,), default=None Sample weights. @@ -2439,7 +2445,13 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): pos_label : int or str, default=None Label of the positive class. Defaults to the greater label unless y_true is all 0 or all -1 - in which case pos_label defaults to 1. + in which case pos_label defaults to 1. Ignored if more than 2 distinct values in + y_true or labels argument is passed. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_prob`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. Returns ------- @@ -2467,31 +2479,67 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): .. [1] `Wikipedia entry for the Brier score `_. """ - y_true = column_or_1d(y_true) + y_true = check_array(y_prob, ensure_2d=False) y_prob = column_or_1d(y_prob) - assert_all_finite(y_true) - assert_all_finite(y_prob) - check_consistent_length(y_true, y_prob, sample_weight) - - labels = np.unique(y_true) - if len(labels) > 2: - raise ValueError("Only binary classification is supported. " - "Labels in y_true: %s." % labels) + check_consistent_length(y_prob, y_true, sample_weight) + if y_prob.max() > 1: raise ValueError("y_prob contains values greater than 1.") if y_prob.min() < 0: raise ValueError("y_prob contains values less than 0.") - # if pos_label=None, when y_true is in {-1, 1} or {0, 1}, - # pos_label is set to 1 (consistent with precision_recall_curve/roc_curve), - # otherwise pos_label is set to the greater label - # (different from precision_recall_curve/roc_curve, - # the purpose is to keep backward compatibility). 
- if pos_label is None: - if (np.array_equal(labels, [0]) or - np.array_equal(labels, [-1])): - pos_label = 1 + lb = LabelBinarizer() + if labels is not None: + lb = lb.fit(labels) + elif len(np.unique(y_true)) > 2: + lb = lb.fit(y_true) + else: + # if pos_label=None, when y_true is in {-1, 1} or {0, 1}, + # pos_label is set to 1 (consistent with precision_recall_curve/roc_curve), + # otherwise pos_label is set to the greater label + # (different from precision_recall_curve/roc_curve, + # the purpose is to keep backward compatibility). + if pos_label is None: + if np.all(y_true == 0) or np.all(y_true == -1): + pos_label = 1 + else: + pos_label = y_true.max() + y_true = np.array(y_true == pos_label, int) + lb = lb.fit([0, 1]) # fit on [0, 1] because y_true can be all 0s or all 1s, but we want to assume binary + + transformed_labels = lb.transform(y_true) + if transformed_labels.shape[1] == 1: + transformed_labels = np.append(1-transformed_labels, transformed_labels, axis=1) + + # If y_prob is of single dimension, assume y_true to be binary + if y_prob.ndim == 1: + y_prob = y_prob[:, np.newaxis] + if y_prob.shape[1] == 1: + y_prob = np.append(1 - y_prob, y_prob, axis=1) + + # Check if dimensions are consistent. + transformed_labels = check_array(transformed_labels) + if len(lb.classes_) != y_prob.shape[1]: + if labels is None: + raise ValueError("y_true and y_prob contain different number of " + "classes {0}, {1}. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: {2}".format(transformed_labels.shape[1], + y_prob.shape[1], + lb.classes_)) else: - pos_label = y_true.max() - y_true = np.array(y_true == pos_label, int) - return np.average((y_true - y_prob) ** 2, weights=sample_weight) + raise ValueError('The number of classes in labels is different ' + 'from that in y_prob. Classes found in ' + 'labels: {0}'.format(lb.classes_)) + + # calculate + brier_loss = np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), weights=sample_weights) + + # Original definition of Brier Score sums loss over all samples over all classes. sklearn historically uses + # the alternate version for the binary case, where it sums only over positive class. Maintain that behaviour + # for backwards compatibility + if len(lb.classes_) == 2: + brier_loss /= 2 + + return brier_loss From e08d4f4b33866988a50a9eb583927e529d6696c3 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 25 Oct 2020 20:48:32 -0400 Subject: [PATCH 02/24] fix swapped y_true y_prob --- sklearn/metrics/_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 55a0e9a003d22..0ef4d3b245fbe 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2479,8 +2479,8 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, .. [1] `Wikipedia entry for the Brier score `_. 
""" - y_true = check_array(y_prob, ensure_2d=False) - y_prob = column_or_1d(y_prob) + y_true = column_or_1d(y_true) + y_prob = check_array(y_prob, ensure_2d=False) check_consistent_length(y_prob, y_true, sample_weight) if y_prob.max() > 1: From eff8854ac969eeb7088b90a57f122ff4537b8ffd Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 25 Oct 2020 20:51:28 -0400 Subject: [PATCH 03/24] fix docstring --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 0ef4d3b245fbe..fbb786a8def79 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2437,7 +2437,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, the probabilities provided are assumed to be that of the positive class. The labels in ``y_pred`` are assumed to be ordered alphabetically, as done by - :class:`preprocessing.LabelBinarizer`, unless pos_label, or labels is specified. + :class:`preprocessing.LabelBinarizer`, unless ``pos_label`` or ``labels`` is specified. sample_weight : array-like of shape (n_samples,), default=None Sample weights. From d86439514a061a4cb984e86205b5e2131e76a5db Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 25 Oct 2020 21:14:10 -0400 Subject: [PATCH 04/24] fix docstring --- sklearn/metrics/_classification.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index fbb786a8def79..167386891b6e2 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2409,18 +2409,21 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, The smaller the Brier score loss, the better, hence the naming with "loss". The Brier score measures the mean squared difference between the predicted probability and the actual outcome. The Brier score always - takes on a value between zero and one, since this is the largest + takes on a value between zero and two, since this is the largest possible difference between a predicted probability (which must be between zero and one) and the actual outcome (which can take on values - of only 0 and 1). It can be decomposed is the sum of refinement loss and - calibration loss. + of only 0 and 1), when summed over all the distinct classes. NOTE: in the binary case, scikit-learn uses an + alternate version of the brier score that is exactly half of the original definition and thus has a range + between [0, 1]. + + It can be decomposed is the sum of refinement loss and calibration loss. The Brier score is appropriate for binary and categorical outcomes that can be structured as true or false, but is inappropriate for ordinal variables which can take on three or more values (this is because the Brier score assumes that all possible outcomes are equivalently "distant" from one another). Which label is considered to be the positive - label is controlled via the parameter `pos_label`, which defaults to + label in the binary case is controlled via the parameter `pos_label`, which defaults to the greater label unless `y_true` is all 0 or all -1, in which case `pos_label` defaults to 1. 
From 32ab60a5e13d6dbbe9d27dd28bbb2152db9e9ed4 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 25 Oct 2020 21:17:59 -0400 Subject: [PATCH 05/24] fix variable name spelling --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 167386891b6e2..7ff4f3a4dd81c 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2537,7 +2537,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, 'labels: {0}'.format(lb.classes_)) # calculate - brier_loss = np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), weights=sample_weights) + brier_loss = np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), weights=sample_weight) # Original definition of Brier Score sums loss over all samples over all classes. sklearn historically uses # the alternate version for the binary case, where it sums only over positive class. Maintain that behaviour From 6e73c0d8a2d3b01a2d4a3694ffbe1db91a25a151 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 25 Oct 2020 21:39:41 -0400 Subject: [PATCH 06/24] add tests --- sklearn/metrics/tests/test_classification.py | 31 ++++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 0fa97eea609f2..8db32ea89b41d 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2317,14 +2317,28 @@ def test_brier_score_loss(): with pytest.raises(ValueError): brier_score_loss(y_true, y_pred - 1.) - # ensure to raise an error for multiclass y_true + # ensure to raise an error for wrong number of classes y_true = np.array([0, 1, 2, 0]) y_pred = np.array([0.8, 0.6, 0.4, 0.2]) - error_message = ("Only binary classification is supported. Labels " - "in y_true: {}".format(np.array([0, 1, 2]))) + error_message = ("y_true and y_prob contain different number of " + "classes 3, 2. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: [0 1 2]") with pytest.raises(ValueError, match=re.escape(error_message)): brier_score_loss(y_true, y_pred) + y_true = ['eggs', 'spam', 'ham'] + y_pred = [[1, 0, 0], + [0, 1, 0], + [0, 1, 0]] + labels = ['eggs', 'spam', 'ham', 'yams'] + error_message = ("The number of classes in labels is different " + "from that in y_prob. 
Classes found in " + "labels: ['eggs' 'ham' 'spam' 'yams']") + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_pred, labels=labels) + # calculate correctly when there's only one class in y_true assert_almost_equal(brier_score_loss([-1], [0.4]), 0.16) assert_almost_equal(brier_score_loss([0], [0.4]), 0.16) @@ -2334,6 +2348,17 @@ def test_brier_score_loss(): assert_almost_equal( brier_score_loss(['foo'], [0.4], pos_label='foo'), 0.36) + # calculate multiclass + assert_almost_equal(brier_score_loss(['eggs', 'spam', 'ham'], [[1, 0, 0], + [0, 1, 0], + [0, 1, 0]]), + 2/3) + assert_almost_equal(brier_score_loss(['eggs', 'spam', 'ham'], [[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0]], + labels=['eggs', 'spam', 'ham', 'yams']), + 2/3) + def test_balanced_accuracy_score_unseen(): assert_warns_message(UserWarning, 'y_pred contains classes not in y_true', From 9cd4247d1dd68ae2b479ccbb0a8c0d140f046afb Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 25 Oct 2020 21:50:17 -0400 Subject: [PATCH 07/24] import re --- sklearn/metrics/tests/test_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a4dccb16ca9f1..7bf42dc10c184 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -4,10 +4,10 @@ from itertools import chain from itertools import permutations import warnings - import numpy as np from scipy import linalg import pytest +import re from sklearn import datasets from sklearn import svm From 136994532e4182bca533534d6b0cd33210448dbe Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 28 Oct 2020 10:40:21 -0400 Subject: [PATCH 08/24] fix docstring --- sklearn/metrics/_classification.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 81c921687aef9..afadce37d6ca3 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2448,17 +2448,6 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, Sample weights. pos_label : int or str, default=None -<<<<<<< HEAD - Label of the positive class. - Defaults to the greater label unless y_true is all 0 or all -1 - in which case pos_label defaults to 1. Ignored if more than 2 distinct values in - y_true or labels argument is passed. - - labels : array-like, default=None - If not provided, labels will be inferred from y_true. If ``labels`` - is ``None`` and ``y_prob`` has shape (n_samples,) the labels are - assumed to be binary and are inferred from ``y_true``. -======= Label of the positive class. `pos_label` will be infered in the following manner: @@ -2467,7 +2456,11 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, `pos_label` should be explicitely specified; * otherwise, `pos_label` defaults to the greater label, i.e. `np.unique(y_true)[-1]`. ->>>>>>> upstream/master + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_prob`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. 
Returns ------- From a183d06720815aafc34db2b9ae91b96564a6bcc8 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 28 Oct 2020 10:49:32 -0400 Subject: [PATCH 09/24] fix linting --- sklearn/metrics/_classification.py | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index afadce37d6ca3..14ff6f8913d84 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2414,8 +2414,9 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, takes on a value between zero and two, since this is the largest possible difference between a predicted probability (which must be between zero and one) and the actual outcome (which can take on values - of only 0 and 1), when summed over all the distinct classes. NOTE: in the binary case, scikit-learn uses an - alternate version of the brier score that is exactly half of the original definition and thus has a range + of only 0 and 1), when summed over all the distinct classes. NOTE: in the + binary case, scikit-learn uses an alternate version of the brier score that + is exactly half of the original definition and thus has a range between [0, 1]. It can be decomposed is the sum of refinement loss and calibration loss. @@ -2425,9 +2426,9 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, variables which can take on three or more values (this is because the Brier score assumes that all possible outcomes are equivalently "distant" from one another). Which label is considered to be the positive - label in the binary case is controlled via the parameter `pos_label`, which defaults to - the greater label unless `y_true` is all 0 or all -1, in which case - `pos_label` defaults to 1. + label in the binary case is controlled via the parameter `pos_label`, which + defaults to the greater label unless `y_true` is all 0 or all -1, in which + case `pos_label` defaults to 1. Read more in the :ref:`User Guide `. @@ -2436,13 +2437,14 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, y_true : array of shape (n_samples,) True targets. - y_prob : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,) Predicted probabilities, as returned by a classifier's predict_proba method. If ``y_pred.shape = (n_samples,)`` the probabilities provided are assumed to be that of the positive class. The labels in ``y_pred`` are assumed to be ordered alphabetically, as done by - :class:`preprocessing.LabelBinarizer`, unless ``pos_label`` or ``labels`` is specified. + :class:`preprocessing.LabelBinarizer`, unless ``pos_label`` or + ``labels`` is specified. sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
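As an aside, the 1-D input convention documented in the hunk above mirrors what the function body does internally: a vector of positive-class probabilities is expanded into two columns ordered like the sorted class labels. A minimal sketch of that expansion, assuming plain NumPy:

    import numpy as np

    p = np.array([0.9, 0.2, 0.7])    # P(positive class), shape (n_samples,)
    p = p[:, np.newaxis]             # -> shape (n_samples, 1)
    p = np.append(1 - p, p, axis=1)  # -> shape (n_samples, 2); columns follow
                                     #    the lexicographically sorted labels
    print(p)                         # [[0.1 0.9] [0.8 0.2] [0.3 0.7]]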
@@ -2514,11 +2516,14 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, else: raise y_true = np.array(y_true == pos_label, int) - lb = lb.fit([0, 1]) # fit on [0, 1] because y_true can be all 0s or all 1s, but we want to assume binary + # fit on [0, 1] because y_true can be all 0s or all 1s, but we want + # to assume binary + lb = lb.fit([0, 1]) transformed_labels = lb.transform(y_true) if transformed_labels.shape[1] == 1: - transformed_labels = np.append(1-transformed_labels, transformed_labels, axis=1) + transformed_labels = np.append(1-transformed_labels, + transformed_labels, axis=1) # If y_prob is of single dimension, assume y_true to be binary if y_prob.ndim == 1: @@ -2543,10 +2548,12 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, 'labels: {0}'.format(lb.classes_)) # calculate - brier_loss = np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), weights=sample_weight) + brier_loss = np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), + weights=sample_weight) - # Original definition of Brier Score sums loss over all samples over all classes. sklearn historically uses - # the alternate version for the binary case, where it sums only over positive class. Maintain that behaviour + # Original definition of Brier Score sums loss over all samples over all + # classes. sklearn historically uses the alternate version for the binary + # case, where it sums only over positive class. Maintain that behaviour # for backwards compatibility if len(lb.classes_) == 2: brier_loss /= 2 From 08688d3aac3d1b17805619749fdb28bd626070bb Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 28 Oct 2020 10:57:44 -0400 Subject: [PATCH 10/24] fix linting --- sklearn/metrics/tests/test_classification.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 7bf42dc10c184..7c1bfe292476c 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2352,11 +2352,13 @@ def test_brier_score_loss(): [0, 1, 0], [0, 1, 0]]), 2/3) - assert_almost_equal(brier_score_loss(['eggs', 'spam', 'ham'], [[1, 0, 0, 0], - [0, 1, 0, 0], - [0, 1, 0, 0]], - labels=['eggs', 'spam', 'ham', 'yams']), - 2/3) + assert_almost_equal( + brier_score_loss(['eggs', 'spam', 'ham'], + [[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0]], + labels=['eggs', 'spam', 'ham', 'yams']), + 2/3) def test_balanced_accuracy_score_unseen(): From 4f8a5f280c736bd5f196155a4080884193e1b992 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 28 Oct 2020 11:02:38 -0400 Subject: [PATCH 11/24] remove unused import --- sklearn/metrics/_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 14ff6f8913d84..76d63263d6e5d 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -30,7 +30,6 @@ from ..preprocessing import LabelBinarizer from ..preprocessing import LabelEncoder -from ..utils import assert_all_finite from ..utils import check_array from ..utils import check_consistent_length from ..utils import column_or_1d From 7b5143375260de0618ea090de4ebebad50e5c72f Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Sun, 1 Nov 2020 23:04:11 -0500 Subject: [PATCH 12/24] add multiclass_brier_score_loss --- sklearn/metrics/_classification.py | 201 ++++++++++++++++++++--------- 1 file changed, 138 
insertions(+), 63 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 76d63263d6e5d..3c866ad781461 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -30,6 +30,7 @@ from ..preprocessing import LabelBinarizer from ..preprocessing import LabelEncoder +from ..utils import assert_all_finite from ..utils import check_array from ..utils import check_consistent_length from ..utils import column_or_1d @@ -40,8 +41,6 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from ._base import _check_pos_label_consistency - def _check_zero_division(zero_division): if isinstance(zero_division, str) and zero_division == "warn": @@ -2403,31 +2402,32 @@ def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): @_deprecate_positional_args -def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, - labels=None): +def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): """Compute the Brier score loss. The smaller the Brier score loss, the better, hence the naming with "loss". The Brier score measures the mean squared difference between the predicted probability and the actual outcome. The Brier score always - takes on a value between zero and two, since this is the largest + takes on a value between zero and one, since this is the largest possible difference between a predicted probability (which must be between zero and one) and the actual outcome (which can take on values - of only 0 and 1), when summed over all the distinct classes. NOTE: in the - binary case, scikit-learn uses an alternate version of the brier score that - is exactly half of the original definition and thus has a range - between [0, 1]. - - It can be decomposed is the sum of refinement loss and calibration loss. + of only 0 and 1). It can be decomposed is the sum of refinement loss and + calibration loss. The Brier score is appropriate for binary and categorical outcomes that can be structured as true or false, but is inappropriate for ordinal variables which can take on three or more values (this is because the Brier score assumes that all possible outcomes are equivalently "distant" from one another). Which label is considered to be the positive - label in the binary case is controlled via the parameter `pos_label`, which - defaults to the greater label unless `y_true` is all 0 or all -1, in which - case `pos_label` defaults to 1. + label is controlled via the parameter `pos_label`, which defaults to + the greater label unless `y_true` is all 0 or all -1, in which case + `pos_label` defaults to 1. + + A more generalized form of Brier score is implemented in + :func:`multiclass_brier_score_loss` that is applicable to the multi-class + case as well. When used for the binary case, `multiclass_brier_score_loss` + returns Brier score that is exactly twice of the value returned by this + function. Read more in the :ref:`User Guide `. @@ -2436,32 +2436,16 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, y_true : array of shape (n_samples,) True targets. - y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,) - Predicted probabilities, as returned by a classifier's - predict_proba method. If ``y_pred.shape = (n_samples,)`` - the probabilities provided are assumed to be that of the - positive class. 
The labels in ``y_pred`` are assumed to be - ordered alphabetically, as done by - :class:`preprocessing.LabelBinarizer`, unless ``pos_label`` or - ``labels`` is specified. + y_prob : array of shape (n_samples,) + Probabilities of the positive class. sample_weight : array-like of shape (n_samples,), default=None Sample weights. pos_label : int or str, default=None - Label of the positive class. `pos_label` will be infered in the - following manner: - - * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; - * else if `y_true` contains string, an error will be raised and - `pos_label` should be explicitely specified; - * otherwise, `pos_label` defaults to the greater label, - i.e. `np.unique(y_true)[-1]`. - - labels : array-like, default=None - If not provided, labels will be inferred from y_true. If ``labels`` - is ``None`` and ``y_prob`` has shape (n_samples,) the labels are - assumed to be binary and are inferred from ``y_true``. + Label of the positive class. + Defaults to the greater label unless y_true is all 0 or all -1 + in which case pos_label defaults to 1. Returns ------- @@ -2484,6 +2468,112 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, >>> brier_score_loss(y_true, np.array(y_prob) > 0.5) 0.0 + References + ---------- + .. [1] `Wikipedia entry for the Brier score + `_. + """ + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + assert_all_finite(y_true) + assert_all_finite(y_prob) + check_consistent_length(y_true, y_prob, sample_weight) + + labels = np.unique(y_true) + if len(labels) > 2: + raise ValueError("Only binary classification is supported. " + "Labels in y_true: %s. " + "Use multiclass_brier_score_loss instead" % labels) + if y_prob.max() > 1: + raise ValueError("y_prob contains values greater than 1.") + if y_prob.min() < 0: + raise ValueError("y_prob contains values less than 0.") + + # if pos_label=None, when y_true is in {-1, 1} or {0, 1}, + # pos_label is set to 1 (consistent with precision_recall_curve/roc_curve), + # otherwise pos_label is set to the greater label + # (different from precision_recall_curve/roc_curve, + # the purpose is to keep backward compatibility). + if pos_label is None: + if (np.array_equal(labels, [0]) or + np.array_equal(labels, [-1])): + pos_label = 1 + else: + pos_label = y_true.max() + y_true = np.array(y_true == pos_label, int) + return np.average((y_true - y_prob) ** 2, weights=sample_weight) + + +def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None, + labels=None): + r"""Compute the Brier score loss. + + The smaller the Brier score loss, the better, hence the naming with "loss". + The Brier score measures the mean squared difference between the predicted + probability and the actual outcome. + + For :math:`N` samples with :math:`C` different classes, the multi-class + Brier score is defined as: + + .. math:: + \frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(y_{ic} - \bar{y}_{ic})^{2} + + where :math:`y_{ic}` is 1 if observation `i` belongs to class `c`, + otherwise 0 and :math:`\bar{y}_{ic}` is the predicted probability of + observation `i` for class `c`. The probabilities for `c` classes for + observation `i` should sum to 1. + + The Brier score always takes on a value between [0, 2]. For the + binary case however, there is a more common definition of Brier score + implemented in :func:`brier_score_loss` that is exactly half of the value + returned by this function, thereby having a range between [0, 1]. 
+ + It can be decomposed as the sum of refinement loss and calibration loss. + + The Brier score is appropriate for binary and categorical outcomes that + can be structured as true or false, but is inappropriate for ordinal + variables which can take on three or more values (this is because the + Brier score assumes that all possible outcomes are equivalently + "distant" from one another). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array of shape (n_samples,) + True targets. + + y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_prob`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + Returns + ------- + score : float + Brier score loss. + + Examples + -------- + >>> from sklearn.metrics import multiclass_brier_score_loss + >>> multiclass_brier_score_loss([0, 1, 1, 0], + ... [0.1, 0.9, 0.8, 0.3]) + 0.074... + >>> multiclass_brier_score_loss(['eggs', 'ham', 'spam'], [[.8, .1, .1], + ... [.2, .7, .1], + ... [.2, .2, .6]]) + 0.146... References ---------- .. [1] `Wikipedia entry for the Brier score @@ -2501,25 +2591,20 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, lb = LabelBinarizer() if labels is not None: lb = lb.fit(labels) - elif len(np.unique(y_true)) > 2: - lb = lb.fit(y_true) else: - try: - pos_label = _check_pos_label_consistency(pos_label, y_true) - except ValueError: - classes = np.unique(y_true) - if classes.dtype.kind not in ('O', 'U', 'S'): - # for backward compatibility, if classes are not string then - # `pos_label` will correspond to the greater label - pos_label = classes[-1] - else: - raise - y_true = np.array(y_true == pos_label, int) - # fit on [0, 1] because y_true can be all 0s or all 1s, but we want - # to assume binary - lb = lb.fit([0, 1]) + lb = lb.fit(y_true) + + if len(lb.classes_) == 1: + if labels is None: + raise ValueError('y_true contains only one label ({0}). Please ' + 'provide the true labels explicitly through the ' + 'labels argument.'.format(lb.classes_[0])) + else: + raise ValueError('The labels array needs to contain at least two ' + 'labels, got {0}.'.format(lb.classes_)) transformed_labels = lb.transform(y_true) + if transformed_labels.shape[1] == 1: transformed_labels = np.append(1-transformed_labels, transformed_labels, axis=1) @@ -2546,15 +2631,5 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None, 'from that in y_prob. Classes found in ' 'labels: {0}'.format(lb.classes_)) - # calculate - brier_loss = np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), - weights=sample_weight) - - # Original definition of Brier Score sums loss over all samples over all - # classes. sklearn historically uses the alternate version for the binary - # case, where it sums only over positive class. 
Maintain that behaviour - # for backwards compatibility - if len(lb.classes_) == 2: - brier_loss /= 2 - - return brier_loss + return np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), + weights=sample_weight) From d5c90bf2588477c4e90139ef9b7ab8008087635d Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Mon, 2 Nov 2020 13:12:04 -0500 Subject: [PATCH 13/24] add tests --- sklearn/metrics/__init__.py | 2 + sklearn/metrics/_classification.py | 1 + sklearn/metrics/tests/test_classification.py | 91 +++++++++++++++----- 3 files changed, 73 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index ebe9affb5e3e3..14d92c0dfb297 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -35,6 +35,7 @@ from ._classification import zero_one_loss from ._classification import brier_score_loss from ._classification import multilabel_confusion_matrix +from ._classification import multiclass_brier_score_loss from . import cluster from .cluster import adjusted_mutual_info_score @@ -137,6 +138,7 @@ 'median_absolute_error', 'mean_absolute_percentage_error', 'multilabel_confusion_matrix', + 'multiclass_brier_score_loss', 'mutual_info_score', 'ndcg_score', 'normalized_mutual_info_score', diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 3c866ad781461..a7f39c26b446b 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2574,6 +2574,7 @@ def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None, ... [.2, .7, .1], ... [.2, .2, .6]]) 0.146... + References ---------- .. [1] `Wikipedia entry for the Brier score diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 7c1bfe292476c..c4aabf9c1a08b 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -4,10 +4,11 @@ from itertools import chain from itertools import permutations import warnings +import re + import numpy as np from scipy import linalg import pytest -import re from sklearn import datasets from sklearn import svm @@ -44,6 +45,7 @@ from sklearn.metrics import zero_one_loss from sklearn.metrics import brier_score_loss from sklearn.metrics import multilabel_confusion_matrix +from sklearn.metrics import multiclass_brier_score_loss from sklearn.metrics._classification import _check_targets from sklearn.exceptions import UndefinedMetricWarning @@ -2316,6 +2318,45 @@ def test_brier_score_loss(): with pytest.raises(ValueError): brier_score_loss(y_true, y_pred - 1.) + # ensure to raise an error for multiclass y_true + y_true = np.array([0, 1, 2, 0]) + y_pred = np.array([0.8, 0.6, 0.4, 0.2]) + error_message = ("Only binary classification is supported. Labels in " + "y_true: {}. 
Use multiclass_brier_score_loss " + "instead".format(np.array([0, 1, 2]))) + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_pred) + + # calculate correctly when there's only one class in y_true + assert_almost_equal(brier_score_loss([-1], [0.4]), 0.16) + assert_almost_equal(brier_score_loss([0], [0.4]), 0.16) + assert_almost_equal(brier_score_loss([1], [0.4]), 0.36) + assert_almost_equal( + brier_score_loss(['foo'], [0.4], pos_label='bar'), 0.16) + assert_almost_equal( + brier_score_loss(['foo'], [0.4], pos_label='foo'), 0.36) + + +def test_multiclass_brier_score_loss(): + # Check brier_score_loss function + y_true = np.array([0, 1, 1, 0, 1, 1]) + y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95]) + + assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred), + .05083333) + assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred), + brier_score_loss(y_true, y_pred) * 2) + + with pytest.raises(ValueError): + # bad length of y_pred + multiclass_brier_score_loss(y_true, y_pred[1:]) + with pytest.raises(ValueError): + # y_pred has value greater than 1 + multiclass_brier_score_loss(y_true, y_pred + 1.) + with pytest.raises(ValueError): + # y_pred has value less than 1 + multiclass_brier_score_loss(y_true, y_pred - 1.) + # ensure to raise an error for wrong number of classes y_true = np.array([0, 1, 2, 0]) y_pred = np.array([0.8, 0.6, 0.4, 0.2]) @@ -2325,7 +2366,7 @@ def test_brier_score_loss(): "Classes found in " "y_true: [0 1 2]") with pytest.raises(ValueError, match=re.escape(error_message)): - brier_score_loss(y_true, y_pred) + multiclass_brier_score_loss(y_true, y_pred) y_true = ['eggs', 'spam', 'ham'] y_pred = [[1, 0, 0], @@ -2336,30 +2377,38 @@ def test_brier_score_loss(): "from that in y_prob. Classes found in " "labels: ['eggs' 'ham' 'spam' 'yams']") with pytest.raises(ValueError, match=re.escape(error_message)): - brier_score_loss(y_true, y_pred, labels=labels) + multiclass_brier_score_loss(y_true, y_pred, labels=labels) + + # raise error message when there's only one class in y_true + y_true = ['eggs'] + y_pred = [.1] + error_message = ('y_true contains only one label ({0}). 
Please ' + 'provide the true labels explicitly through the ' + 'labels argument.'.format(y_true[0])) + with pytest.raises(ValueError, match=re.escape(error_message)): + multiclass_brier_score_loss(y_true, y_pred) - # calculate correctly when there's only one class in y_true - assert_almost_equal(brier_score_loss([-1], [0.4]), 0.16) - assert_almost_equal(brier_score_loss([0], [0.4]), 0.16) - assert_almost_equal(brier_score_loss([1], [0.4]), 0.36) - assert_almost_equal( - brier_score_loss(['foo'], [0.4], pos_label='bar'), 0.16) - assert_almost_equal( - brier_score_loss(['foo'], [0.4], pos_label='foo'), 0.36) + # error is fixed when labels is specified + assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred, + labels=['eggs', 'ham']), + .02) - # calculate multiclass - assert_almost_equal(brier_score_loss(['eggs', 'spam', 'ham'], [[1, 0, 0], - [0, 1, 0], - [0, 1, 0]]), - 2/3) + # test cases for multi-class assert_almost_equal( - brier_score_loss(['eggs', 'spam', 'ham'], - [[1, 0, 0, 0], - [0, 1, 0, 0], - [0, 1, 0, 0]], - labels=['eggs', 'spam', 'ham', 'yams']), + multiclass_brier_score_loss(['eggs', 'spam', 'ham'], + [[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0]], + labels=['eggs', 'spam', 'ham', 'yams']), 2/3) + assert_almost_equal( + multiclass_brier_score_loss([1, 0, 2], + [[0.2, 0.7, 0.1], + [0.6, 0.2, 0.2], + [0.6, 0.1, 0.3]]), + .41333333) + def test_balanced_accuracy_score_unseen(): assert_warns_message(UserWarning, 'y_pred contains classes not in y_true', From 22438280c486b90aea6fd2bfa54d37c85f4fbcc2 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Mon, 2 Nov 2020 13:15:06 -0500 Subject: [PATCH 14/24] fix docstring --- sklearn/metrics/_classification.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index a7f39c26b446b..584e94bf203c9 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2443,9 +2443,13 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): Sample weights. pos_label : int or str, default=None - Label of the positive class. - Defaults to the greater label unless y_true is all 0 or all -1 - in which case pos_label defaults to 1. + Label of the positive class. `pos_label` will be infered in the + following manner: + * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; + * else if `y_true` contains string, an error will be raised and + `pos_label` should be explicitely specified; + * otherwise, `pos_label` defaults to the greater label, + i.e. `np.unique(y_true)[-1]`. 
Returns ------- From 3e4465f3ac81a8ba4ac4ca9e49a91b775f133cd3 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Mon, 2 Nov 2020 13:33:35 -0500 Subject: [PATCH 15/24] use f-strings --- sklearn/metrics/_classification.py | 64 ++++++++++---------- sklearn/metrics/tests/test_classification.py | 11 ++-- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 584e94bf203c9..7891b91413afd 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -41,6 +41,8 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning +from ._base import _check_pos_label_consistency + def _check_zero_division(zero_division): if isinstance(zero_division, str) and zero_division == "warn": @@ -2483,27 +2485,29 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): assert_all_finite(y_prob) check_consistent_length(y_true, y_prob, sample_weight) - labels = np.unique(y_true) - if len(labels) > 2: - raise ValueError("Only binary classification is supported. " - "Labels in y_true: %s. " - "Use multiclass_brier_score_loss instead" % labels) + y_type = type_of_target(y_true) + if y_type != "binary": + if y_type == 'multiclass': + raise ValueError("Only binary classification is supported. " + "Use multiclass_brier_score_loss instead") + else: + raise ValueError(f"Only binary classification is supported. The " + f"type of the target is {y_type}.") if y_prob.max() > 1: raise ValueError("y_prob contains values greater than 1.") if y_prob.min() < 0: raise ValueError("y_prob contains values less than 0.") - # if pos_label=None, when y_true is in {-1, 1} or {0, 1}, - # pos_label is set to 1 (consistent with precision_recall_curve/roc_curve), - # otherwise pos_label is set to the greater label - # (different from precision_recall_curve/roc_curve, - # the purpose is to keep backward compatibility). - if pos_label is None: - if (np.array_equal(labels, [0]) or - np.array_equal(labels, [-1])): - pos_label = 1 - else: - pos_label = y_true.max() + try: + pos_label = _check_pos_label_consistency(pos_label, y_true) + except ValueError: + classes = np.unique(y_true) + if classes.dtype.kind not in ('O', 'U', 'S'): + # for backward compatibility, if classes are not string then + # `pos_label` will correspond to the greater label + pos_label = classes[-1] + else: + raise y_true = np.array(y_true == pos_label, int) return np.average((y_true - y_prob) ** 2, weights=sample_weight) @@ -2601,12 +2605,12 @@ def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None, if len(lb.classes_) == 1: if labels is None: - raise ValueError('y_true contains only one label ({0}). Please ' - 'provide the true labels explicitly through the ' - 'labels argument.'.format(lb.classes_[0])) + raise ValueError(f'y_true contains only one label: ' + f'{lb.classes_[0]}. 
Please provide the true ' + f'labels explicitly through the labels argument.') else: - raise ValueError('The labels array needs to contain at least two ' - 'labels, got {0}.'.format(lb.classes_)) + raise ValueError(f'The labels array needs to contain at least two ' + f'labels, got {lb.classes_}.') transformed_labels = lb.transform(y_true) @@ -2624,17 +2628,15 @@ def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None, transformed_labels = check_array(transformed_labels) if len(lb.classes_) != y_prob.shape[1]: if labels is None: - raise ValueError("y_true and y_prob contain different number of " - "classes {0}, {1}. Please provide the true " - "labels explicitly through the labels argument. " - "Classes found in " - "y_true: {2}".format(transformed_labels.shape[1], - y_prob.shape[1], - lb.classes_)) + raise ValueError(f"y_true and y_prob contain different number of " + f"classes {transformed_labels.shape[1]}, " + f"{y_prob.shape[1]}. Please provide the true " + f"labels explicitly through the labels argument. " + f"Classes found in y_true: {lb.classes_}") else: - raise ValueError('The number of classes in labels is different ' - 'from that in y_prob. Classes found in ' - 'labels: {0}'.format(lb.classes_)) + raise ValueError(f'The number of classes in labels is different ' + f'from that in y_prob. Classes found in ' + f'labels: {lb.classes_}') return np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), weights=sample_weight) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index c4aabf9c1a08b..e95c904165285 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2321,9 +2321,8 @@ def test_brier_score_loss(): # ensure to raise an error for multiclass y_true y_true = np.array([0, 1, 2, 0]) y_pred = np.array([0.8, 0.6, 0.4, 0.2]) - error_message = ("Only binary classification is supported. Labels in " - "y_true: {}. Use multiclass_brier_score_loss " - "instead".format(np.array([0, 1, 2]))) + error_message = ("Only binary classification is supported. " + "Use multiclass_brier_score_loss instead") with pytest.raises(ValueError, match=re.escape(error_message)): brier_score_loss(y_true, y_pred) @@ -2382,9 +2381,9 @@ def test_multiclass_brier_score_loss(): # raise error message when there's only one class in y_true y_true = ['eggs'] y_pred = [.1] - error_message = ('y_true contains only one label ({0}). Please ' - 'provide the true labels explicitly through the ' - 'labels argument.'.format(y_true[0])) + error_message = (f'y_true contains only one label {y_true[0]}. 
Please ' + f'provide the true labels explicitly through the ' + f'labels argument.') with pytest.raises(ValueError, match=re.escape(error_message)): multiclass_brier_score_loss(y_true, y_pred) From eafda421f743aa8c3e04ea9887b3f9d2ac857efa Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Mon, 2 Nov 2020 13:42:31 -0500 Subject: [PATCH 16/24] fix tests --- sklearn/metrics/_classification.py | 4 ++-- sklearn/metrics/tests/test_classification.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 7891b91413afd..e85711e0a22e5 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2506,8 +2506,8 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): # for backward compatibility, if classes are not string then # `pos_label` will correspond to the greater label pos_label = classes[-1] - else: - raise + else: + raise y_true = np.array(y_true == pos_label, int) return np.average((y_true - y_prob) ** 2, weights=sample_weight) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index e95c904165285..875e52ef84190 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2381,7 +2381,7 @@ def test_multiclass_brier_score_loss(): # raise error message when there's only one class in y_true y_true = ['eggs'] y_pred = [.1] - error_message = (f'y_true contains only one label {y_true[0]}. Please ' + error_message = (f'y_true contains only one label: {y_true[0]}. Please ' f'provide the true labels explicitly through the ' f'labels argument.') with pytest.raises(ValueError, match=re.escape(error_message)): From 038abf71b379147ac8bd6fa08836be12ebaaeb39 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Mon, 2 Nov 2020 13:47:22 -0500 Subject: [PATCH 17/24] fix error message --- sklearn/metrics/_classification.py | 12 ++++++------ sklearn/metrics/tests/test_classification.py | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index e85711e0a22e5..c4caf72817b5a 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2487,12 +2487,12 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): y_type = type_of_target(y_true) if y_type != "binary": - if y_type == 'multiclass': - raise ValueError("Only binary classification is supported. " - "Use multiclass_brier_score_loss instead") - else: - raise ValueError(f"Only binary classification is supported. The " - f"type of the target is {y_type}.") + raise ValueError( + f"Only binary classification is supported. The type of the target " + f"is {y_type}. For the multiclass case, use " + f"multiclass_brier_score_loss instead" + ) + if y_prob.max() > 1: raise ValueError("y_prob contains values greater than 1.") if y_prob.min() < 0: diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 875e52ef84190..578acf20906cf 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2321,8 +2321,9 @@ def test_brier_score_loss(): # ensure to raise an error for multiclass y_true y_true = np.array([0, 1, 2, 0]) y_pred = np.array([0.8, 0.6, 0.4, 0.2]) - error_message = ("Only binary classification is supported. 
" - "Use multiclass_brier_score_loss instead") + error_message = (f"Only binary classification is supported. The type of " + f"the target is multiclass. For the multiclass case, use " + f"multiclass_brier_score_loss instead") with pytest.raises(ValueError, match=re.escape(error_message)): brier_score_loss(y_true, y_pred) From 838f8279d7715ff8f79f89c221758f4f13a7a4a3 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Mon, 2 Nov 2020 13:48:35 -0500 Subject: [PATCH 18/24] fix docstring --- sklearn/metrics/_classification.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index c4caf72817b5a..60b4e3c37d393 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2447,6 +2447,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): pos_label : int or str, default=None Label of the positive class. `pos_label` will be infered in the following manner: + * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; * else if `y_true` contains string, an error will be raised and `pos_label` should be explicitely specified; From 5ef41c73d9294dd86d4f80b04b083c7dbacbbd7f Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Mon, 2 Nov 2020 14:20:01 -0500 Subject: [PATCH 19/24] fix linting --- sklearn/metrics/tests/test_classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 578acf20906cf..958fab8f174d4 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2321,9 +2321,9 @@ def test_brier_score_loss(): # ensure to raise an error for multiclass y_true y_true = np.array([0, 1, 2, 0]) y_pred = np.array([0.8, 0.6, 0.4, 0.2]) - error_message = (f"Only binary classification is supported. The type of " - f"the target is multiclass. For the multiclass case, use " - f"multiclass_brier_score_loss instead") + error_message = ("Only binary classification is supported. The type of " + "the target is multiclass. For the multiclass case, use " + "multiclass_brier_score_loss instead") with pytest.raises(ValueError, match=re.escape(error_message)): brier_score_loss(y_true, y_pred) From 4fb4c4f90edf623d2c95871abe3298666ee12285 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 4 Nov 2020 17:02:46 -0500 Subject: [PATCH 20/24] Update sklearn/metrics/_classification.py Co-authored-by: Olivier Grisel --- sklearn/metrics/_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 60b4e3c37d393..0224a5952a051 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2525,10 +2525,10 @@ def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None, Brier score is defined as: .. math:: - \frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(y_{ic} - \bar{y}_{ic})^{2} + \frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(y_{ic} - \hat{y}_{ic})^{2} where :math:`y_{ic}` is 1 if observation `i` belongs to class `c`, - otherwise 0 and :math:`\bar{y}_{ic}` is the predicted probability of + otherwise 0 and :math:`\hat{y}_{ic}` is the predicted probability of observation `i` for class `c`. The probabilities for `c` classes for observation `i` should sum to 1. 
From 3260bf32ab1c46da4c42bacd1539260ac5cd6b68 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 4 Nov 2020 17:13:44 -0500 Subject: [PATCH 21/24] Apply suggestions from code review Co-authored-by: Olivier Grisel --- sklearn/metrics/_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 0224a5952a051..5502da8a2b17e 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2554,9 +2554,9 @@ def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None, y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,) Predicted probabilities, as returned by a classifier's - predict_proba method. If ``y_pred.shape = (n_samples,)`` + predict_proba method. If ``y_prob.shape = (n_samples,)`` the probabilities provided are assumed to be that of the - positive class. The labels in ``y_pred`` are assumed to be + positive class. The labels in ``y_prob`` are assumed to be ordered alphabetically, as done by :class:`preprocessing.LabelBinarizer`. From 86d793e8ce9721ca5b90fa2044ead581786ab4b8 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 4 Nov 2020 21:48:45 -0500 Subject: [PATCH 22/24] split tests --- sklearn/metrics/tests/test_classification.py | 64 ++++++++++++++------ 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 958fab8f174d4..428576e2fccdd 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2338,15 +2338,61 @@ def test_brier_score_loss(): def test_multiclass_brier_score_loss(): - # Check brier_score_loss function + # test cases for binary case y_true = np.array([0, 1, 1, 0, 1, 1]) y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95]) assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred), .05083333) + # Check brier_score_loss and multiclass_brier_score_loss are consistent assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred), brier_score_loss(y_true, y_pred) * 2) + # test cases for multi-class + assert_almost_equal( + multiclass_brier_score_loss(['eggs', 'spam', 'ham'], + [[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0]], + labels=['eggs', 'spam', 'ham', 'yams']), + 2/3) + + assert_almost_equal( + multiclass_brier_score_loss([1, 0, 2], + [[0.2, 0.7, 0.1], + [0.6, 0.2, 0.2], + [0.6, 0.1, 0.3]]), + .41333333) + + # check perfect predictions for 2 classes + assert_almost_equal(multiclass_brier_score_loss([0, 0, 1, 1], + [0., 0., 1., 1.]), + 0) + + # check perfect predictions for 3 classes + assert_almost_equal(multiclass_brier_score_loss([0, 1, 2], + [[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]), + 0) + + # check perfectly incorrect predictions for 2 classes + assert_almost_equal(multiclass_brier_score_loss([0, 0, 1, 1], + [1., 1., 0., 0.]), + 2) + + # check perfectly incorrect predictions for 3 classes + assert_almost_equal(multiclass_brier_score_loss([0, 1, 2], + [[0., 1., 0.], + [1., 0., 0.], + [1., 0., 0.]]), + 2) + + +def test_multiclass_brier_score_loss_invalid_inputs(): + y_true = np.array([0, 1, 1, 0, 1, 1]) + y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95]) + with pytest.raises(ValueError): # bad length of y_pred multiclass_brier_score_loss(y_true, y_pred[1:]) @@ -2393,22 +2439,6 @@ def test_multiclass_brier_score_loss(): labels=['eggs', 'ham']), .02) - # test cases for multi-class - assert_almost_equal( - 
multiclass_brier_score_loss(['eggs', 'spam', 'ham'], - [[1, 0, 0, 0], - [0, 1, 0, 0], - [0, 1, 0, 0]], - labels=['eggs', 'spam', 'ham', 'yams']), - 2/3) - - assert_almost_equal( - multiclass_brier_score_loss([1, 0, 2], - [[0.2, 0.7, 0.1], - [0.6, 0.2, 0.2], - [0.6, 0.1, 0.3]]), - .41333333) - def test_balanced_accuracy_score_unseen(): assert_warns_message(UserWarning, 'y_pred contains classes not in y_true', From 411ec1a52f9db12cbc3515afdead6a344a2bf8cf Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Fri, 6 Nov 2020 15:25:14 -0500 Subject: [PATCH 23/24] add private function --- sklearn/metrics/_classification.py | 186 ++++++++++--------- sklearn/metrics/tests/test_classification.py | 2 +- 2 files changed, 95 insertions(+), 93 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 5502da8a2b17e..7454537e4ed6c 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -128,6 +128,92 @@ def _check_targets(y_true, y_pred): return y_type, y_true, y_pred +def _validate_multiclass_probabilistic_prediction(y_true, y_prob, + sample_weight, labels): + r"""Convert y_true and y_prob to shape [n_samples, n_classes] + + 1. Verify that y_true, y_prob, and sample_weights have the same first dim + 2. Ensure 2 or more classes in y_true i.e. valid classification task. The + classes are provided by the labels argument, or inferred using y_true. + When inferring y_true is assumed binary if it has shape (n_samples, ). + 3. Validate y_true, and y_prob have the same number of classes. Convert to + shape [n_samples, n_classes]/ + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_prob.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_prob`` are assumed to be + ordered lexicographically, as done by + :class:`preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_prob`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + Returns + ------- + transformed_labels : array of shape [n_samples, n_classes] + + y_prob : array of shape [n_samples, n_classes] + """ + y_prob = check_array(y_prob, ensure_2d=False) + check_consistent_length(y_prob, y_true, sample_weight) + + lb = LabelBinarizer() + + if labels is not None: + lb = lb.fit(labels) + else: + lb = lb.fit(y_true) + + if len(lb.classes_) == 1: + if labels is None: + raise ValueError(f'y_true contains only one label: ' + f'{lb.classes_[0]}. Please provide the true ' + f'labels explicitly through the labels argument.') + else: + raise ValueError(f'The labels array needs to contain at least two ' + f'labels, got {lb.classes_}.') + + transformed_labels = lb.transform(y_true) + + if transformed_labels.shape[1] == 1: + transformed_labels = np.append(1-transformed_labels, + transformed_labels, axis=1) + + # If y_prob is of single dimension, assume y_true to be binary + if y_prob.ndim == 1: + y_prob = y_prob[:, np.newaxis] + if y_prob.shape[1] == 1: + y_prob = np.append(1 - y_prob, y_prob, axis=1) + + # Check if dimensions are consistent. 
+    transformed_labels = check_array(transformed_labels)
+    if len(lb.classes_) != y_prob.shape[1]:
+        if labels is None:
+            raise ValueError(f"y_true and y_prob contain different number of "
+                             f"classes {transformed_labels.shape[1]}, "
+                             f"{y_prob.shape[1]}. Please provide the true "
+                             f"labels explicitly through the labels argument. "
+                             f"Classes found in y_true: {lb.classes_}")
+        else:
+            raise ValueError(f'The number of classes in labels is different '
+                             f'from that in y_prob. Classes found in '
+                             f'labels: {lb.classes_}')
+
+    return transformed_labels, y_prob
+
+
 def _weighted_sum(sample_score, sample_weight, normalize=False):
     if normalize:
         return np.average(sample_score, weights=sample_weight)
@@ -2222,58 +2308,13 @@ def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None,
     C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
     p. 209.
     """
-    y_pred = check_array(y_pred, ensure_2d=False)
-    check_consistent_length(y_pred, y_true, sample_weight)
-
-    lb = LabelBinarizer()
-
-    if labels is not None:
-        lb.fit(labels)
-    else:
-        lb.fit(y_true)
-
-    if len(lb.classes_) == 1:
-        if labels is None:
-            raise ValueError('y_true contains only one label ({0}). Please '
-                             'provide the true labels explicitly through the '
-                             'labels argument.'.format(lb.classes_[0]))
-        else:
-            raise ValueError('The labels array needs to contain at least two '
-                             'labels for log_loss, '
-                             'got {0}.'.format(lb.classes_))
-
-    transformed_labels = lb.transform(y_true)
-
-    if transformed_labels.shape[1] == 1:
-        transformed_labels = np.append(1 - transformed_labels,
-                                       transformed_labels, axis=1)
+    transformed_labels, y_pred = _validate_multiclass_probabilistic_prediction(
+        y_true, y_pred, sample_weight, labels
+    )
 
     # Clipping
     y_pred = np.clip(y_pred, eps, 1 - eps)
 
-    # If y_pred is of single dimension, assume y_true to be binary
-    # and then check.
-    if y_pred.ndim == 1:
-        y_pred = y_pred[:, np.newaxis]
-    if y_pred.shape[1] == 1:
-        y_pred = np.append(1 - y_pred, y_pred, axis=1)
-
-    # Check if dimensions are consistent.
-    transformed_labels = check_array(transformed_labels)
-    if len(lb.classes_) != y_pred.shape[1]:
-        if labels is None:
-            raise ValueError("y_true and y_pred contain different number of "
-                             "classes {0}, {1}. Please provide the true "
-                             "labels explicitly through the labels argument. "
-                             "Classes found in "
-                             "y_true: {2}".format(transformed_labels.shape[1],
-                                                  y_pred.shape[1],
-                                                  lb.classes_))
-        else:
-            raise ValueError('The number of classes in labels is different '
-                             'from that in y_pred. Classes found in '
-                             'labels: {0}'.format(lb.classes_))
-
     # Renormalize
     y_pred /= y_pred.sum(axis=1)[:, np.newaxis]
     loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
@@ -2557,7 +2598,7 @@ def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None,
         predict_proba method. If ``y_prob.shape = (n_samples,)``
         the probabilities provided are assumed to be that of the
         positive class. The labels in ``y_prob`` are assumed to be
-        ordered alphabetically, as done by
+        ordered lexicographically, as done by
         :class:`preprocessing.LabelBinarizer`.
 
     sample_weight : array-like of shape (n_samples,), default=None
@@ -2590,54 +2631,15 @@ def multiclass_brier_score_loss(y_true, y_prob, sample_weight=None,
        `_.
""" y_true = column_or_1d(y_true) - y_prob = check_array(y_prob, ensure_2d=False) - check_consistent_length(y_prob, y_true, sample_weight) + + transformed_labels, y_prob = _validate_multiclass_probabilistic_prediction( + y_true, y_prob, sample_weight, labels + ) if y_prob.max() > 1: raise ValueError("y_prob contains values greater than 1.") if y_prob.min() < 0: raise ValueError("y_prob contains values less than 0.") - lb = LabelBinarizer() - if labels is not None: - lb = lb.fit(labels) - else: - lb = lb.fit(y_true) - - if len(lb.classes_) == 1: - if labels is None: - raise ValueError(f'y_true contains only one label: ' - f'{lb.classes_[0]}. Please provide the true ' - f'labels explicitly through the labels argument.') - else: - raise ValueError(f'The labels array needs to contain at least two ' - f'labels, got {lb.classes_}.') - - transformed_labels = lb.transform(y_true) - - if transformed_labels.shape[1] == 1: - transformed_labels = np.append(1-transformed_labels, - transformed_labels, axis=1) - - # If y_prob is of single dimension, assume y_true to be binary - if y_prob.ndim == 1: - y_prob = y_prob[:, np.newaxis] - if y_prob.shape[1] == 1: - y_prob = np.append(1 - y_prob, y_prob, axis=1) - - # Check if dimensions are consistent. - transformed_labels = check_array(transformed_labels) - if len(lb.classes_) != y_prob.shape[1]: - if labels is None: - raise ValueError(f"y_true and y_prob contain different number of " - f"classes {transformed_labels.shape[1]}, " - f"{y_prob.shape[1]}. Please provide the true " - f"labels explicitly through the labels argument. " - f"Classes found in y_true: {lb.classes_}") - else: - raise ValueError(f'The number of classes in labels is different ' - f'from that in y_prob. Classes found in ' - f'labels: {lb.classes_}') - return np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1), weights=sample_weight) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 428576e2fccdd..38f33058cd52b 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2259,7 +2259,7 @@ def test_log_loss(): y_true = [2, 2] y_pred = [[0.2, 0.7], [0.6, 0.5]] y_score = np.array([[0.1, 0.9], [0.1, 0.9]]) - error_str = (r'y_true contains only one label \(2\). Please provide ' + error_str = (r'y_true contains only one label: 2. Please provide ' r'the true labels explicitly through the labels argument.') with pytest.raises(ValueError, match=error_str): log_loss(y_true, y_pred) From f84493cfe6ab263df328d2396ed89cf66c52bef0 Mon Sep 17 00:00:00 2001 From: Varun Aggarwal Date: Wed, 11 Nov 2020 17:36:42 -0500 Subject: [PATCH 24/24] add warning for labels --- sklearn/metrics/_classification.py | 7 +++++ sklearn/metrics/tests/test_classification.py | 32 +++++++++++++++++++- sklearn/metrics/tests/test_common.py | 2 +- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 7454537e4ed6c..ec86f76f057b1 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -173,6 +173,13 @@ def _validate_multiclass_probabilistic_prediction(y_true, y_prob, if labels is not None: lb = lb.fit(labels) + # LabelBinarizer does not respect the order implied by labels, which + # can be misleading. + if not np.all(lb.classes_ == labels): + warnings.warn(f"Labels passed were {labels}. But this function " + f"assumes labels are ordered lexicographically. 
" + f"Ensure that labels in y_prob are ordered as " + f"{lb.classes_}.", UserWarning) else: lb = lb.fit(y_true) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 38f33058cd52b..7b70fb73b9823 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2299,6 +2299,21 @@ def test_log_loss_pandas_input(): assert_almost_equal(loss, 1.0383217, decimal=6) +def test_log_loss_warnings(): + assert_warns_message( + UserWarning, + "Labels passed were ['spam', 'eggs', 'ham']. But this function " + "assumes labels are ordered lexicographically. " + "Ensure that labels in y_prob are ordered as " + "['eggs' 'ham' 'spam'].", + log_loss, + ['eggs', 'spam', 'ham'], + [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]], + labels=['spam', 'eggs', 'ham']) + + def test_brier_score_loss(): # Check brier_score_loss function y_true = np.array([0, 1, 1, 0, 1, 1]) @@ -2354,7 +2369,7 @@ def test_multiclass_brier_score_loss(): [[1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]], - labels=['eggs', 'spam', 'ham', 'yams']), + labels=['eggs', 'ham', 'spam', 'yams']), 2/3) assert_almost_equal( @@ -2440,6 +2455,21 @@ def test_multiclass_brier_score_loss_invalid_inputs(): .02) +def test_multiclass_brier_score_loss_warnings(): + assert_warns_message( + UserWarning, + "Labels passed were ['spam', 'eggs', 'ham']. But this function " + "assumes labels are ordered lexicographically. " + "Ensure that labels in y_prob are ordered as " + "['eggs' 'ham' 'spam'].", + multiclass_brier_score_loss, + ['eggs', 'spam', 'ham'], + [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]], + labels=['spam', 'eggs', 'ham']) + + def test_balanced_accuracy_score_unseen(): assert_warns_message(UserWarning, 'y_pred contains classes not in y_true', balanced_accuracy_score, [0, 0, 0], [0, 0, 1]) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6688ddc2aa834..f0e1b37dcf244 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -483,7 +483,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "unnormalized_multilabel_confusion_matrix", "macro_f0.5_score", "macro_f2_score", "macro_precision_score", - "macro_recall_score", "log_loss", "hinge_loss", + "macro_recall_score", "hinge_loss", "mean_gamma_deviance", "mean_poisson_deviance", "mean_compound_poisson_deviance", "mean_absolute_percentage_error" }