diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 84e7c98e29324..0d55c9eb7da40 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -35,6 +35,7 @@
 from ._classification import zero_one_loss
 from ._classification import brier_score_loss
 from ._classification import multilabel_confusion_matrix
+from ._classification import multiclass_brier_score_loss
 
 from . import cluster
 from .cluster import adjusted_mutual_info_score
@@ -139,6 +140,7 @@
     'median_absolute_error',
     'mean_absolute_percentage_error',
     'multilabel_confusion_matrix',
+    'multiclass_brier_score_loss',
     'mutual_info_score',
     'ndcg_score',
     'normalized_mutual_info_score',
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 44a84e125727b..ec86f76f057b1 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -128,6 +128,99 @@ def _check_targets(y_true, y_pred):
     return y_type, y_true, y_pred
 
 
+def _validate_multiclass_probabilistic_prediction(y_true, y_prob,
+                                                  sample_weight, labels):
+    r"""Convert y_true and y_prob to shape [n_samples, n_classes].
+
+    1. Verify that y_true, y_prob, and sample_weight have the same first
+       dimension.
+    2. Ensure that y_true contains two or more classes, i.e. a valid
+       classification task. The classes are provided by the labels
+       argument, or inferred from y_true. When inferring, y_true is
+       assumed to be binary if it has shape (n_samples,).
+    3. Validate that y_true and y_prob have the same number of classes.
+       Convert both to shape [n_samples, n_classes].
+
+    Parameters
+    ----------
+    y_true : array-like or label indicator matrix
+        Ground truth (correct) labels for n_samples samples.
+
+    y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,)
+        Predicted probabilities, as returned by a classifier's
+        predict_proba method. If ``y_prob.shape = (n_samples,)``
+        the probabilities provided are assumed to be those of the
+        positive class. The labels in ``y_prob`` are assumed to be
+        ordered lexicographically, as done by
+        :class:`preprocessing.LabelBinarizer`.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    labels : array-like, default=None
+        If not provided, labels will be inferred from y_true. If ``labels``
+        is ``None`` and ``y_prob`` has shape (n_samples,) the labels are
+        assumed to be binary and are inferred from ``y_true``.
+
+    Returns
+    -------
+    transformed_labels : array of shape [n_samples, n_classes]
+
+    y_prob : array of shape [n_samples, n_classes]
+    """
+    y_prob = check_array(y_prob, ensure_2d=False)
+    check_consistent_length(y_prob, y_true, sample_weight)
+
+    lb = LabelBinarizer()
+
+    if labels is not None:
+        lb = lb.fit(labels)
+        # LabelBinarizer does not respect the order implied by labels, which
+        # can be misleading.
+        if not np.all(lb.classes_ == labels):
+            warnings.warn(f"Labels passed were {labels}. But this function "
+                          f"assumes labels are ordered lexicographically. "
+                          f"Ensure that labels in y_prob are ordered as "
+                          f"{lb.classes_}.", UserWarning)
+    else:
+        lb = lb.fit(y_true)
+
+    if len(lb.classes_) == 1:
+        if labels is None:
+            raise ValueError(f'y_true contains only one label: '
+                             f'{lb.classes_[0]}. Please provide the true '
+                             f'labels explicitly through the labels argument.')
+        else:
+            raise ValueError(f'The labels array needs to contain at least two '
+                             f'labels, got {lb.classes_}.')
+
+    transformed_labels = lb.transform(y_true)
+
+    if transformed_labels.shape[1] == 1:
+        transformed_labels = np.append(1 - transformed_labels,
+                                       transformed_labels, axis=1)
+
+    # If y_prob is of single dimension, assume y_true to be binary
+    if y_prob.ndim == 1:
+        y_prob = y_prob[:, np.newaxis]
+    if y_prob.shape[1] == 1:
+        y_prob = np.append(1 - y_prob, y_prob, axis=1)
+
+    # Check if dimensions are consistent.
+    transformed_labels = check_array(transformed_labels)
+    if len(lb.classes_) != y_prob.shape[1]:
+        if labels is None:
+            raise ValueError(f"y_true and y_prob contain different number of "
+                             f"classes {transformed_labels.shape[1]}, "
+                             f"{y_prob.shape[1]}. Please provide the true "
+                             f"labels explicitly through the labels argument. "
+                             f"Classes found in y_true: {lb.classes_}")
+        else:
+            raise ValueError(f'The number of classes in labels is different '
+                             f'from that in y_prob. Classes found in '
+                             f'labels: {lb.classes_}')
+
+    return transformed_labels, y_prob
+
+
 def _weighted_sum(sample_score, sample_weight, normalize=False):
     if normalize:
         return np.average(sample_score, weights=sample_weight)
@@ -2222,58 +2315,13 @@ def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None,
     C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
     p. 209.
     """
-    y_pred = check_array(y_pred, ensure_2d=False)
-    check_consistent_length(y_pred, y_true, sample_weight)
-
-    lb = LabelBinarizer()
-
-    if labels is not None:
-        lb.fit(labels)
-    else:
-        lb.fit(y_true)
-
-    if len(lb.classes_) == 1:
-        if labels is None:
-            raise ValueError('y_true contains only one label ({0}). Please '
-                             'provide the true labels explicitly through the '
-                             'labels argument.'.format(lb.classes_[0]))
-        else:
-            raise ValueError('The labels array needs to contain at least two '
-                             'labels for log_loss, '
-                             'got {0}.'.format(lb.classes_))
-
-    transformed_labels = lb.transform(y_true)
-
-    if transformed_labels.shape[1] == 1:
-        transformed_labels = np.append(1 - transformed_labels,
-                                       transformed_labels, axis=1)
+    transformed_labels, y_pred = _validate_multiclass_probabilistic_prediction(
+        y_true, y_pred, sample_weight, labels
+    )
 
     # Clipping
     y_pred = np.clip(y_pred, eps, 1 - eps)
 
-    # If y_pred is of single dimension, assume y_true to be binary
-    # and then check.
-    if y_pred.ndim == 1:
-        y_pred = y_pred[:, np.newaxis]
-    if y_pred.shape[1] == 1:
-        y_pred = np.append(1 - y_pred, y_pred, axis=1)
-
-    # Check if dimensions are consistent.
-    transformed_labels = check_array(transformed_labels)
-    if len(lb.classes_) != y_pred.shape[1]:
-        if labels is None:
-            raise ValueError("y_true and y_pred contain different number of "
-                             "classes {0}, {1}. Please provide the true "
-                             "labels explicitly through the labels argument. "
-                             "Classes found in "
-                             "y_true: {2}".format(transformed_labels.shape[1],
-                                                  y_pred.shape[1],
-                                                  lb.classes_))
-        else:
-            raise ValueError('The number of classes in labels is different '
-                             'from that in y_pred. Classes found in '
-                             'labels: {0}'.format(lb.classes_))
-
     # Renormalize
     y_pred /= y_pred.sum(axis=1)[:, np.newaxis]
     loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
@@ -2425,6 +2473,12 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
         the greater label unless `y_true` is all 0 or all -1, in which
         case `pos_label` defaults to 1.
 
+    A more general form of the Brier score is implemented in
+    :func:`multiclass_brier_score_loss`, which also applies to the
+    multi-class case. When used on a binary problem,
+    `multiclass_brier_score_loss` returns a Brier score that is exactly
+    twice the value returned by this function.
+
     Read more in the :ref:`User Guide <brier_score_loss>`.
 
     Parameters
@@ -2484,7 +2538,8 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
     if y_type != "binary":
         raise ValueError(
             f"Only binary classification is supported. The type of the target "
-            f"is {y_type}."
+            f"is {y_type}. For the multiclass case, use "
+            f"multiclass_brier_score_loss instead."
         )
 
     if y_prob.max() > 1:
@@ -2504,3 +2559,94 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
         raise
     y_true = np.array(y_true == pos_label, int)
     return np.average((y_true - y_prob) ** 2, weights=sample_weight)
+
+
+def multiclass_brier_score_loss(y_true, y_prob, *, sample_weight=None,
+                                labels=None):
+    r"""Compute the multi-class Brier score loss.
+
+    The smaller the Brier score loss, the better, hence the naming with
+    "loss". The Brier score measures the mean squared difference between
+    the predicted probability and the actual outcome.
+
+    For :math:`N` samples with :math:`C` different classes, the multi-class
+    Brier score is defined as:
+
+    .. math::
+        \frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(y_{ic} - \hat{y}_{ic})^{2}
+
+    where :math:`y_{ic}` is 1 if observation `i` belongs to class `c` and 0
+    otherwise, and :math:`\hat{y}_{ic}` is the predicted probability of
+    observation `i` for class `c`. The predicted probabilities for the
+    :math:`C` classes of observation `i` should sum to 1.
+
+    The Brier score always takes a value in the interval [0, 2]. For the
+    binary case, however, there is a more common definition of the Brier
+    score, implemented in :func:`brier_score_loss`, that is exactly half
+    the value returned by this function and therefore lies in [0, 1].
+
+    It can be decomposed as the sum of refinement loss and calibration loss.
+
+    The Brier score is appropriate for binary and categorical outcomes that
+    can be structured as true or false, but is inappropriate for ordinal
+    variables which can take on three or more values (this is because the
+    Brier score assumes that all possible outcomes are equivalently
+    "distant" from one another).
+
+    Read more in the :ref:`User Guide <brier_score_loss>`.
+
+    Parameters
+    ----------
+    y_true : array of shape (n_samples,)
+        True targets.
+
+    y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,)
+        Predicted probabilities, as returned by a classifier's
+        predict_proba method. If ``y_prob.shape = (n_samples,)``
+        the probabilities provided are assumed to be those of the
+        positive class. The labels in ``y_prob`` are assumed to be
+        ordered lexicographically, as done by
+        :class:`preprocessing.LabelBinarizer`.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    labels : array-like, default=None
+        If not provided, labels will be inferred from y_true. If ``labels``
+        is ``None`` and ``y_prob`` has shape (n_samples,) the labels are
+        assumed to be binary and are inferred from ``y_true``.
+
+    Returns
+    -------
+    score : float
+        Brier score loss.
+
+    Examples
+    --------
+    >>> from sklearn.metrics import multiclass_brier_score_loss
+    >>> multiclass_brier_score_loss([0, 1, 1, 0],
+    ...                             [0.1, 0.9, 0.8, 0.3])
+    0.074...
+    >>> multiclass_brier_score_loss(['eggs', 'ham', 'spam'], [[.8, .1, .1],
+    ...                                                       [.2, .7, .1],
+    ...                                                       [.2, .2, .6]])
+    0.146...
+
+    References
+    ----------
+    .. [1] `Wikipedia entry for the Brier score
+            <https://en.wikipedia.org/wiki/Brier_score>`_.
+    """
+    y_true = column_or_1d(y_true)
+
+    transformed_labels, y_prob = _validate_multiclass_probabilistic_prediction(
+        y_true, y_prob, sample_weight, labels
+    )
+
+    if y_prob.max() > 1:
+        raise ValueError("y_prob contains values greater than 1.")
+    if y_prob.min() < 0:
+        raise ValueError("y_prob contains values less than 0.")
+
+    return np.average(np.sum((transformed_labels - y_prob) ** 2, axis=1),
+                      weights=sample_weight)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index c32e9c89ada47..7b70fb73b9823 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -4,6 +4,7 @@
 from itertools import chain
 from itertools import permutations
 import warnings
+import re
 
 import numpy as np
 from scipy import linalg
@@ -44,6 +45,7 @@
 from sklearn.metrics import zero_one_loss
 from sklearn.metrics import brier_score_loss
 from sklearn.metrics import multilabel_confusion_matrix
+from sklearn.metrics import multiclass_brier_score_loss
 
 from sklearn.metrics._classification import _check_targets
 from sklearn.exceptions import UndefinedMetricWarning
@@ -2257,7 +2259,7 @@ def test_log_loss():
     y_true = [2, 2]
     y_pred = [[0.2, 0.7], [0.6, 0.5]]
     y_score = np.array([[0.1, 0.9], [0.1, 0.9]])
-    error_str = (r'y_true contains only one label \(2\). Please provide '
+    error_str = (r'y_true contains only one label: 2. Please provide '
                  r'the true labels explicitly through the labels argument.')
     with pytest.raises(ValueError, match=error_str):
         log_loss(y_true, y_pred)
@@ -2297,6 +2299,21 @@ def test_log_loss_pandas_input():
         assert_almost_equal(loss, 1.0383217, decimal=6)
 
 
+def test_log_loss_warnings():
+    assert_warns_message(
+        UserWarning,
+        "Labels passed were ['spam', 'eggs', 'ham']. But this function "
+        "assumes labels are ordered lexicographically. "
+        "Ensure that labels in y_prob are ordered as "
+        "['eggs' 'ham' 'spam'].",
+        log_loss,
+        ['eggs', 'spam', 'ham'],
+        [[1, 0, 0],
+         [0, 1, 0],
+         [0, 0, 1]],
+        labels=['spam', 'eggs', 'ham'])
+
+
 def test_brier_score_loss():
     # Check brier_score_loss function
     y_true = np.array([0, 1, 1, 0, 1, 1])
@@ -2319,12 +2336,10 @@ def test_brier_score_loss():
     # ensure to raise an error for multiclass y_true
     y_true = np.array([0, 1, 2, 0])
     y_pred = np.array([0.8, 0.6, 0.4, 0.2])
-    error_message = (
-        "Only binary classification is supported. The type of the target is "
-        "multiclass"
-    )
-
-    with pytest.raises(ValueError, match=error_message):
+    error_message = ("Only binary classification is supported. The type of "
+                     "the target is multiclass. For the multiclass case, use "
+                     "multiclass_brier_score_loss instead.")
+    with pytest.raises(ValueError, match=re.escape(error_message)):
         brier_score_loss(y_true, y_pred)
 
     # calculate correctly when there's only one class in y_true
@@ -2337,6 +2352,124 @@ def test_brier_score_loss():
         brier_score_loss(['foo'], [0.4], pos_label='foo'), 0.36)
 
 
+def test_multiclass_brier_score_loss():
+    # test cases for the binary case
+    y_true = np.array([0, 1, 1, 0, 1, 1])
+    y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95])
+
+    assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred),
+                        .05083333)
+    # check brier_score_loss and multiclass_brier_score_loss are consistent
+    assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred),
+                        brier_score_loss(y_true, y_pred) * 2)
+
+    # test cases for the multi-class case
+    assert_almost_equal(
+        multiclass_brier_score_loss(['eggs', 'spam', 'ham'],
+                                    [[1, 0, 0, 0],
+                                     [0, 1, 0, 0],
+                                     [0, 1, 0, 0]],
+                                    labels=['eggs', 'ham', 'spam', 'yams']),
+        2/3)
+
+    assert_almost_equal(
+        multiclass_brier_score_loss([1, 0, 2],
+                                    [[0.2, 0.7, 0.1],
+                                     [0.6, 0.2, 0.2],
+                                     [0.6, 0.1, 0.3]]),
+        .41333333)
+
+    # check perfect predictions for 2 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 0, 1, 1],
+                                                    [0., 0., 1., 1.]),
+                        0)
+
+    # check perfect predictions for 3 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 1, 2],
+                                                    [[1., 0., 0.],
+                                                     [0., 1., 0.],
+                                                     [0., 0., 1.]]),
+                        0)
+
+    # check perfectly incorrect predictions for 2 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 0, 1, 1],
+                                                    [1., 1., 0., 0.]),
+                        2)
+
+    # check perfectly incorrect predictions for 3 classes
+    assert_almost_equal(multiclass_brier_score_loss([0, 1, 2],
+                                                    [[0., 1., 0.],
+                                                     [1., 0., 0.],
+                                                     [1., 0., 0.]]),
+                        2)
+
+
+def test_multiclass_brier_score_loss_invalid_inputs():
+    y_true = np.array([0, 1, 1, 0, 1, 1])
+    y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95])
+
+    with pytest.raises(ValueError):
+        # bad length of y_pred
+        multiclass_brier_score_loss(y_true, y_pred[1:])
+    with pytest.raises(ValueError):
+        # y_pred has a value greater than 1
+        multiclass_brier_score_loss(y_true, y_pred + 1.)
+    with pytest.raises(ValueError):
+        # y_pred has a value less than 0
+        multiclass_brier_score_loss(y_true, y_pred - 1.)
+
+    # ensure an error is raised for the wrong number of classes
+    y_true = np.array([0, 1, 2, 0])
+    y_pred = np.array([0.8, 0.6, 0.4, 0.2])
+    error_message = ("y_true and y_prob contain different number of "
+                     "classes 3, 2. Please provide the true "
+                     "labels explicitly through the labels argument. "
+                     "Classes found in "
+                     "y_true: [0 1 2]")
+    with pytest.raises(ValueError, match=re.escape(error_message)):
+        multiclass_brier_score_loss(y_true, y_pred)
+
+    y_true = ['eggs', 'spam', 'ham']
+    y_pred = [[1, 0, 0],
+              [0, 1, 0],
+              [0, 1, 0]]
+    labels = ['eggs', 'spam', 'ham', 'yams']
+    error_message = ("The number of classes in labels is different "
+                     "from that in y_prob. Classes found in "
+                     "labels: ['eggs' 'ham' 'spam' 'yams']")
+    with pytest.raises(ValueError, match=re.escape(error_message)):
+        multiclass_brier_score_loss(y_true, y_pred, labels=labels)
+
+    # raise an error when there's only one class in y_true
+    y_true = ['eggs']
+    y_pred = [.1]
+    error_message = (f'y_true contains only one label: {y_true[0]}. Please '
+                     f'provide the true labels explicitly through the '
+                     f'labels argument.')
+    with pytest.raises(ValueError, match=re.escape(error_message)):
+        multiclass_brier_score_loss(y_true, y_pred)
+
+    # error is fixed when labels is specified
+    assert_almost_equal(multiclass_brier_score_loss(y_true, y_pred,
+                                                    labels=['eggs', 'ham']),
+                        .02)
+
+
+def test_multiclass_brier_score_loss_warnings():
+    assert_warns_message(
+        UserWarning,
+        "Labels passed were ['spam', 'eggs', 'ham']. But this function "
+        "assumes labels are ordered lexicographically. "
+        "Ensure that labels in y_prob are ordered as "
+        "['eggs' 'ham' 'spam'].",
+        multiclass_brier_score_loss,
+        ['eggs', 'spam', 'ham'],
+        [[1, 0, 0],
+         [0, 1, 0],
+         [0, 0, 1]],
+        labels=['spam', 'eggs', 'ham'])
+
+
 def test_balanced_accuracy_score_unseen():
     assert_warns_message(UserWarning, 'y_pred contains classes not in y_true',
                          balanced_accuracy_score, [0, 0, 0], [0, 0, 1])
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index 6688ddc2aa834..f0e1b37dcf244 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -483,7 +483,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs):
     "unnormalized_multilabel_confusion_matrix",
 
     "macro_f0.5_score", "macro_f2_score", "macro_precision_score",
-    "macro_recall_score", "log_loss", "hinge_loss",
+    "macro_recall_score", "hinge_loss",
     "mean_gamma_deviance", "mean_poisson_deviance",
     "mean_compound_poisson_deviance", "mean_absolute_percentage_error"
 }
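
For a quick sanity check of the new API, here is a minimal usage sketch in
Python. It assumes a scikit-learn build with this patch applied; the expected
values follow the docstring examples and the binary consistency property
documented above:

    import numpy as np
    from sklearn.metrics import brier_score_loss, multiclass_brier_score_loss

    # Binary case: y_prob holds probabilities of the positive class, as in
    # clf.predict_proba(X)[:, 1].
    y_true = np.array([0, 1, 1, 0])
    y_prob = np.array([0.1, 0.9, 0.8, 0.3])

    # The generalized score is exactly twice the classic binary Brier score.
    assert np.isclose(multiclass_brier_score_loss(y_true, y_prob),
                      2 * brier_score_loss(y_true, y_prob))

    # Multi-class case: each row of y_prob sums to 1 and the columns follow
    # the lexicographic label order ['eggs', 'ham', 'spam'].
    score = multiclass_brier_score_loss(['eggs', 'ham', 'spam'],
                                        [[.8, .1, .1],
                                         [.2, .7, .1],
                                         [.2, .2, .6]])
    print(score)  # ~0.147, within the metric's [0, 2] range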