MRG add log loss (cross-entropy loss) to metrics by larsmans · Pull Request #2013 · scikit-learn/scikit-learn
Closed · wants to merge 4 commits
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -721,6 +721,7 @@ Classification metrics
metrics.hamming_loss
metrics.hinge_loss
metrics.jaccard_similarity_score
metrics.log_loss
metrics.matthews_corrcoef
metrics.precision_recall_curve
metrics.precision_recall_fscore_support
52 changes: 51 additions & 1 deletion doc/modules/model_evaluation.rst
@@ -75,7 +75,7 @@ of acceptable values::
>>> model = svm.SVC()
>>> cross_validation.cross_val_score(model, X, y, scoring='wrong_choice')
Traceback (most recent call last):
ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'mean_squared_error', 'precision', 'r2', 'recall', 'roc_auc']
ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'log_likelihood', 'mean_squared_error', 'precision', 'r2', 'recall', 'roc_auc']

.. note::

@@ -712,6 +712,55 @@ with a svm classifier::
0.3...


Log-likelihood and log loss
---------------------------
Log-likelihood is a score to evaluate probabilistic classifiers
by their probability outputs (``predict_proba``)
rather than their discrete predictions.

Log loss is the negative log-likelihood. It is used as the loss function
in logistic regression and neural networks,
as well as in some variants of expectation-maximization.

For binary classification with a true label :math:`y_t \in \{0,1\}`
and a probability estimate :math:`y_p = P(y_t = 1)`,
the log-likelihood of the model that predicted :math:`y_p` is:

.. math::

L(y_t, y_p) = \log P(y_t|y_p) = (y_t \log y_p + (1 - y_t) \log (1 - y_p))

This extends to the multiclass case as follows.
Let the true labels for a set of samples
be encoded as a 1-of-K binary indicator matrix :math:`T`,
i.e. :math:`t_{i,k} = 1` if sample :math:`i` has label :math:`k`
taken from a set of :math:`K` labels.
Let :math:`Y` be a matrix of probability estimates,
with :math:`y_{i,k} = P(t_{i,k} = 1)`.
Then the total log-likelihood of the whole set is

.. math::

L(T, Y) = \log P(T|Y) = \sum_i \sum_k t_{i,k} \log y_{i,k}

The functions :func:`log_likelihood_score` and :func:`log_loss`
compute either total or mean log-likelihood/loss
given a list of ground-truth labels and a probability matrix,
as returned by an estimator's ``predict_proba`` method.

>>> from sklearn.metrics import log_likelihood_score, log_loss
>>> y_true = [0, 0, 1, 1]
>>> y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]]
>>> log_likelihood_score(y_true, y_pred) # doctest: +ELLIPSIS
-0.1738...
>>> log_loss(y_true, y_pred) # doctest: +ELLIPSIS
0.1738...

The first ``[.9, .1]`` in ``y_pred``
denotes 90% probability that the first sample has label 0.
Log-likelihood is negative or zero (with zero meaning perfect predictions).
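
The value above follows directly from the binary formula. As a quick sketch
in plain NumPy (independent of the metrics module), pick the probability each
row assigns to its true label and average the logs::

    import numpy as np

    y_true = np.array([0, 0, 1, 1])
    y_pred = np.array([[.9, .1], [.8, .2], [.3, .7], [.01, .99]])

    # Probability each row assigns to its true class.
    p_true = y_pred[np.arange(len(y_true)), y_true]   # [0.9, 0.8, 0.7, 0.99]
    print(np.log(p_true).mean())                      # ~ -0.1738 (log-likelihood)
    print(-np.log(p_true).mean())                     # ~  0.1738 (log loss)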


Matthews correlation coefficient
.................................

@@ -1031,6 +1080,7 @@ Scoring Function
'accuracy' :func:`sklearn.metrics.accuracy_score`
'average_precision' :func:`sklearn.metrics.average_precision_score`
'f1' :func:`sklearn.metrics.f1_score`
'log_likelihood' :func:`sklearn.metrics.log_likelihood_score`
'precision' :func:`sklearn.metrics.precision_score`
'recall' :func:`sklearn.metrics.recall_score`
'roc_auc' :func:`sklearn.metrics.auc_score`
3 changes: 3 additions & 0 deletions doc/whats_new.rst
@@ -99,6 +99,9 @@ Changelog
the fraction or the number of correctly classified samples
by `Arnaud Joly`_.

- Added :func:`metrics.log_loss` that computes log loss, aka cross-entropy
loss. By Jochen Wersdörfer and `Lars Buitinck`_.

- A bug that caused :class:`ensemble.AdaBoostClassifier` to output
incorrect probabilities has been fixed.

3 changes: 3 additions & 0 deletions sklearn/metrics/__init__.py
@@ -15,6 +15,8 @@
hamming_loss,
hinge_loss,
jaccard_similarity_score,
log_likelihood_score,
log_loss,
matthews_corrcoef,
mean_squared_error,
mean_absolute_error,
@@ -68,6 +70,7 @@
'homogeneity_completeness_v_measure',
'homogeneity_score',
'jaccard_similarity_score',
'log_loss',
'matthews_corrcoef',
'mean_squared_error',
'mean_absolute_error',
91 changes: 90 additions & 1 deletion sklearn/metrics/metrics.py
@@ -23,7 +23,7 @@
from scipy.spatial.distance import hamming as sp_hamming

from ..externals.six.moves import zip
from ..preprocessing import LabelBinarizer
from ..preprocessing import LabelBinarizer, label_binarize
from ..utils import check_arrays
from ..utils import deprecated
from ..utils.fixes import divide
@@ -2111,3 +2111,92 @@ def r2_score(y_true, y_pred):
return 0.0

return 1 - numerator / denominator


def log_likelihood_score(y_true, y_pred, eps=1e-15, normalize=True):
"""Log-likelihood of the model that generated y_pred.

This function returns the log of the probability of y_true being the true
labels under a probability model that returned y_pred, also known as the
log-likelihood of the model. Log-likelihood is the most common optimization
objective for probability models such as logistic regression.

For a single sample with true label yt in {0,1} and estimated probability
yp that yt = 1, the log-likelihood is

log P(yt|yp) = (yt log(yp) + (1 - yt) log(1 - yp))

Note that log-probabilities are <= 0 with 0 meaning perfect predictions.

Parameters
----------
y_true : array-like or list of labels or label indicator matrix
Ground truth (correct) labels for n_samples samples.

y_pred : array-like of float, shape = (n_samples, n_classes)
Predicted probabilities, as returned by a classifier's
predict_proba method.

Member:
I think I would prefer to use the output of decision_function. This way, we will be able to compute the loss value of linear classifiers without predict_proba support. As indicated in Eq. (4.104), we just need to apply softmax to the output of decision_function.
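
Concretely, that suggestion amounts to something along these lines. A rough
sketch only: ``clf``, ``X_test`` and ``y_test`` are assumed to exist, and the
softmax helper is written out by hand rather than taken from any library:

    import numpy as np

    def softmax(scores):
        """Row-wise softmax over an (n_samples, n_classes) score matrix."""
        scores = scores - scores.max(axis=1)[:, np.newaxis]   # numerical stability
        exp_scores = np.exp(scores)
        return exp_scores / exp_scores.sum(axis=1)[:, np.newaxis]

    # Hypothetical usage with a fitted multiclass linear classifier `clf`:
    #     margins = clf.decision_function(X_test)   # assumed shape (n_samples, n_classes)
    #     proba = softmax(margins)
    #     loss = log_loss(y_test, proba)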

Member Author:
I've thought of doing that, but it would only produce meaningful results for LR-like models. When used on, say, an `SVC(probability=True)`, an `SGDClassifier` with the modified Huber loss or AdaBoost, the results come out wrong. If the function takes predict_proba output, it can work for any probability model, and it can be used in a self-training EM loop, which is why I wanted this function in the first place.

Member:
I see, thanks. For the record, the version I implemented is Eq. (12) (without the regularization term) in http://www.mblondel.org/publications/mblondel-mlj2013.pdf.


eps : float
Log loss is undefined for p=0 or p=1, so probabilities are
clipped to max(eps, min(1 - eps, p)).

normalize : bool, optional (default=True)
If true, return the mean loss per sample.
Otherwise, return the total loss.

Returns
-------
score : float

Examples
--------
>>> log_likelihood_score(["spam", "ham", "ham", "spam"], # doctest: +ELLIPSIS
... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])
-0.21616...

References
----------
C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
p. 209.

See also
--------
log_loss

Notes
-----
The logarithm used is the natural logarithm (base-e).
"""
lb = LabelBinarizer()
T = lb.fit_transform(y_true)
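# LabelBinarizer returns a single column for binary problems; expand it to two.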
if T.shape[1] == 1:
T = np.append(1 - T, T, axis=1)

# Clip and renormalize
Y = np.clip(y_pred, eps, 1 - eps)
Y /= Y.sum(axis=1)[:, np.newaxis]

loss = (T * np.log(Y)).sum()
return loss / T.shape[0] if normalize else loss


def log_loss(y_true, y_pred, eps=1e-15, normalize=True):
"""Log loss, aka logistic loss or cross-entropy loss.

This is the loss function used in logistic regression and other
probability models, defined as the negative log-likelihood of a model's
predictions given the ground-truth labels. For a single sample with true label
yt in {0,1} and estimated probability yp that yt = 1, the log loss is

-log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))

See log_likelihood_score for the parameters.

Examples
--------
>>> log_loss(["spam", "ham", "ham", "spam"], # doctest: +ELLIPSIS
... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])
0.21616...
"""
return -log_likelihood_score(y_true, y_pred, eps, normalize)
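
A side note on the ``eps`` parameter of these functions: clipping is what keeps
the result finite when a classifier emits hard 0/1 probabilities, capping the
penalty for a confidently wrong prediction at about -log(eps) ≈ 34.5 nats for
the default eps=1e-15. A standalone sketch that mirrors the arithmetic of
log_likelihood_score (with a hand-rolled 1-of-K encoding in place of
LabelBinarizer):

    import numpy as np

    eps = 1e-15
    y_true = np.array([0, 1])
    y_pred = np.array([[1.0, 0.0],    # correct and maximally confident
                       [1.0, 0.0]])   # wrong and maximally confident

    T = np.eye(2)[y_true]                      # 1-of-K encoding of y_true
    Y = np.clip(y_pred, eps, 1 - eps)          # avoid log(0)
    Y /= Y.sum(axis=1)[:, np.newaxis]          # renormalize rows
    print(-(T * np.log(Y)).sum() / len(y_true))   # ~17.3; without clipping: inf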
6 changes: 5 additions & 1 deletion sklearn/metrics/scorer.py
@@ -23,7 +23,7 @@

from . import (r2_score, mean_squared_error, accuracy_score, f1_score,
auc_score, average_precision_score, precision_score,
recall_score)
recall_score, log_likelihood_score)

from .cluster import adjusted_rand_score
from ..externals import six
@@ -224,6 +224,9 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
precision_scorer = make_scorer(precision_score)
recall_scorer = make_scorer(recall_score)

# Score function for probabilistic classification
log_likelihood_scorer = make_scorer(log_likelihood_score, needs_proba=True)

# Clustering scores
adjusted_rand_scorer = make_scorer(adjusted_rand_score)

@@ -232,4 +235,5 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False,
accuracy=accuracy_scorer, f1=f1_scorer, roc_auc=auc_scorer,
average_precision=average_precision_scorer,
precision=precision_scorer, recall=recall_scorer,
log_likelihood=log_likelihood_scorer,
adjusted_rand_score=adjusted_rand_scorer)
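
With the scorer registered above, the metric becomes available through the
``scoring`` parameter of the cross-validation helpers. A rough usage sketch,
assuming this branch is installed; the dataset and classifier are only
illustrative:

    from sklearn import cross_validation, datasets
    from sklearn.linear_model import LogisticRegression

    iris = datasets.load_iris()
    clf = LogisticRegression()

    # Mean per-sample log-likelihood on each fold; higher (closer to 0) is better.
    scores = cross_validation.cross_val_score(clf, iris.data, iris.target,
                                              scoring='log_likelihood')
    print(scores)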
28 changes: 28 additions & 0 deletions sklearn/metrics/tests/test_metrics.py
@@ -36,6 +36,7 @@
hamming_loss,
hinge_loss,
jaccard_similarity_score,
log_loss,
matthews_corrcoef,
mean_squared_error,
mean_absolute_error,
@@ -1801,3 +1802,30 @@ def test__column_or_1d():
assert_array_equal(_column_or_1d(y), np.ravel(y))
else:
assert_raises(ValueError, _column_or_1d, y)


def test_log_loss():
# binary case with symbolic labels ("no" < "yes")
y_true = ["no", "no", "no", "yes", "yes", "yes"]

Member:
Is supporting string format necessary for metrics?
(It is more of a general question.)

Member Author:
I don't believe so, but it fell out of LabelBinarizer use.

Member:
The user could use LabelBinarizer outside of log_loss?

Member Author:
Of course, but in the binary case, LabelBinarizer returns a vector (1-d) of booleans while predict_proba returns an n_samples × 2 matrix. We'd have to document that special case and handle it regardless, so it wouldn't be a win in terms of code simplicity or ease of use.

Inconsistencies have crept into the scikit-learn API, and I'm dealing with them here so the user doesn't have to :(

Member:
When classes are inferred, do we consider that the class order is ascending, e.g. ("no" < "yes")?
(general api question)

Member:
Answering my own question: http://scikit-learn.org/dev/developers/index.html#specific-models

y_pred = np.array([[0.5, 0.5], [0.1, 0.9], [0.01, 0.99],
[0.9, 0.1], [0.75, 0.25], [0.001, 0.999]])
loss = log_loss(y_true, y_pred)
assert_almost_equal(loss, 1.8817971)

# multiclass case; adapted from http://bit.ly/RJJHWA
y_true = [1, 0, 2]
y_pred = [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]]
loss = log_loss(y_true, y_pred, normalize=True)
assert_almost_equal(loss, 0.6904911)

# check that we got all the shapes and axes right
# by doubling the length of y_true and y_pred
y_true *= 2
y_pred *= 2
loss = log_loss(y_true, y_pred, normalize=False)
assert_almost_equal(loss, 0.6904911 * 6, decimal=6)

# check eps and handling of absolute zero and one probabilities
y_pred = np.asarray(y_pred) > .5
loss = log_loss(y_true, y_pred, normalize=True, eps=.1)
assert_almost_equal(loss, log_loss(y_true, np.clip(y_pred, .1, .9)))
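
The expected constants in this test can be re-derived from the definition. A
short plain-NumPy check (classes order as "no" < "yes", so the probability of
each sample's true label is read off the corresponding column):

    import numpy as np

    # Binary case: probabilities assigned to the true label of each sample.
    p_true = np.array([0.5, 0.1, 0.01, 0.1, 0.25, 0.999])
    print(-np.log(p_true).mean())    # ~1.8817971

    # Multiclass case: probability assigned to the true class of each sample.
    p_true = np.array([0.7, 0.6, 0.3])
    print(-np.log(p_true).mean())    # ~0.6904911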
33 changes: 6 additions & 27 deletions sklearn/metrics/tests/test_score_objects.py
@@ -4,7 +4,8 @@
from sklearn.utils.testing import assert_almost_equal, assert_array_equal
from sklearn.utils.testing import assert_raises

from sklearn.metrics import f1_score, r2_score, auc_score, fbeta_score
from sklearn.metrics import (f1_score, r2_score, auc_score, fbeta_score,
log_loss)
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import make_scorer, SCORERS
from sklearn.svm import LinearSVC
@@ -59,32 +60,6 @@ def test_regression_scorers():
assert_almost_equal(score1, score2)


def test_proba_scorer():
"""Non-regression test for _ProbaScorer (which did not have __call__)."""
# This test can be removed once we have an actual scorer that uses
# predict_proba, e.g. by merging #2013.
def log_loss(y, p):
"""Binary log loss with labels in {0, 1}."""
return -(y * np.log(p) + (1 - y) * np.log(1 - p))

log_loss_scorer = make_scorer(log_loss, greater_is_better=False,
needs_proba=True)

class DiscreteUniform(object):
def __init__(self, n_classes):
self.n_classes = n_classes

def predict_proba(self, X):
n = self.n_classes
return np.repeat(1. / n, n)

estimator = DiscreteUniform(5)
y_true = np.array([0, 0, 1, 1, 1])
y_pred = estimator.predict_proba(None)
assert_array_equal(log_loss(y_true, y_pred),
-log_loss_scorer(estimator, None, y_true))


def test_thresholded_scorers():
"""Test scorers that take thresholds."""
X, y = make_blobs(random_state=0, centers=2)
@@ -97,6 +72,10 @@ def test_thresholded_scorers():
assert_almost_equal(score1, score2)
assert_almost_equal(score1, score3)

logscore = SCORERS['log_likelihood'](clf, X_test, y_test)
logloss = log_loss(y_test, clf.predict_proba(X_test))
assert_almost_equal(-logscore, logloss)

# same for an estimator without decision_function
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)