Merge pull request #2804 from arjoly/lrap · scikit-learn/scikit-learn@bd1686b · GitHub

Commit bd1686b

Merge pull request #2804 from arjoly/lrap
[MRG+1] Label ranking average precision
2 parents 46613e2 + b0995f9 commit bd1686b

File tree

7 files changed, +430 -2 lines changed

doc/modules/classes.rst

Lines changed: 12 additions & 0 deletions
@@ -770,6 +770,18 @@ details.
    metrics.mean_squared_error
    metrics.r2_score
 
+Multilabel ranking metrics
+--------------------------
+See the :ref:`multilabel_ranking_metrics` section of the user guide for further
+details.
+
+.. autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   metrics.label_ranking_average_precision_score
+
+
 Clustering metrics
 ------------------

doc/modules/model_evaluation.rst

Lines changed: 52 additions & 0 deletions
@@ -900,6 +900,58 @@ In the multilabel case with binary label indicators: ::
     elimination with cross-validation.
 
 
+.. _multilabel_ranking_metrics:
+
+Multilabel ranking metrics
+--------------------------
+
+.. currentmodule:: sklearn.metrics
+
+In multilabel learning, each sample can have any number of ground truth labels
+associated with it. The goal is to give high scores and better rank to
+the ground truth labels.
+
+Label ranking average precision
+...............................
+The :func:`label_ranking_average_precision_score` function
+implements the label ranking average precision (LRAP). This metric is linked to
+the :func:`average_precision_score` function, but is based on the notion of
+label ranking instead of precision and recall.
+
+Label ranking average precision (LRAP) is the average over each ground truth
+label assigned to each sample, of the ratio of true vs. total labels with lower
+score. This metric will yield a better score if you are able to give a better
+rank to the labels associated with each sample. The obtained score is always
+strictly greater than 0, and the best value is 1. If there is exactly one
+relevant label per sample, label ranking average precision is equivalent to the
+`mean reciprocal rank <http://en.wikipedia.org/wiki/Mean_reciprocal_rank>`_.
+
+Formally, given a binary indicator matrix of the ground truth labels
+:math:`y \in \mathcal{R}^{n_\text{samples} \times n_\text{labels}}` and the
+score associated with each label
+:math:`\hat{f} \in \mathcal{R}^{n_\text{samples} \times n_\text{labels}}`,
+the average precision is defined as
+
+.. math::
+  LRAP(y, \hat{f}) = \frac{1}{n_{\text{samples}}}
+    \sum_{i=0}^{n_{\text{samples}} - 1} \frac{1}{|y_i|}
+    \sum_{j:y_{ij} = 1} \frac{|\mathcal{L}_{ij}|}{\text{rank}_{ij}}
+
+
+with :math:`\mathcal{L}_{ij} = \left\{k: y_{ik} = 1, \hat{f}_{ik} \geq \hat{f}_{ij} \right\}`,
+:math:`\text{rank}_{ij} = \left|\left\{k: \hat{f}_{ik} \geq \hat{f}_{ij} \right\}\right|`
+and :math:`|\cdot|` is the :math:`\ell_0` norm or the cardinality of the set.
+
+Here is a small example of usage of this function::
+
+  >>> import numpy as np
+  >>> from sklearn.metrics import label_ranking_average_precision_score
+  >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
+  >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
+  >>> label_ranking_average_precision_score(y_true, y_score) # doctest: +ELLIPSIS
+  0.416...
+
+
 .. _regression_metrics:
 
 Regression metrics
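
As a sanity check on the definition above, the formula can be transcribed
directly in NumPy. This is only an illustrative sketch (the helper name
lrap_by_hand is made up here and is not part of scikit-learn); it reproduces
the 0.416... doctest value by computing |L_ij| and rank_ij with explicit
comparisons instead of the rankdata-based implementation:

    import numpy as np

    def lrap_by_hand(y_true, y_score):
        """Direct transcription of the LRAP formula above (didactic only)."""
        n_samples, n_labels = y_true.shape
        total = 0.0
        for i in range(n_samples):
            relevant = np.flatnonzero(y_true[i])
            if relevant.size == 0 or relevant.size == n_labels:
                # Degenerate case: the ranking is meaningless, count the sample as 1.
                total += 1.0
                continue
            sample_score = 0.0
            for j in relevant:
                # rank_ij: number of labels scored at least as high as label j
                rank_ij = np.sum(y_score[i] >= y_score[i, j])
                # |L_ij|: the same count, restricted to the relevant labels
                L_ij = np.sum(y_score[i, relevant] >= y_score[i, j])
                sample_score += L_ij / rank_ij
            total += sample_score / relevant.size
        return total / n_samples

    y_true = np.array([[1, 0, 0], [0, 0, 1]])
    y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
    print(lrap_by_hand(y_true, y_score))  # 0.4166..., matching the doctest above

For the first sample, the only relevant label (score 0.75) is outranked by one
irrelevant label (score 1), giving 1/2; for the second sample, the relevant
label is ranked last, giving 1/3; the mean of the two is 0.4166...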

sklearn/metrics/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
     hamming_loss,
     hinge_loss,
     jaccard_similarity_score,
+    label_ranking_average_precision_score,
     log_loss,
     matthews_corrcoef,
     mean_squared_error,
@@ -72,6 +73,7 @@
     'homogeneity_completeness_v_measure',
     'homogeneity_score',
     'jaccard_similarity_score',
+    'label_ranking_average_precision_score',
     'log_loss',
     'matthews_corrcoef',
     'mean_squared_error',
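
These two hunks only re-export the new metric at the package level. A minimal
usage sketch of the resulting public import path, using a made-up perfectly
ranked example to illustrate the best attainable score of 1:

    import numpy as np
    from sklearn.metrics import label_ranking_average_precision_score

    # Every relevant label outscores every irrelevant one in each row,
    # so the metric reaches its best value of 1.0.
    y_true = np.array([[1, 0, 0], [0, 1, 1]])
    y_score = np.array([[0.9, 0.2, 0.1], [0.1, 0.8, 0.7]])
    print(label_ranking_average_precision_score(y_true, y_score))  # 1.0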

sklearn/metrics/metrics.py

Lines changed: 70 additions & 0 deletions
@@ -36,6 +36,7 @@
 from ..utils.multiclass import unique_labels
 from ..utils.multiclass import type_of_target
 from ..utils.fixes import isclose
+from ..utils.stats import rankdata
 
 
 ###############################################################################
@@ -2144,6 +2145,75 @@ def hamming_loss(y_true, y_pred, classes=None):
     raise ValueError("{0} is not supported".format(y_type))
 
 
+def label_ranking_average_precision_score(y_true, y_score):
+    """Compute ranking-based average precision
+
+    Label ranking average precision (LRAP) is the average over each ground
+    truth label assigned to each sample, of the ratio of true vs. total
+    labels with lower score.
+
+    This metric is used in multilabel ranking problems, where the goal
+    is to give a better rank to the labels associated with each sample.
+
+    The obtained score is always strictly greater than 0 and
+    the best value is 1.
+
+    Parameters
+    ----------
+    y_true : array, shape = [n_samples, n_labels]
+        True binary labels in binary indicator format.
+
+    y_score : array, shape = [n_samples, n_labels]
+        Target scores, can either be probability estimates of the positive
+        class, confidence values, or binary decisions.
+
+    Returns
+    -------
+    score : float
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import label_ranking_average_precision_score
+    >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
+    >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
+    >>> label_ranking_average_precision_score(y_true, y_score) \
+        # doctest: +ELLIPSIS
+    0.416...
+
+    """
+    y_true, y_score = check_arrays(y_true, y_score)
+
+    if y_true.shape != y_score.shape:
+        raise ValueError("y_true and y_score have different shape")
+
+    # Handle badly formatted arrays and the degenerate case with one label
+    y_type = type_of_target(y_true)
+    if (y_type != "multilabel-indicator"
+            and not (y_type == "binary" and y_true.ndim == 2)):
+        raise ValueError("{0} format is not supported".format(y_type))
+
+    n_samples, n_labels = y_true.shape
+
+    out = 0.
+    for i in range(n_samples):
+        relevant = y_true[i].nonzero()[0]
+
+        if (relevant.size == 0 or relevant.size == n_labels):
+            # If all labels are relevant or irrelevant, the score is
+            # equal to 1. The label ranking has no meaning.
+            out += 1.
+            continue
+
+        # Rank labels by decreasing score; 'max' ranks count ties as ">="
+        scores_i = -y_score[i]
+        true_mask = y_true[i].astype(bool)
+        # rank_ij: number of labels scored at least as high as each relevant label
+        rank = rankdata(scores_i, 'max')[true_mask]
+        # |L_ij|: the same count, restricted to the relevant labels only
+        L = rankdata(scores_i[true_mask], 'max')
+        out += np.divide(L, rank, dtype=float).mean()
+
+    return out / n_samples
+
+
 ###############################################################################
 # Regression metrics
 ###############################################################################
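
The implementation above leans on rankdata(..., 'max') over negated scores so
that ties count as "scored at least as high". The snippet below walks through
that per-sample step for the first doctest sample; it uses scipy.stats.rankdata,
assuming it behaves like the sklearn.utils.stats backport imported in the hunk:

    import numpy as np
    from scipy.stats import rankdata  # assumed equivalent to sklearn.utils.stats.rankdata

    y_true_i = np.array([1, 0, 0])
    y_score_i = np.array([0.75, 0.5, 1.0])

    scores_i = -y_score_i                 # negate so 'max' ranks mean "score >= score_j"
    true_mask = y_true_i.astype(bool)

    # array([2]): two labels score >= 0.75 (label 0 itself and the label scored 1)
    rank = rankdata(scores_i, method='max')[true_mask]
    # array([1]): among the relevant labels, only label 0 itself scores that high
    L = rankdata(scores_i[true_mask], method='max')

    print(np.divide(L, rank, dtype=float).mean())   # 0.5 for this sample
    # Averaged with the second sample's 1/3, this yields the 0.416... doctest value.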
