Merge pull request #4147 from amueller/precision_recall_unsorted_indices · scikit-learn/scikit-learn@92e1e39 · GitHub
[go: up one dir, main page]

Skip to content

Commit 92e1e39

Browse files
committed
Merge pull request #4147 from amueller/precision_recall_unsorted_indices
FIX Sort labels in precision_recall_fscore_support
2 parents ca55e74 + fd9b03a commit 92e1e39

File tree

4 files changed

+42
-1
lines changed

4 files changed

+42
-1
lines changed

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,10 @@ Bug fixes
276276
:class:`sklearn.naive_bayes.MultinomialNB` and
277277
:class:`sklearn.naive_bayes.BernoulliNB`. By `Trevor Stephens`_.
278278

279+
- Fixed a crash in :func:`metrics.precision_recall_fscore_support`
280+
when using unsorted ``labels`` in the multi-label setting.
281+
By `Andreas Müller`_.
282+
279283
API changes summary
280284
-------------------
281285

sklearn/metrics/classification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -847,7 +847,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
847847
if labels is None:
848848
labels = unique_labels(y_true, y_pred)
849849
else:
850-
labels = np.asarray(labels)
850+
labels = np.sort(labels)
851851

852852
### Calculate tp_sum, pred_sum, true_sum ###
853853

sklearn/metrics/tests/test_classification.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,20 @@ def test_precision_recall_f1_score_multiclass():
303303
assert_array_equal(s, [24, 20, 31])
304304

305305

306+
def test_precision_refcall_f1_score_multilabel_unordered_labels():
307+
# test that labels need not be sorted in the multilabel case
308+
y_true = np.array([[1, 1, 0, 0]])
309+
y_pred = np.array([[0, 0, 1, 1]])
310+
for average in ['samples', 'micro', 'macro', 'weighted', None]:
311+
p, r, f, s = precision_recall_fscore_support(
312+
y_true, y_pred, labels=[4, 1, 2, 3], warn_for=[], average=average)
313+
assert_array_equal(p, 0)
314+
assert_array_equal(r, 0)
315+
assert_array_equal(f, 0)
316+
if average is None:
317+
assert_array_equal(s, [0, 1, 1, 0])
318+
319+
306320
def test_precision_recall_f1_score_multiclass_pos_label_none():
307321
"""Test Precision Recall and F1 Score for multiclass classification task
308322

sklearn/metrics/tests/test_common.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,3 +1064,26 @@ def test_sample_weight_invariance(n_samples=50):
10641064
else:
10651065
yield (check_sample_weight_invariance, name, metric, y_true,
10661066
y_pred)
1067+
1068+
1069+
def test_no_averaging_labels():
1070+
# test labels argument when not using averaging
1071+
# in multi-class and multi-label cases
1072+
y_true_multilabel = np.array([[1, 1, 0, 0], [1, 1, 0, 0]])
1073+
y_pred_multilabel = np.array([[0, 0, 1, 1], [0, 1, 1, 0]])
1074+
y_true_multiclass = np.array([1, 2, 3])
1075+
y_pred_multiclass = np.array([1, 3, 4])
1076+
labels = np.array([4, 1, 2, 3])
1077+
_, inverse_labels = np.unique(labels, return_inverse=True)
1078+
1079+
for name in METRICS_WITH_AVERAGING:
1080+
for y_true, y_pred in [[y_true_multiclass, y_pred_multiclass],
1081+
[y_true_multilabel, y_pred_multilabel]]:
1082+
if name not in MULTILABELS_METRICS and y_pred.shape[1] > 0:
1083+
continue
1084+
1085+
metric = ALL_METRICS[name]
1086+
1087+
score_labels = metric(y_true, y_pred, labels=labels, average=None)
1088+
score = metric(y_true, y_pred, average=None)
1089+
assert_array_equal(score_labels, score[inverse_labels])

0 commit comments

Comments (0)