scikit-learn
Lines changed: 71 additions & 62 deletions
diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py
--- a/sklearn/metrics/tests/test_metrics.py
+++ b/sklearn/metrics/tests/test_metrics.py
@@ -1,6 +1,5 @@
 from __future__ import division
 
-import random
 import warnings
 import numpy as np
 
@@ -9,8 +8,7 @@
 
 from sklearn.preprocessing import LabelBinarizer
 from sklearn.datasets import make_multilabel_classification
-from sklearn.utils import (check_random_state,
-                           shuffle)
+from sklearn.utils import check_random_state, shuffle
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.testing import (assert_true,
                                    assert_raises,
@@ -85,8 +83,8 @@ def make_prediction(dataset=None, binary=False):
     n_samples, n_features = X.shape
     p = np.arange(n_samples)
 
-    random.seed(0)
-    random.shuffle(p)
+    rng = check_random_state(37)
+    rng.shuffle(p)
     X, y = X[p], y[p]
     half = int(n_samples / 2)
 
@@ -114,7 +112,7 @@ def test_roc_curve():
 
     fpr, tpr, thresholds = roc_curve(y_true, probas_pred)
     roc_auc = auc(fpr, tpr)
-    assert_array_almost_equal(roc_auc, 0.80, decimal=2)
+    assert_array_almost_equal(roc_auc, 0.90, decimal=2)
     assert_almost_equal(roc_auc, auc_score(y_true, probas_pred))
 
 
@@ -159,7 +157,7 @@ def test_roc_curve_confidence():
 
     fpr, tpr, thresholds = roc_curve(y_true, probas_pred - 0.5)
     roc_auc = auc(fpr, tpr)
-    assert_array_almost_equal(roc_auc, 0.80, decimal=2)
+    assert_array_almost_equal(roc_auc, 0.90, decimal=2)
 
 
 def test_roc_curve_hard():
@@ -181,7 +179,7 @@ def test_roc_curve_hard():
     # hard decisions
     fpr, tpr, thresholds = roc_curve(y_true, pred)
     roc_auc = auc(fpr, tpr)
-    assert_array_almost_equal(roc_auc, 0.74, decimal=2)
+    assert_array_almost_equal(roc_auc, 0.78, decimal=2)
 
 
 def test_roc_curve_one_label():
@@ -245,7 +243,8 @@ def test_auc_score_non_binary_class():
     """Test that auc_score function returns an error when trying to compute AUC
     for non-binary class values.
     """
-    y_pred = np.random.rand(10)
+    rng = check_random_state(404)
+    y_pred = rng.rand(10)
     # y_true contains only one class value
     y_true = np.zeros(10, dtype="int")
     assert_raise_message(ValueError, "AUC is defined for binary "
@@ -257,7 +256,7 @@ def test_auc_score_non_binary_class():
     assert_raise_message(ValueError, "AUC is defined for binary "
                          "classification only", auc_score, y_true, y_pred)
     # y_true contains three different class values
-    y_true = np.random.randint(0, 3, size=10)
+    y_true = rng.randint(0, 3, size=10)
     assert_raise_message(ValueError, "AUC is defined for binary "
                          "classification only", auc_score, y_true, y_pred)
 
@@ -268,22 +267,22 @@ def test_precision_recall_f1_score_binary():
 
     # detailed measures for each class
     p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
-    assert_array_almost_equal(p, [0.73, 0.75], 2)
-    assert_array_almost_equal(r, [0.76, 0.72], 2)
-    assert_array_almost_equal(f, [0.75, 0.74], 2)
+    assert_array_almost_equal(p, [0.73, 0.85], 2)
+    assert_array_almost_equal(r, [0.88, 0.68], 2)
+    assert_array_almost_equal(f, [0.80, 0.76], 2)
     assert_array_equal(s, [25, 25])
 
     # individual scoring function that can be used for grid search: in the
     # binary class case the score is the value of the measure for the positive
     # class (e.g. label == 1)
     ps = precision_score(y_true, y_pred)
-    assert_array_almost_equal(ps, 0.75, 2)
+    assert_array_almost_equal(ps, 0.85, 2)
 
     rs = recall_score(y_true, y_pred)
-    assert_array_almost_equal(rs, 0.72, 2)
+    assert_array_almost_equal(rs, 0.68, 2)
 
     fs = f1_score(y_true, y_pred)
-    assert_array_almost_equal(fs, 0.74, 2)
+    assert_array_almost_equal(fs, 0.76, 2)
 
 
 def test_average_precision_score_duplicate_values():
@@ -331,7 +330,7 @@ def test_confusion_matrix_binary():
     y_true, y_pred, _ = make_prediction(binary=True)
 
     cm = confusion_matrix(y_true, y_pred)
-    assert_array_equal(cm, [[19, 6], [7, 18]])
+    assert_array_equal(cm, [[22, 3], [8, 17]])
 
     tp = cm[0, 0]
     tn = cm[1, 1]
@@ -345,7 +344,7 @@ def test_confusion_matrix_binary():
         true_mcc = num / den
     mcc = matthews_corrcoef(y_true, y_pred)
     assert_array_almost_equal(mcc, true_mcc, decimal=2)
-    assert_array_almost_equal(mcc, 0.48, decimal=2)
+    assert_array_almost_equal(mcc, 0.57, decimal=2)
 
 
 def test_matthews_corrcoef_nan():
@@ -360,46 +359,46 @@ def test_precision_recall_f1_score_multiclass():
 
     # compute scores with default labels introspection
     p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
-    assert_array_almost_equal(p, [0.82, 0.55, 0.47], 2)
-    assert_array_almost_equal(r, [0.92, 0.17, 0.90], 2)
-    assert_array_almost_equal(f, [0.87, 0.26, 0.62], 2)
-    assert_array_equal(s, [25, 30, 20])
+    assert_array_almost_equal(p, [0.83, 0.33, 0.42], 2)
+    assert_array_almost_equal(r, [0.79, 0.09, 0.90], 2)
+    assert_array_almost_equal(f, [0.81, 0.15, 0.57], 2)
+    assert_array_equal(s, [24, 31, 20])
 
     # averaging tests
     ps = precision_score(y_true, y_pred, pos_label=1, average='micro')
-    assert_array_almost_equal(ps, 0.61, 2)
+    assert_array_almost_equal(ps, 0.53, 2)
 
     rs = recall_score(y_true, y_pred, average='micro')
-    assert_array_almost_equal(rs, 0.61, 2)
+    assert_array_almost_equal(rs, 0.53, 2)
 
     fs = f1_score(y_true, y_pred, average='micro')
-    assert_array_almost_equal(fs, 0.61, 2)
+    assert_array_almost_equal(fs, 0.53, 2)
 
     ps = precision_score(y_true, y_pred, average='macro')
-    assert_array_almost_equal(ps, 0.62, 2)
+    assert_array_almost_equal(ps, 0.53, 2)
 
     rs = recall_score(y_true, y_pred, average='macro')
-    assert_array_almost_equal(rs, 0.66, 2)
+    assert_array_almost_equal(rs, 0.60, 2)
 
     fs = f1_score(y_true, y_pred, average='macro')
-    assert_array_almost_equal(fs, 0.58, 2)
+    assert_array_almost_equal(fs, 0.51, 2)
 
     ps = precision_score(y_true, y_pred, average='weighted')
-    assert_array_almost_equal(ps, 0.62, 2)
+    assert_array_almost_equal(ps, 0.51, 2)
 
     rs = recall_score(y_true, y_pred, average='weighted')
-    assert_array_almost_equal(rs, 0.61, 2)
+    assert_array_almost_equal(rs, 0.53, 2)
 
     fs = f1_score(y_true, y_pred, average='weighted')
-    assert_array_almost_equal(fs, 0.55, 2)
+    assert_array_almost_equal(fs, 0.47, 2)
 
     # same prediction but with and explicit label ordering
     p, r, f, s = precision_recall_fscore_support(
         y_true, y_pred, labels=[0, 2, 1], average=None)
-    assert_array_almost_equal(p, [0.82, 0.47, 0.55], 2)
-    assert_array_almost_equal(r, [0.92, 0.90, 0.17], 2)
-    assert_array_almost_equal(f, [0.87, 0.62, 0.26], 2)
-    assert_array_equal(s, [25, 20, 30])
+    assert_array_almost_equal(p, [0.83, 0.41, 0.33], 2)
+    assert_array_almost_equal(r, [0.79, 0.90, 0.10], 2)
+    assert_array_almost_equal(f, [0.81, 0.57, 0.15], 2)
+    assert_array_equal(s, [24, 20, 31])
 
 
 def test_precision_recall_f1_score_multiclass_pos_label_none():
@@ -443,15 +442,15 @@ def test_confusion_matrix_multiclass():
 
     # compute confusion matrix with default labels introspection
     cm = confusion_matrix(y_true, y_pred)
-    assert_array_equal(cm, [[23, 2, 0],
-                            [5, 5, 20],
+    assert_array_equal(cm, [[19, 4, 1],
+                            [4, 3, 24],
                             [0, 2, 18]])
 
     # compute confusion matrix with explicit label ordering
     cm = confusion_matrix(y_true, y_pred, labels=[0, 2, 1])
-    assert_array_equal(cm, [[23, 0, 2],
+    assert_array_equal(cm, [[19, 1, 4],
                             [0, 18, 2],
-                            [5, 20, 5]])
+                            [4, 24, 3]])
 
 
 def test_confusion_matrix_multiclass_subset_labels():
@@ -460,14 +459,14 @@ def test_confusion_matrix_multiclass_subset_labels():
 
     # compute confusion matrix with only first two labels considered
     cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
-    assert_array_equal(cm, [[23, 2],
-                            [5, 5]])
+    assert_array_equal(cm, [[19, 4],
+                            [4, 3]])
 
     # compute confusion matrix with explicit label ordering for only subset
     # of labels
     cm = confusion_matrix(y_true, y_pred, labels=[2, 1])
     assert_array_equal(cm, [[18, 2],
-                            [20, 5]])
+                            [24, 3]])
 
 
 def test_classification_report():
@@ -479,11 +478,11 @@ def test_classification_report():
     expected_report = """\
              precision    recall  f1-score   support
 
-     setosa       0.82      0.92      0.87        25
- versicolor       0.56      0.17      0.26        30
-  virginica       0.47      0.90      0.62        20
+     setosa       0.83      0.79      0.81        24
+ versicolor       0.33      0.10      0.15        31
+  virginica       0.42      0.90      0.57        20
 
-avg / total       0.62      0.61      0.56        75
+avg / total       0.51      0.53      0.47        75
 """
     report = classification_report(
         y_true, y_pred, labels=np.arange(len(iris.target_names)),
@@ -499,6 +498,15 @@ def test_classification_report():
           2       0.47      0.90      0.62        20
 
 avg / total       0.62      0.61      0.56        75
+"""
+    expected_report = """\
+             precision    recall  f1-score   support
+
+          0       0.83      0.79      0.81        24
+          1       0.33      0.10      0.15        31
+          2       0.42      0.90      0.57        20
+
+avg / total       0.51      0.53      0.47        75
 """
     report = classification_report(y_true, y_pred)
     assert_equal(report, expected_report)
@@ -526,7 +534,7 @@ def _test_precision_recall_curve(y_true, probas_pred):
     """Test Precision-Recall and aread under PR curve"""
     p, r, thresholds = precision_recall_curve(y_true, probas_pred)
     precision_recall_auc = auc(r, p)
-    assert_array_almost_equal(precision_recall_auc, 0.82, 2)
+    assert_array_almost_equal(precision_recall_auc, 0.85, 2)
     assert_array_almost_equal(precision_recall_auc,
                               average_precision_score(y_true, probas_pred))
     # Smoke test in the case of proba having only one value
@@ -570,18 +578,18 @@ def test_losses():
     # --------------
     with warnings.catch_warnings(record=True):
     # Throw deprecated warning
-        assert_equal(zero_one(y_true, y_pred), 13)
+        assert_equal(zero_one(y_true, y_pred), 11)
         assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
-                            13 / float(n_samples), 2)
+                            11 / float(n_samples), 2)
 
     assert_almost_equal(zero_one_loss(y_true, y_pred),
-                        13 / float(n_samples), 2)
-    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
+                        11 / float(n_samples), 2)
+    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 11)
     assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
     assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)
 
     assert_almost_equal(hamming_loss(y_true, y_pred),
-                        2 * 13. / (n_samples * n_classes), 2)
+                        2 * 11. / (n_samples * n_classes), 2)
 
     assert_equal(accuracy_score(y_true, y_pred),
                  1 - zero_one_loss(y_true, y_pred))
@@ -597,21 +605,21 @@ def test_losses():
     # Regression
     # ----------
     assert_almost_equal(mean_squared_error(y_true, y_pred),
-                        12.999 / n_samples, 2)
+                        10.999 / n_samples, 2)
     assert_almost_equal(mean_squared_error(y_true, y_true),
                         0.00, 2)
 
     # mean_absolute_error and mean_squared_error are equal because
     # it is a binary problem.
     assert_almost_equal(mean_absolute_error(y_true, y_pred),
-                        12.999 / n_samples, 2)
+                        10.999 / n_samples, 2)
     assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)
 
-    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
+    assert_almost_equal(explained_variance_score(y_true, y_pred), 0.16, 2)
     assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
     assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)
 
-    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
+    assert_almost_equal(r2_score(y_true, y_pred), 0.12, 2)
     assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
     assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
     assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
@@ -826,11 +834,12 @@ def test_multioutput_regression_invariance_to_dimension_shuffling():
     y_true = np.reshape(y_true, (-1, n_dims))
     y_pred = np.reshape(y_pred, (-1, n_dims))
 
+    rng = check_random_state(314159)
     for metric in [r2_score, mean_squared_error, mean_absolute_error]:
         error = metric(y_true, y_pred)
 
         for _ in xrange(3):
-            perm = np.random.permutation(n_dims)
+            perm = rng.permutation(n_dims)
             assert_almost_equal(error,
                                 metric(y_true[:, perm], y_pred[:, perm]))
 
@@ -855,14 +864,14 @@ def test_multilabel_representation_invariance():
 
     # NOTE: The "sorted" trick is necessary to shuffle labels, because it
     # allows to return the shuffled tuple.
-    py_random_state = random.Random(0)
-    shuffled = lambda x: sorted(x, key=lambda *args: py_random_state.random())
+    rng = check_random_state(42)
+    shuffled = lambda x: sorted(x, key=lambda *args: rng.rand())
     y1_shuffle = [shuffled(x) for x in y1]
     y2_shuffle = [shuffled(x) for x in y2]
 
-    # Let's have redundant label
-    y1_redundant = [x * py_random_state.randint(1, 3) for x in y1]
-    y2_redundant = [x * py_random_state.randint(1, 3) for x in y2]
+    # Let's have redundant labels
+    y1_redundant = [x * rng.randint(1, 4) for x in y1]
+    y2_redundant = [x * rng.randint(1, 4) for x in y2]
 
     # Binary indicator matrix format
     lb = LabelBinarizer().fit([range(n_classes)])