From 2965167944227eb7c98955ce26fd53c4085c7122 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 09:18:30 +0200 Subject: [PATCH 01/15] ENH add multilabel support to precision recall fscore --- doc/modules/model_evaluation.rst | 60 ++-- sklearn/metrics/metrics.py | 463 ++++++++++++++++++++++---- sklearn/metrics/tests/test_metrics.py | 309 ++++++++++++++++- 3 files changed, 746 insertions(+), 86 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5c8d41c033f69..55f64e830ef59 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -47,21 +47,22 @@ Others also work in the multiclass case: classification_report confusion_matrix - f1_score - fbeta_score - precision_recall_fscore_support - precision_score - recall_score + And some also work in the multilabel case: .. autosummary:: :template: function.rst - accuracy_score - hamming_loss - jaccard_similarity_score - zero_one_loss + accuracy_score + f1_score + fbeta_score + hamming_loss + jaccard_similarity_score + precision_recall_fscore_support + precision_score + recall_score + zero_one_loss Some metrics might require probability estimates of the positive class, @@ -465,11 +466,6 @@ Moreover, these notions can be further extended. The functions It can result in F1 score that is not between precision and recall. * ``None``: no averaging is performed. -.. warning:: - - Currently those functions support only the multiclass case. However the - following definitions are general and remain valid in the multilabel - case. Let's define some notations: @@ -566,15 +562,19 @@ Here an example where ``average`` is set to to ``weighted``:: >>> from sklearn import metrics >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] - >>> metrics.precision_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + >>> metrics.precision_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS 0.22... - >>> metrics.recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + >>> metrics.recall_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS 0.33... - >>> metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) # doctest: +ELLIPSIS + >>> metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + ... # doctest: +ELLIPSIS 0.23... >>> metrics.f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS 0.26... - >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + >>> metrics.precision_recall_fscore_support(y_true, y_pred, + ... average='weighted') # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) Here an example where ``average`` is set to ``None``:: @@ -582,18 +582,36 @@ Here an example where ``average`` is set to ``None``:: >>> from sklearn import metrics >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] - >>> metrics.precision_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS + >>> metrics.precision_score(y_true, y_pred, average=None) + ... # doctest: +ELLIPSIS array([ 0.66..., 0. , 0. ]) >>> metrics.recall_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) >>> metrics.f1_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS array([ 0.8, 0. , 0. ]) - >>> metrics.fbeta_score(y_true, y_pred, average=None, beta=0.5) # doctest: +ELLIPSIS + >>> metrics.fbeta_score(y_true, y_pred, average=None, beta=0.5) + ... # doctest: +ELLIPSIS array([ 0.71..., 0. , 0. 
]) - >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5) # doctest: +ELLIPSIS + >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5) + ... # doctest: +ELLIPSIS (array([ 0.66..., 0. , 0. ]), array([ 1., 0., 0.]), array([ 0.71..., 0. , 0. ]), array([2, 2, 2]...)) +Those functions also support the multilabel case. + + >>> from sklearn import metrics + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + + >>> metrics.f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.59... + >>> metrics.f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.61... + >>> metrics.f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.59... + >>> metrics.f1_score(y_true, y_pred, average=None) + array([ 0.5, 0.8, 0.5]) + Hinge loss ---------- diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index b88d37422cfe2..285c7efe0f045 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1090,15 +1090,15 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): F1 = 2 * (precision * recall) / (precision + recall) - In the multi-class case, this is the weighted average of the F1 score of - each class. + In the multi-class and multi-label case, this is the weighted average of + the F1 score of each class. Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. labels : array @@ -1106,7 +1106,9 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel classification, + it is used to infer what is a positive label in the label indicator + matrix format. average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1117,7 +1119,8 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): Average over classes (does not take imbalance into account). ``'micro'``: Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + implies that ``precision == recall == F1``. In multilabel + classification, this is true only if every sample has a label. ``'weighted'``: Average weighted by support (takes imbalance into account). Can result in F-score that is not between precision and recall. @@ -1157,6 +1160,34 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> f1_score(y_true, y_pred, average=None) array([ 0.8, 0. , 0. ]) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import f1_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.59... + >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.61... + >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.59... 
+ >>> f1_score(y_true, y_pred, average=None) + array([ 0.5, 0.8, 0.5]) + + and with a list of labels format: + + >>> from sklearn.metrics import f1_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.66... + >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.80... + >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.5 + >>> f1_score(y_true, y_pred, average=None) + array([ 1., 1., 0.]) + """ return fbeta_score(y_true, y_pred, 1, labels=labels, pos_label=pos_label, average=average) @@ -1176,10 +1207,10 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. beta: float @@ -1190,7 +1221,9 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel classification, + it is used to infer what is a positive label in the label indicator + matrix format. average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1201,7 +1234,8 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, Average over classes (does not take imbalance into account). ``'micro'``: Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + implies that ``precision == recall == F1``. In multilabel + classification, this is true only if every sample has a label. ``'weighted'``: Average weighted by support (takes imbalance into account). Can result in F-score that is not between precision and recall. @@ -1240,19 +1274,54 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, >>> from sklearn.metrics import fbeta_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] - >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + ... # doctest: +ELLIPSIS 0.23... - >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + ... # doctest: +ELLIPSIS 0.33... - >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + ... # doctest: +ELLIPSIS 0.23... - >>> fbeta_score(y_true, y_pred, average=None, beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + ... # doctest: +ELLIPSIS array([ 0.71..., 0. , 0. ]) + + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import fbeta_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + ... # doctest: +ELLIPSIS + 0.49... + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + 0.5 + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + ... # doctest: +ELLIPSIS + 0.66... 
+ >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + ... # doctest: +ELLIPSIS + array([ 0.38..., 0.71..., 0.38...]) + + and with a list of labels format: + + >>> from sklearn.metrics import f1_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + ... # doctest: +ELLIPSIS + 0.66... + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + ... # doctest: +ELLIPSIS + 0.90... + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + ... # doctest: +ELLIPSIS + 0.42... + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + array([ 1., 1., 0.]) + """ _, _, f, _ = precision_recall_fscore_support(y_true, y_pred, beta=beta, @@ -1262,6 +1331,124 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, return f +def _tp_tn_fp_fn(y_true, y_pred, labels=None, pos_label=1): + """Compute the number of true/false positives/negative for each class + + Parameters + ---------- + y_true : array-like or list of labels or label indicator matrix + Ground truth (correct) labels. + + y_pred : array-like or list of labels or label indicator matrix + Predicted labels, as returned by a classifier. + + labels : array, shape = [n_labels], optional + Integer array of labels. + + pos_label : int, 1 by default + In multilabel classification, it is used to infer what is a positive + label in the label indicator matrix format. + + Returns + ------- + true_pos : array of int, shape = [n_unique_labels] + Number of true positives + + true_neg : array of int, shape = [n_unique_labels] + Number of true negative + + false_pos : array of int, shape = [n_unique_labels] + Number of false positives + + false_pos : array of int, shape = [n_unique_labels] + Number of false positives + + Examples + -------- + In the binary case: + + >>> from sklearn.metrics.metrics import _tp_tn_fp_fn + >>> y_pred = [0, 1, 0, 0] + >>> y_true = [0, 1, 0, 1] + >>> _tp_tn_fp_fn(y_true, y_pred) + (array([2, 1]), array([1, 2]), array([1, 0]), array([0, 1])) + + In the multiclass case: + >>> y_true = np.array([0, 1, 2, 0, 1, 2]) + >>> y_pred = np.array([0, 2, 1, 0, 0, 1]) + >>> _tp_tn_fp_fn(y_true, y_pred) + (array([2, 0, 0]), array([3, 2, 3]), array([1, 2, 1]), array([0, 2, 2])) + + In the multilabel case with binary indicator format: + + >>> _tp_tn_fp_fn(np.array([[0.0, 1.0], [1.0, 1.0]]), np.zeros((2, 2))) + (array([0, 0]), array([1, 0]), array([0, 0]), array([1, 2])) + + and with a list of labels format: + + >>> _tp_tn_fp_fn([(1, 2), (3,)], [(1, 2), tuple()]) # doctest: +ELLIPSIS + (array([1, 1, 0]), array([1, 1, 1]), array([0, 0, 0]), array([0, 0, 1])) + + """ + y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True) + + if labels is None: + labels = unique_labels(y_true, y_pred) + else: + labels = np.asarray(labels, dtype=np.int) + + n_labels = labels.size + true_pos = np.zeros((n_labels), dtype=np.int) + false_pos = np.zeros((n_labels), dtype=np.int) + false_neg = np.zeros((n_labels), dtype=np.int) + + if is_multilabel(y_true): + # Handle mix representation + if type(y_true) != type(y_pred): + labels = unique_labels(y_true, y_pred) + lb = LabelBinarizer() + lb.fit([labels.tolist()]) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + + if is_label_indicator_matrix(y_true): + true_pos = np.sum(np.logical_and(y_true == pos_label, + y_pred == pos_label), axis=0) + false_pos = np.sum(np.logical_and(y_true != pos_label, + y_pred == pos_label), axis=0) + false_neg = np.sum(np.logical_and(y_true == pos_label, + y_pred != 
pos_label), axis=0) + + else: + for true, pred in zip(y_true, y_pred): + for i, label_i in enumerate(labels): + label_i_in_true = label_i in true + label_i_in_pred = label_i in pred + true_pos[i] += label_i_in_true and label_i_in_pred + false_pos[i] += not label_i_in_true and label_i_in_pred + false_neg[i] += label_i_in_true and not label_i_in_pred + + else: + y_true, y_pred = check_arrays(y_true, y_pred) + y_true, y_pred = _check_1d_array(y_true, y_pred) + + for i, label_i in enumerate(labels): + true_pos[i] = np.sum(y_pred[y_true == label_i] == label_i) + false_pos[i] = np.sum(y_pred[y_true != label_i] == label_i) + false_neg[i] = np.sum(y_pred[y_true == label_i] != label_i) + + # Compute the true_neg using the tp, fp and fn + if hasattr(y_true, "shape"): + n_samples = (np.max(y_true.shape) if _is_1d(y_true) + else y_true.shape[0]) + else: + n_samples = len(y_true) + + true_neg = n_samples - true_pos - false_pos - false_neg + + return true_pos, true_neg, false_pos, false_neg + + def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1, average=None): """Compute precision, recall, F-measure and support for each class @@ -1284,16 +1471,16 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, The support is the number of occurrences of each class in ``y_true``. - If ``pos_label is None``, this function returns the average precision, - recall and F-measure if ``average`` is one of ``'micro'``, ``'macro'``, - ``'weighted'``. + If ``pos_label is None`` and in binary classification, this function returns + the average precision, recall and F-measure if ``average`` is one of + ``'micro'``, ``'macro'``, ``'weighted'``. Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. beta : float, 1.0 by default @@ -1304,7 +1491,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel classification, + it is used to infer what is a positive label in the label indicator + matrix format. average : string, [None (default), 'micro', 'macro', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, @@ -1315,7 +1504,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, Average over classes (does not take imbalance into account). ``'micro'``: Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + implies that ``precision == recall == F1``. In multilabel + classification, this is true only if every sample has a label. ``'weighted'``: Average weighted by support (takes imbalance into account). Can result in F-score that is not between precision and recall. @@ -1365,22 +1555,52 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, >>> from sklearn.metrics import precision_recall_fscore_support >>> y_true = np.array([0, 1, 2, 0, 1, 2]) >>> y_pred = np.array([0, 2, 1, 0, 0, 1]) - >>> precision_recall_fscore_support(y_true, y_pred, average='macro')\ - # doctest: +ELLIPSIS + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + ... 
# doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) - >>> precision_recall_fscore_support(y_true, y_pred, average='micro')\ - # doctest: +ELLIPSIS + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS (0.33..., 0.33..., 0.33..., None) - >>> precision_recall_fscore_support(y_true, y_pred, average='weighted')\ - # doctest: +ELLIPSIS + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import precision_recall_fscore_support + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + ... # doctest: +ELLIPSIS + (0.44..., 1.0, 0.59..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS + (0.44..., 1.0, 0.61..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + (1.0, 0.44..., 0.59..., None) + + and with a list of labels format: + + >>> from sklearn.metrics import f1_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + ... # doctest: +ELLIPSIS + (0.66..., 0.66..., 0.66..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS + (1.0, 0.66..., 0.80..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + (0.5, 1.0, 0.5, None) + """ if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") + beta2 = beta ** 2 - y_true, y_pred = check_arrays(y_true, y_pred) - y_true, y_pred = _check_1d_array(y_true, y_pred) + y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True) if labels is None: labels = unique_labels(y_true, y_pred) @@ -1388,16 +1608,62 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, labels = np.asarray(labels, dtype=np.int) n_labels = labels.size - true_pos = np.empty(n_labels, dtype=np.long) - false_pos = np.empty(n_labels, dtype=np.long) - false_neg = np.empty(n_labels, dtype=np.long) - support = np.empty(n_labels, dtype=np.long) - for i, label_i in enumerate(labels): - true_pos[i] = np.sum(y_pred[y_true == label_i] == label_i) - false_pos[i] = np.sum(y_pred[y_true != label_i] == label_i) - false_neg[i] = np.sum(y_pred[y_true == label_i] != label_i) - support[i] = np.sum(y_true == label_i) + if is_multilabel(y_true): + # Handle mix representation + if type(y_true) != type(y_pred): + labels = unique_labels(y_true, y_pred) + lb = LabelBinarizer() + lb.fit([labels.tolist()]) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + + # The weighted multilabel case must be handled separatly + # since it can't be computed using true/false negative/positive + if average == "weighted": + if is_label_indicator_matrix(y_true): + y_true_pos_label = y_true == pos_label + y_pred_pos_label = y_pred == pos_label + size_inter = np.sum(np.logical_and(y_true_pos_label, + y_pred_pos_label), axis=1) + size_true = np.sum(y_true_pos_label, axis=1) + size_pred = np.sum(y_pred_pos_label, axis=1) + + else: + size_inter = np.empty(len(y_true)) + size_true = np.empty(len(y_true)) + size_pred = np.empty(len(y_true)) + for i, (true, pred) in enumerate(zip(y_true, y_pred)): + true_set = set(true) + pred_set = set(pred) + 
size_inter[i] = len(true_set & pred_set) + size_pred[i] = len(pred_set) + size_true[i] = len(true_set) + + try: + # oddly, we may get an "invalid" rather than a "divide" error here + old_err_settings = np.seterr(divide='ignore', invalid='ignore') + + precision = size_inter / size_true + recall = size_inter / size_pred + f_score = ((1 + beta2 ** 2) * size_inter / + (beta2 * size_pred + size_true)) + finally: + np.seterr(**old_err_settings) + + precision[size_true == 0.] = 1.0 + recall[size_pred == 0.] = 1.0 + f_score[(beta2 * size_pred + size_true) == 0.] = 1.0 + + precision = np.mean(precision) + recall = np.mean(recall) + f_score = np.mean(f_score) + + return precision, recall, f_score, None + + true_pos, _, false_pos, false_neg = _tp_tn_fp_fn(y_true, y_pred, labels, + pos_label) + support = true_pos + false_neg try: # oddly, we may get an "invalid" rather than a "divide" error here @@ -1412,13 +1678,12 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, recall[(true_pos + false_neg) == 0] = 0.0 # fbeta score - beta2 = beta ** 2 - fscore = divide((1 + beta2) * precision * recall, - beta2 * precision + recall, - dtype=np.double) + fscore = np.divide((1 + beta2) * precision * recall, + beta2 * precision + recall, + dtype=np.double) # handle division by 0 in fscore - fscore[(precision + recall) == 0] = 0.0 + fscore[(beta2 * precision + recall) == 0] = 0.0 finally: np.seterr(**old_err_settings) @@ -1436,8 +1701,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, average_options = (None, 'micro', 'macro', 'weighted') if average == 'micro': avg_precision = divide(true_pos.sum(), - true_pos.sum() + false_pos.sum(), - dtype=np.double) + true_pos.sum() + false_pos.sum(), + dtype=np.double) avg_recall = divide(true_pos.sum(), true_pos.sum() + false_neg.sum(), dtype=np.double) @@ -1448,14 +1713,28 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_precision = np.mean(precision) avg_recall = np.mean(recall) avg_fscore = np.mean(fscore) - elif average == 'weighted': + + elif average == 'weighted' and not is_multilabel(y_true): avg_precision = np.average(precision, weights=support) avg_recall = np.average(recall, weights=support) avg_fscore = np.average(fscore, weights=support) + else: raise ValueError('average has to be one of ' + str(average_options)) + avg_precision = (avg_precision + if not np.isnan(avg_precision) + else 0) + + avg_recall = (avg_recall + if not np.isnan(avg_recall) + else 0) + + avg_fscore = (avg_fscore + if not np.isnan(avg_fscore) + else 0) + return avg_precision, avg_recall, avg_fscore, None @@ -1472,10 +1751,10 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. labels : array @@ -1483,7 +1762,9 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel classification, + it is used to infer what is a positive label in the label indicator + matrix format. 
average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1494,7 +1775,8 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, Average over classes (does not take imbalance into account). ``'micro'``: Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + implies that ``precision == recall == F1``. In multilabel + classification, this is true only if every sample has a label. ``'weighted'``: Average weighted by support (takes imbalance into account). Can result in F-score that is not between precision and recall. @@ -1525,12 +1807,43 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, 0.22... >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.33... - >>> precision_score(y_true, y_pred, average='weighted')\ - # doctest: +ELLIPSIS + >>> precision_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS 0.22... >>> precision_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS array([ 0.66..., 0. , 0. ]) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import precision_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> precision_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.44... + >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.44... + >>> precision_score(y_true, y_pred, average='weighted') + 1.0 + >>> precision_score(y_true, y_pred, average=None) + ... # doctest: +ELLIPSIS + array([ 0.33..., 0.66..., 0.33...]) + + and with a list of labels format: + + >>> from sklearn.metrics import f1_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> precision_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.66... + >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 1.0 + >>> precision_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + 0.5 + >>> precision_score(y_true, y_pred, average=None) + array([ 1., 1., 0.]) + + """ p, _, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, @@ -1550,10 +1863,10 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. labels : array @@ -1561,7 +1874,9 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel classification, + it is used to infer what is a positive label in the label indicator + matrix format. average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1572,7 +1887,8 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): Average over classes (does not take imbalance into account). ``'micro'``: Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + implies that ``precision == recall == F1``. 
In multilabel + classification, this is true only if every sample has a label. ``'weighted'``: Average weighted by support (takes imbalance into account). Can result in F-score that is not between precision and recall. @@ -1608,6 +1924,33 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> recall_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import recall_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> recall_score(y_true, y_pred, average='macro') + 1.0 + >>> recall_score(y_true, y_pred, average='micro') + 1.0 + >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.44... + >>> recall_score(y_true, y_pred, average=None) + array([ 1., 1., 1.]) + + and with a list of labels format: + + >>> from sklearn.metrics import f1_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> recall_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.66... + >>> recall_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.66... + >>> recall_score(y_true, y_pred, average='weighted') + 1.0 + >>> recall_score(y_true, y_pred, average=None) + array([ 1., 1., 0.]) """ _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 6d36434f54a72..7409f1f818e4b 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -68,14 +68,47 @@ "f2_score": lambda y1, y2: fbeta_score(y1, y2, beta=2), "f0.5_score": lambda y1, y2: fbeta_score(y1, y2, beta=0.5), "matthews_corrcoef_score": matthews_corrcoef, - "auc_score": auc_score, "average_precision_score": average_precision_score, + "weighted_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=0.5), + "weighted_f1_score": + lambda y1, y2: f1_score(y1, y2, average="weighted"), + "weighted_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=2), + "weighted_precision_score": + lambda y1, y2: precision_score(y1, y2, average="weighted"), + "weighted_recall_score": + lambda y1, y2: recall_score(y1, y2, average="weighted"), + + "micro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), + "micro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="micro"), + "micro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=2), + "micro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="micro"), + "micro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="micro"), + + "macro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=0.5), + "macro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="macro"), + "macro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=2), + "macro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="macro"), + "macro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="macro"), + "mean_absolute_error": mean_absolute_error, "mean_squared_error": mean_squared_error, "explained_variance_score": explained_variance_score, - "r2_score": r2_score} + "r2_score": r2_score +} METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": lambda y1, y2, normalize: @@ -101,6 +134,38 @@ "unnormalized_zero_one_loss": lambda y1, y2: zero_one_loss(y1, y2, normalize=False), + "weighted_f0.5_score": + lambda y1, 
y2: fbeta_score(y1, y2, average="weighted", beta=0.5), + "weighted_f1_score": + lambda y1, y2: f1_score(y1, y2, average="weighted"), + "weighted_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=2), + "weighted_precision_score": + lambda y1, y2: precision_score(y1, y2, average="weighted"), + "weighted_recall_score": + lambda y1, y2: recall_score(y1, y2, average="weighted"), + + "micro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), + "micro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="micro"), + "micro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=2), + "micro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="micro"), + "micro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="micro"), + + "macro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=0.5), + "macro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="macro"), + "macro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=2), + "macro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="macro"), + "macro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="macro"), } SYMETRIC_METRICS = { @@ -119,17 +184,54 @@ lambda y1, y2: zero_one_loss(y1, y2, normalize=False), "f1_score": f1_score, + "weighted_f1_score": + lambda y1, y2: f1_score(y1, y2, average="weighted"), + "micro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="micro"), + "macro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="macro"), + "matthews_corrcoef_score": matthews_corrcoef, "mean_absolute_error": mean_absolute_error, - "mean_squared_error": mean_squared_error} + "mean_squared_error": mean_squared_error +} NOT_SYMETRIC_METRICS = { + "explained_variance_score": explained_variance_score, + "r2_score": r2_score, + "precision_score": precision_score, "recall_score": recall_score, "f2_score": lambda y1, y2: fbeta_score(y1, y2, beta=2), "f0.5_score": lambda y1, y2: fbeta_score(y1, y2, beta=0.5), - "explained_variance_score": explained_variance_score, - "r2_score": r2_score} + + "weighted_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=0.5), + "weighted_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=2), + "weighted_precision_score": + lambda y1, y2: precision_score(y1, y2, average="weighted"), + "weighted_recall_score": + lambda y1, y2: recall_score(y1, y2, average="weighted"), + + "micro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), + "micro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=2), + "micro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="micro"), + "micro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="micro"), + + "macro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=0.5), + "macro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=2), + "macro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="macro"), + "macro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="macro"), +} THRESHOLDED_METRICS = { "auc_score": auc_score, @@ -956,6 +1058,7 @@ def test_multioutput_regression_invariance_to_dimension_shuffling(): def test_multilabel_representation_invariance(): + # Generate some data n_classes = 4 n_samples = 50 @@ -1127,6 +1230,10 @@ def test_multilabel_accuracy_score_subset_accuracy(): assert_equal(1.0, accuracy_score(y1, y1)) assert_equal(1.0, accuracy_score(y2, 
y2)) assert_equal(0.0, accuracy_score(y2, [(), ()])) + assert_equal(1, accuracy_score(y1, y2, normalize=False)) + assert_equal(2, accuracy_score(y1, y1, normalize=False)) + assert_equal(2, accuracy_score(y2, y2, normalize=False)) + assert_equal(0, accuracy_score(y2, [(), ()], normalize=False)) def test_multilabel_jaccard_similarity_score(): @@ -1244,3 +1351,195 @@ def test_normalize_option_multilabel_classification(): / n_samples, measure, err_msg="Failed with %s" % name) + + +def test_precision_recall_f1_score_multilabel_1(): + """ Test precision_recall_f1_score on a crafted multilabel example + """ + # First crafted example + y_true_ll = [(0,), (1,), (2, 3)] + y_pred_ll = [(1,), (1,), (2, 0)] + lb = LabelBinarizer() + lb.fit([range(4)]) + y_true_bi = lb.transform(y_true_ll) + y_pred_bi = lb.transform(y_pred_ll) + + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 1, 1, 0] + #fn = [1, 0, 0, 1] + #fp = [1, 1, 0, 0] + + # Check per class + assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 1, 1, 1], 2) + + # Check macro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="macro") + assert_almost_equal(1.5 / 4, p) + assert_almost_equal(0.5, r) + assert_almost_equal(2.5 / 1.5 * 0.25, f) + assert_equal(None, s) + + # Check micro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="micro") + assert_almost_equal(0.5, p) + assert_almost_equal(0.5, r) + assert_almost_equal(0.5, f) + assert_equal(None, s) + + # Check weigted + # |h(x_i) inter y_i | = [0, 1, 1] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [1, 1, 2] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="weighted") + assert_almost_equal(0.5, p) + assert_almost_equal(0.5, r) + assert_almost_equal(0.5, f) + assert_equal(None, s) + + +def test_precision_recall_f1_score_multilabel_2(): + """ Test precision_recall_f1_score on a crafted multilabel example 2 + """ + # Second crafted example + y_true_ll = [(1,), (2,), (2, 3)] + y_pred_ll = [(4,), (4,), (2, 1)] + lb = LabelBinarizer() + lb.fit([range(1, 5)]) + y_true_bi = lb.transform(y_true_ll) + y_pred_bi = lb.transform(y_pred_ll) + + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: + # tp = [ 0. 1. 0. 0.] + # fp = [ 1. 0. 0. 2.] + # fn = [ 1. 1. 1. 0.] 
+ + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="micro") + assert_almost_equal(0.25, p) + assert_almost_equal(0.25, r) + assert_almost_equal(2 * 0.25 * 0.25 / 0.5, f) + assert_equal(None, s) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="macro") + assert_almost_equal(0.25, p) + assert_almost_equal(0.125, r) + assert_almost_equal(2 / 12, f) + assert_equal(None, s) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="weighted") + # Check weigted + # |h(x_i) inter y_i | = [0, 0, 1] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [1, 1, 2] + assert_almost_equal(1 / 6, p) + assert_almost_equal(1 / 6, r) + assert_almost_equal(2 / 4 * 1 / 3, f) + assert_equal(None, s) + + +def test_precision_recall_f1_score_with_an_empty_prediction(): + y_true_ll = [(1,), (0,), (2, 1,)] + y_pred_ll = [tuple(), (3,), (2, 1)] + + lb = LabelBinarizer() + lb.fit([range(4)]) + y_true_bi = lb.transform(y_true_ll) + y_pred_bi = lb.transform(y_pred_ll) + + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: + # true_pos = [ 0. 1. 1. 0.] + # false_pos = [ 0. 0. 0. 1.] + # false_neg = [ 1. 1. 0. 0.] + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="macro") + assert_almost_equal(0.5, p) + assert_almost_equal(1.5 / 4, r) + assert_almost_equal(2.5 / (4 * 1.5), f) + assert_equal(None, s) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="micro") + assert_almost_equal(2 / 3, p) + assert_almost_equal(0.5, r) + assert_almost_equal(2 / 3 / (2 / 3 + 0.5), f) + assert_equal(None, s) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="weighted") + # Check weigted + # |h(x_i) inter y_i | = [0, 0, 2] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [0, 1, 2] + assert_almost_equal(1 / 3, p) + assert_almost_equal(2 / 3, r) + assert_almost_equal(1 / 3, f) + assert_equal(None, s) + + +def test_precision_recall_f1_no_labels(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average=None) + #tp = [0, 0, 0] + #fn = [0, 0, 0] + #fp = [0, 0, 0] + + # Check per class + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + + # Check macro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="macro") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(None, s) + + # Check micro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="micro") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(None, s) + + # # Check weigted + # |h(x_i) inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |h(x_i)| = [1, 1, 2] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + 
average="weighted") + assert_almost_equal(p, 1) + assert_almost_equal(r, 1) + assert_almost_equal(f, 1) + assert_equal(None, s) From 66394abff080620ccb880cd07fd11375d847be25 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 13:38:00 +0200 Subject: [PATCH 02/15] ENH add multilabel support to classification_report --- doc/modules/model_evaluation.rst | 2 +- sklearn/metrics/metrics.py | 12 ++- sklearn/metrics/tests/test_metrics.py | 136 +++++++++++++++++++++++++- 3 files changed, 145 insertions(+), 5 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 55f64e830ef59..d21e19bba5035 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -45,7 +45,6 @@ Others also work in the multiclass case: .. autosummary:: :template: function.rst - classification_report confusion_matrix @@ -55,6 +54,7 @@ And some also work in the multilabel case: :template: function.rst accuracy_score + classification_report f1_score fbeta_score hamming_loss diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 285c7efe0f045..4b4a31d6d2041 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1985,15 +1985,16 @@ def zero_one_score(y_true, y_pred): ############################################################################### # Multiclass utility function ############################################################################### -def classification_report(y_true, y_pred, labels=None, target_names=None): +def classification_report(y_true, y_pred, labels=None, target_names=None, + pos_label=1): """Build a text report showing the main classification metrics Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. labels : array, shape = [n_labels] @@ -2002,6 +2003,10 @@ def classification_report(y_true, y_pred, labels=None, target_names=None): target_names : list of strings Optional display names matching the labels (same order). + pos_label : int, 1 by default + In multilabel classification, it is used to infer what is a + positive label in the label indicator matrix format. 
+ Returns ------- report : string @@ -2051,6 +2056,7 @@ class 2 1.00 1.00 1.00 2 p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels, + pos_label=pos_label, average=None) for i, label in enumerate(labels): diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 7409f1f818e4b..22b107c381238 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -168,6 +168,45 @@ lambda y1, y2: recall_score(y1, y2, average="macro"), } +MULTILABELS_METRICS_WITH_POS_LABELS = { + "jaccard_similarity_score": jaccard_similarity_score, + "unormalized_jaccard_similarity_score": lambda y1, y2, pos_label=1: + jaccard_similarity_score(y1, y2, pos_label=pos_label, normalize=False), + + "weighted_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="weighted", beta=0.5), + "weighted_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="weighted"), + "weighted_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="weighted", beta=2), + "weighted_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="weighted"), + "weighted_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="weighted"), + + "micro_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="micro", beta=0.5), + "micro_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="micro"), + "micro_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="micro", beta=2), + "micro_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="micro"), + "micro_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="micro"), + + "macro_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="macro", beta=0.5), + "macro_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="macro"), + "macro_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="macro", beta=2), + "macro_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="macro"), + "macro_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="macro"), +} + SYMETRIC_METRICS = { "accuracy_score": accuracy_score, "unormalized_accuracy_score": @@ -680,7 +719,25 @@ def test_confusion_matrix_multiclass_subset_labels(): [24, 3]]) -def test_classification_report(): +def test_classification_report_binary_classification_with_pos_label(): + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=True) + + print y_true + expected_report = """\ + precision recall f1-score support + + 0 0.73 0.88 0.80 25 + 1 0.85 0.68 0.76 25 + +avg / total 0.79 0.78 0.78 50 +""" + for pos_label in [0, 1]: + report = classification_report(y_true, y_pred, pos_label=pos_label) + assert_equal(report, expected_report) + + +def test_classification_report_multiclass(): """Test performance report""" iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) @@ -723,6 +780,58 @@ def test_classification_report(): assert_equal(report, expected_report) +def test_multilabel_classification_report(): + + n_classes = 4 + 
n_samples = 50 + _, y_true_ll = make_multilabel_classification(n_features=1, + n_classes=n_classes, + random_state=0, + n_samples=n_samples) + _, y_pred_ll = make_multilabel_classification(n_features=1, + n_classes=n_classes, + random_state=1, + n_samples=n_samples) + + expected_report = """\ + precision recall f1-score support + + 0 0.39 0.73 0.51 15 + 1 0.57 0.75 0.65 28 + 2 0.33 0.11 0.17 18 + 3 0.44 0.50 0.47 24 + +avg / total 0.45 0.54 0.47 85 +""" + + lb = LabelBinarizer() + lb.fit([range(4)]) + y_true_bi = lb.transform(y_true_ll) + y_pred_bi = lb.transform(y_pred_ll) + + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: + report = classification_report(y_true, y_pred) + assert_equal(report, expected_report) + + # With a given pos_label + pos_label = 5 + y_true_bi = y_true_bi * pos_label + y_pred_bi = y_pred_bi * pos_label + + expected_report = """\ + precision recall f1-score support + + 0 0.39 0.73 0.51 15 + 1 0.57 0.75 0.65 28 + 2 0.33 0.11 0.17 18 + 3 0.44 0.50 0.47 24 + +avg / total 0.45 0.54 0.47 85 +""" + report = classification_report(y_true_bi, y_pred_bi, pos_label=pos_label) + assert_equal(report, expected_report) + + def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) _test_precision_recall_curve(y_true, probas_pred) @@ -1543,3 +1652,28 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(r, 1) assert_almost_equal(f, 1) assert_equal(None, s) + + +def test_multilabel_invariance_with_pos_labels(): + n_classes = 4 + n_samples = 50 + _, y1 = make_multilabel_classification(n_features=1, n_classes=n_classes, + random_state=0, n_samples=n_samples) + _, y2 = make_multilabel_classification(n_features=1, n_classes=n_classes, + random_state=1, n_samples=n_samples) + + lb = LabelBinarizer().fit([range(n_classes)]) + y1_binary_indicator = lb.transform(y1) + y2_binary_indicator = lb.transform(y2) + + for name, metric in MULTILABELS_METRICS_WITH_POS_LABELS.items(): + measure = metric(y1, y2) + + for pos_label in [1, 3]: + assert_almost_equal(measure, + metric(y1_binary_indicator * pos_label, + y2_binary_indicator * pos_label, + pos_label=pos_label), + err_msg="%s is not representation invariant" + "with pos_label=%s" + % (metric, pos_label)) From af64f040bfb17b773938951720698e4179c31bad Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 13:45:43 +0200 Subject: [PATCH 03/15] DOC remove example --- doc/modules/model_evaluation.rst | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index d21e19bba5035..664fdd7806529 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -596,22 +596,6 @@ Here an example where ``average`` is set to ``None``:: ... # doctest: +ELLIPSIS (array([ 0.66..., 0. , 0. ]), array([ 1., 0., 0.]), array([ 0.71..., 0. , 0. ]), array([2, 2, 2]...)) - -Those functions also support the multilabel case. - - >>> from sklearn import metrics - >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) - >>> y_pred = np.ones((3, 3)) - - >>> metrics.f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS - 0.59... - >>> metrics.f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS - 0.61... - >>> metrics.f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS - 0.59... 
- >>> metrics.f1_score(y_true, y_pred, average=None) - array([ 0.5, 0.8, 0.5]) - Hinge loss ---------- From 21155fcc72c5f1196f3b878185590a01a1324ef0 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 13:53:42 +0200 Subject: [PATCH 04/15] pep8 --- sklearn/metrics/metrics.py | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 4b4a31d6d2041..3215821c278ff 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -114,7 +114,6 @@ def _check_1d_array(y1, y2, ravel=False): Examples -------- - >>> from numpy import array >>> from sklearn.metrics.metrics import _check_1d_array >>> _check_1d_array([1, 2], [[3, 4]]) (array([1, 2]), array([3, 4])) @@ -1106,9 +1105,9 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. In multilabel classification, - it is used to infer what is a positive label in the label indicator - matrix format. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1221,9 +1220,9 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. In multilabel classification, - it is used to infer what is a positive label in the label indicator - matrix format. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1307,7 +1306,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, and with a list of labels format: - >>> from sklearn.metrics import f1_score + >>> from sklearn.metrics import fbeta_score >>> y_true = [(1, 2), (3,)] >>> y_pred = [(1, 2), tuple()] >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) @@ -1471,9 +1470,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, The support is the number of occurrences of each class in ``y_true``. - If ``pos_label is None`` and in binary classification, this function returns - the average precision, recall and F-measure if ``average`` is one of - ``'micro'``, ``'macro'``, ``'weighted'``. + If ``pos_label is None`` and in binary classification, this function + returns the average precision, recall and F-measure if ``average`` + is one of ``'micro'``, ``'macro'``, ``'weighted'``. Parameters ---------- @@ -1491,9 +1490,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. In multilabel classification, - it is used to infer what is a positive label in the label indicator - matrix format. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. 
average : string, [None (default), 'micro', 'macro', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, @@ -1582,7 +1581,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, and with a list of labels format: - >>> from sklearn.metrics import f1_score + >>> from sklearn.metrics import precision_recall_fscore_support >>> y_true = [(1, 2), (3,)] >>> y_pred = [(1, 2), tuple()] >>> precision_recall_fscore_support(y_true, y_pred, average='macro') @@ -1641,7 +1640,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, size_true[i] = len(true_set) try: - # oddly, we may get an "invalid" rather than a "divide" error here + # oddly, we may get an "invalid" rather than a "divide" error + # here old_err_settings = np.seterr(divide='ignore', invalid='ignore') precision = size_inter / size_true @@ -1762,9 +1762,9 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. In multilabel classification, - it is used to infer what is a positive label in the label indicator - matrix format. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1830,7 +1830,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, and with a list of labels format: - >>> from sklearn.metrics import f1_score + >>> from sklearn.metrics import precision_score >>> y_true = [(1, 2), (3,)] >>> y_pred = [(1, 2), tuple()] >>> precision_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS @@ -1874,9 +1874,9 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. In multilabel classification, - it is used to infer what is a positive label in the label indicator - matrix format. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. average : string, [None, 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, @@ -1940,7 +1940,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): and with a list of labels format: - >>> from sklearn.metrics import f1_score + >>> from sklearn.metrics import recall_score >>> y_true = [(1, 2), (3,)] >>> y_pred = [(1, 2), tuple()] >>> recall_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS From 7d89dc9807adc345201546d61b48da3dd0a26949 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 13:58:20 +0200 Subject: [PATCH 05/15] Update what's new? --- doc/whats_new.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 5f9aac42242ef..b9a6a42540cd8 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -26,9 +26,15 @@ Changelog `Gilles Louppe`_. See the :ref:`AdaBoost ` section of the user guide for details and examples. 
- - :func:`metrics.accuracy_score`, :func:`metrics.zero_one_loss` support - multi-label classification and two new metrics :func:`metrics.hamming_loss` - and :func:`metrics.jaccard_similarity_score` + - Multi-label classification output now support to + :func:`metrics.accuracy_score`, :func:`metrics.zero_one_loss`, + :func:`metrics.f1_score`, :func:`metrics.fbeta_score`, + :func:`metrics.precision_scoreclassification_report`, + :func:`metrics.precision_score` and :func:`metrics.recal_score` + by `Arnaud Joly`_. + + - Two new metrics :func:`metrics.hamming_loss` and + :func:`metrics.jaccard_similarity_score` are added with multi-label support by `Arnaud Joly`_. - Speed and memory usage improvements in From 6bd66405bf1a299b5b9f38e951267c62c4c111a7 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 14:10:32 +0200 Subject: [PATCH 06/15] ENH refactor test_multiclass to use the new metrics --- sklearn/tests/test_multiclass.py | 33 +++++++------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index af61e00d47832..bc7e760ce84c5 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -13,6 +13,9 @@ from sklearn.multiclass import OneVsOneClassifier from sklearn.multiclass import OutputCodeClassifier +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score + from sklearn.svm import LinearSVC from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, Ridge, @@ -31,30 +34,6 @@ n_classes = 3 -# FIXME: - should use sets -# - should move to metrics module -def multilabel_precision(Y_true, Y_pred): - n_predictions = 0 - n_correct = 0 - for i in range(len(Y_true)): - n_predictions += len(Y_pred[i]) - for label in Y_pred[i]: - if label in Y_true[i]: - n_correct += 1 - return float(n_correct) / n_predictions - - -def multilabel_recall(Y_true, Y_pred): - n_labels = 0 - n_correct = 0 - for i in range(len(Y_true)): - n_labels += len(Y_true[i]) - for label in Y_pred[i]: - if label in Y_true[i]: - n_correct += 1 - return float(n_correct) / n_labels - - def test_ovr_exceptions(): ovr = OneVsRestClassifier(LinearSVC(random_state=0)) assert_raises(ValueError, ovr.predict, []) @@ -141,9 +120,11 @@ def test_ovr_multilabel_dataset(): clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) Y_pred = clf.predict(X_test) assert_true(clf.multilabel_) - assert_almost_equal(multilabel_precision(Y_test, Y_pred), prec, + assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"), + prec, decimal=2) - assert_almost_equal(multilabel_recall(Y_test, Y_pred), recall, + assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"), + recall, decimal=2) From fad38ab78661058c11b6e77527c584df2b111fa4 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 14:34:40 +0200 Subject: [PATCH 07/15] FIX compatibility divide issue --- sklearn/metrics/metrics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 3215821c278ff..7ec94d07f1d31 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1678,9 +1678,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, recall[(true_pos + false_neg) == 0] = 0.0 # fbeta score - fscore = np.divide((1 + beta2) * precision * recall, - beta2 * precision + recall, - dtype=np.double) + fscore = divide((1 + beta2) * precision * 
recall, + beta2 * precision + recall) # handle division by 0 in fscore fscore[(beta2 * precision + recall) == 0] = 0.0 From 1a43117961c7329a1a47524e2b36f654d41af50f Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 14:46:50 +0200 Subject: [PATCH 08/15] FIX set dtype in np.empty --- sklearn/metrics/metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 7ec94d07f1d31..ed18a430a9867 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -963,7 +963,7 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, pos_label=1): np.seterr(**old_err_settings) else: - score = np.empty(len(y_true)) + score = np.empty(len(y_true), dtype=np.float) for i, (true, pred) in enumerate(zip(y_pred, y_true)): true_set = set(true) pred_set = set(pred) @@ -1629,9 +1629,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, size_pred = np.sum(y_pred_pos_label, axis=1) else: - size_inter = np.empty(len(y_true)) - size_true = np.empty(len(y_true)) - size_pred = np.empty(len(y_true)) + size_inter = np.empty(len(y_true), dtype=np.int) + size_true = np.empty(len(y_true), dtype=np.int) + size_pred = np.empty(len(y_true), dtype=np.int) for i, (true, pred) in enumerate(zip(y_true, y_pred)): true_set = set(true) pred_set = set(pred) From 5a42e165278afbdaf57da14da4168a9127dc9ab0 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Tue, 7 May 2013 15:25:30 +0200 Subject: [PATCH 09/15] FIX correct type comparison --- sklearn/metrics/metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index ed18a430a9867..9064d7da88c4e 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1651,9 +1651,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, finally: np.seterr(**old_err_settings) - precision[size_true == 0.] = 1.0 - recall[size_pred == 0.] = 1.0 - f_score[(beta2 * size_pred + size_true) == 0.] 
= 1.0 + precision[size_true == 0] = 1.0 + recall[size_pred == 0] = 1.0 + f_score[(beta2 * size_pred + size_true) == 0] = 1.0 precision = np.mean(precision) recall = np.mean(recall) From a045330294b73c9fbc11670f3321af6ad3636071 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 8 May 2013 09:49:32 +0200 Subject: [PATCH 10/15] COSMIT fix argument order in assert --- sklearn/metrics/tests/test_metrics.py | 347 +++++++++++--------------- 1 file changed, 152 insertions(+), 195 deletions(-) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 22b107c381238..054f61294e1b0 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -1026,60 +1026,50 @@ def test_format_invariance_with_1d_vectors(): measure = metric(y1, y2) - assert_almost_equal(measure, - metric(y1_list, y2_list), + assert_almost_equal(metric(y1_list, y2_list), measure, err_msg="%s is not representation invariant" - "with list" % metric) + "with list" % name) - assert_almost_equal(measure, - metric(y1_1d, y2_1d), + assert_almost_equal(metric(y1_1d, y2_1d), measure, err_msg="%s is not representation invariant" - "with np-array-1d" % metric) + "with np-array-1d" % name) - assert_almost_equal(measure, - metric(y1_column, y2_column), + assert_almost_equal(metric(y1_column, y2_column), measure, err_msg="%s is not representation invariant " - "with np-array-column" % metric) + "with np-array-column" % name) - assert_almost_equal(measure, - metric(y1_row, y2_row), + assert_almost_equal(metric(y1_row, y2_row), measure, err_msg="%s is not representation invariant " - "with np-array-row" % metric) + "with np-array-row" % name) # Mix format support - assert_almost_equal(measure, - metric(y1_1d, y2_list), + assert_almost_equal(metric(y1_1d, y2_list), measure, err_msg="%s is not representation invariant " - "with mix np-array-1d and list" % metric) + "with mix np-array-1d and list" % name) - assert_almost_equal(measure, - metric(y1_list, y2_1d), + assert_almost_equal(metric(y1_list, y2_1d), measure, err_msg="%s is not representation invariant " - "with mix np-array-1d and list" % metric) + "with mix np-array-1d and list" % name) - assert_almost_equal(measure, - metric(y1_1d, y2_column), + assert_almost_equal(metric(y1_1d, y2_column), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and np-array-column" - % metric) + % name) - assert_almost_equal(measure, - metric(y1_column, y2_1d), + assert_almost_equal(metric(y1_column, y2_1d), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and np-array-column" - % metric) + % name) - assert_almost_equal(measure, - metric(y1_list, y2_column), + assert_almost_equal(metric(y1_list, y2_column), measure, err_msg="%s is not representation invariant" "with mix list and np-array-column" - % metric) + % name) - assert_almost_equal(measure, - metric(y1_column, y2_list), + assert_almost_equal(metric(y1_column, y2_list), measure, err_msg="%s is not representation invariant" "with mix list and np-array-column" - % metric) + % name) # At the moment, these mix representations aren't allowed assert_raises(ValueError, metric, y1_1d, y2_row) @@ -1093,7 +1083,7 @@ def test_format_invariance_with_1d_vectors(): def test_hinge_loss_binary(): y_true = np.array([-1, 1, 1, -1]) pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) - assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision)) + assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4) with warnings.catch_warnings(): # Test deprecated 
pos_label @@ -1103,23 +1093,17 @@ def test_hinge_loss_binary(): y_true = np.array([0, 2, 2, 0]) pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) - assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision)) + + assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4) with warnings.catch_warnings(): # Test deprecated pos_label - assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision, - pos_label=2, neg_label=0)) + assert_equal(hinge_loss(y_true, pred_decision, pos_label=2, neg_label=0), + 1.2 / 4) def test_multioutput_regression(): - y_true = np.array([[1, 0, 0, 1], - [0, 1, 1, 1], - [1, 1, 0, 1], - ]) - - y_pred = np.array([[0, 0, 0, 1], - [1, 0, 1, 1], - [0, 0, 0, 1], - ]) + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) error = mean_squared_error(y_true, y_pred) assert_almost_equal(error, (1. / 3 + 2. / 3 + 2. / 3) / 4.) @@ -1134,15 +1118,8 @@ def test_multioutput_regression(): def test_multioutput_number_of_output_differ(): - y_true = np.array([[1, 0, 0, 1], - [0, 1, 1, 1], - [1, 1, 0, 1], - ]) - - y_pred = np.array([[0, 0], - [1, 0], - [0, 0], - ]) + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0], [1, 0], [0, 0]]) assert_raises(ValueError, mean_squared_error, y_true, y_pred) assert_raises(ValueError, mean_absolute_error, y_true, y_pred) @@ -1162,8 +1139,8 @@ def test_multioutput_regression_invariance_to_dimension_shuffling(): for _ in xrange(3): perm = rng.permutation(n_dims) - assert_almost_equal(error, - metric(y_true[:, perm], y_pred[:, perm])) + assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]), + error) def test_multilabel_representation_invariance(): @@ -1203,57 +1180,48 @@ def test_multilabel_representation_invariance(): measure = metric(y1, y2) # Check representation invariance - assert_almost_equal(measure, - metric(y1_binary_indicator, y2_binary_indicator), + assert_almost_equal(metric(y1_binary_indicator, y2_binary_indicator), + measure, err_msg="%s failed representation invariance " "between list of list of labels format " "and dense binary indicator format." % name) # Check invariance with redundant labels with list of labels - assert_almost_equal(measure, - metric(y1, y2_redundant), + assert_almost_equal(metric(y1, y2_redundant), measure, err_msg="%s failed rendundant label invariance" % name) - assert_almost_equal(measure, - metric(y1_redundant, y2_redundant), + assert_almost_equal(metric(y1_redundant, y2_redundant), measure, err_msg="%s failed rendundant label invariance" % name) - assert_almost_equal(measure, - metric(y1_redundant, y2), + assert_almost_equal(metric(y1_redundant, y2), measure, err_msg="%s failed rendundant label invariance" % name) # Check shuffling invariance with list of labels - assert_almost_equal(measure, - metric(y1_shuffle, y2_shuffle), + assert_almost_equal(metric(y1_shuffle, y2_shuffle), measure, err_msg="%s failed shuffling invariance " "with list of list of labels format." % name) # Check shuffling invariance with dense binary indicator matrix - assert_almost_equal(measure, - metric(y1_shuffle_binary_indicator, - y2_shuffle_binary_indicator), + assert_almost_equal(metric(y1_shuffle_binary_indicator, + y2_shuffle_binary_indicator), measure, err_msg="%s failed shuffling invariance " " with dense binary indicator format." 
% name) # Check invariance with mix input representation - assert_almost_equal(measure, - metric(y1, - y2_binary_indicator), + assert_almost_equal(metric(y1, y2_binary_indicator), measure, err_msg="%s failed mix input representation" "invariance: y_true in list of list of " "labels format and y_pred in dense binary" "indicator format" % name) - assert_almost_equal(measure, - metric(y1_binary_indicator, - y2), + assert_almost_equal(metric(y1_binary_indicator, y2), measure, err_msg="%s failed mix input representation" "invariance: y_true in dense binary " "indicator format and y_pred in list of " @@ -1263,134 +1231,124 @@ def test_multilabel_representation_invariance(): def test_multilabel_zero_one_loss_subset(): # Dense label indicator matrix format - y1 = np.array([[0, 1, 1], - [1, 0, 1]]) - y2 = np.array([[0, 0, 1], - [1, 0, 1]]) - - assert_equal(0.5, zero_one_loss(y1, y2)) - assert_equal(0.0, zero_one_loss(y1, y1)) - assert_equal(0.0, zero_one_loss(y2, y2)) - assert_equal(1.0, zero_one_loss(y2, np.logical_not(y2))) - assert_equal(1.0, zero_one_loss(y1, np.logical_not(y1))) - assert_equal(1.0, zero_one_loss(y1, np.zeros(y1.shape))) - assert_equal(1.0, zero_one_loss(y2, np.zeros(y1.shape))) + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + assert_equal(zero_one_loss(y1, y2), 0.5) + assert_equal(zero_one_loss(y1, y1), 0) + assert_equal(zero_one_loss(y2, y2), 0) + assert_equal(zero_one_loss(y2, np.logical_not(y2)), 1) + assert_equal(zero_one_loss(y1, np.logical_not(y1)), 1) + assert_equal(zero_one_loss(y1, np.zeros(y1.shape)), 1) + assert_equal(zero_one_loss(y2, np.zeros(y1.shape)), 1) # List of tuple of label y1 = [(1, 2,), (0, 2,)] y2 = [(2,), (0, 2,)] - assert_equal(0.5, zero_one_loss(y1, y2)) - assert_equal(0.0, zero_one_loss(y1, y1)) - assert_equal(0.0, zero_one_loss(y2, y2)) - assert_equal(1.0, zero_one_loss(y2, [(), ()])) - assert_equal(1.0, zero_one_loss(y2, [tuple(), (10, )])) + assert_equal(zero_one_loss(y1, y2), 0.5) + assert_equal(zero_one_loss(y1, y1), 0) + assert_equal(zero_one_loss(y2, y2), 0) + assert_equal(zero_one_loss(y2, [(), ()]), 1) + assert_equal(zero_one_loss(y2, [tuple(), (10, )]), 1) def test_multilabel_hamming_loss(): # Dense label indicator matrix format - y1 = np.array([[0, 1, 1], - [1, 0, 1]]) - y2 = np.array([[0, 0, 1], - [1, 0, 1]]) - - assert_equal(1 / 6., hamming_loss(y1, y2)) - assert_equal(0.0, hamming_loss(y1, y1)) - assert_equal(0.0, hamming_loss(y2, y2)) - assert_equal(1.0, hamming_loss(y2, np.logical_not(y2))) - assert_equal(1.0, hamming_loss(y1, np.logical_not(y1))) - assert_equal(4. 
/ 6, hamming_loss(y1, np.zeros(y1.shape))) - assert_equal(0.5, hamming_loss(y2, np.zeros(y1.shape))) + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + assert_equal(hamming_loss(y1, y2), 1 / 6) + assert_equal(hamming_loss(y1, y1), 0) + assert_equal(hamming_loss(y2, y2), 0) + assert_equal(hamming_loss(y2, np.logical_not(y2)), 1) + assert_equal(hamming_loss(y1, np.logical_not(y1)), 1) + assert_equal(hamming_loss(y1, np.zeros(y1.shape)), 4 / 6) + assert_equal(hamming_loss(y2, np.zeros(y1.shape)), 0.5) # List of tuple of label y1 = [(1, 2,), (0, 2,)] - y2 = [(2,), (0, 2,)] - assert_equal(1 / 6., hamming_loss(y1, y2)) - assert_equal(0.0, hamming_loss(y1, y1)) - assert_equal(0.0, hamming_loss(y2, y2)) - assert_equal(0.75, hamming_loss(y2, [(), ()])) - assert_equal(0.625, hamming_loss(y1, [tuple(), (10, )])) - assert_almost_equal(0.1818, hamming_loss(y2, [tuple(), (10, )], - classes=np.arange(11)), 2) + assert_equal(hamming_loss(y1, y2), 1 / 6) + assert_equal(hamming_loss(y1, y1), 0) + assert_equal(hamming_loss(y2, y2), 0) + assert_equal(hamming_loss(y2, [(), ()]), 0.75) + assert_equal(hamming_loss(y1, [tuple(), (10, )]), 0.625) + assert_almost_equal(hamming_loss(y2, [tuple(), (10, )], + classes=np.arange(11)), 0.1818, 2) def test_multilabel_accuracy_score_subset_accuracy(): # Dense label indicator matrix format - y1 = np.array([[0, 1, 1], - [1, 0, 1]]) - y2 = np.array([[0, 0, 1], - [1, 0, 1]]) - - assert_equal(0.5, accuracy_score(y1, y2)) - assert_equal(1.0, accuracy_score(y1, y1)) - assert_equal(1.0, accuracy_score(y2, y2)) - assert_equal(0.0, accuracy_score(y2, np.logical_not(y2))) - assert_equal(0.0, accuracy_score(y1, np.logical_not(y1))) - assert_equal(0.0, accuracy_score(y1, np.zeros(y1.shape))) - assert_equal(0.0, accuracy_score(y2, np.zeros(y1.shape))) + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + assert_equal(accuracy_score(y1, y2), 0.5) + assert_equal(accuracy_score(y1, y1), 1) + assert_equal(accuracy_score(y2, y2), 1) + assert_equal(accuracy_score(y2, np.logical_not(y2)), 0) + assert_equal(accuracy_score(y1, np.logical_not(y1)), 0) + assert_equal(accuracy_score(y1, np.zeros(y1.shape)), 0) + assert_equal(accuracy_score(y2, np.zeros(y1.shape)), 0) # List of tuple of label y1 = [(1, 2,), (0, 2,)] y2 = [(2,), (0, 2,)] - assert_equal(0.5, accuracy_score(y1, y2)) - assert_equal(1.0, accuracy_score(y1, y1)) - assert_equal(1.0, accuracy_score(y2, y2)) - assert_equal(0.0, accuracy_score(y2, [(), ()])) - assert_equal(1, accuracy_score(y1, y2, normalize=False)) - assert_equal(2, accuracy_score(y1, y1, normalize=False)) - assert_equal(2, accuracy_score(y2, y2, normalize=False)) - assert_equal(0, accuracy_score(y2, [(), ()], normalize=False)) + assert_equal(accuracy_score(y1, y2), 0.5) + assert_equal(accuracy_score(y1, y1), 1) + assert_equal(accuracy_score(y2, y2), 1) + assert_equal(accuracy_score(y2, [(), ()]), 0) + assert_equal(accuracy_score(y1, y2, normalize=False), 1) + assert_equal(accuracy_score(y1, y1, normalize=False), 2) + assert_equal(accuracy_score(y2, y2, normalize=False), 2) + assert_equal(accuracy_score(y2, [(), ()], normalize=False), 0) def test_multilabel_jaccard_similarity_score(): # Dense label indicator matrix format - y1 = np.array([[0.0, 1.0, 1.0], - [1.0, 0.0, 1.0]]) - y2 = np.array([[0.0, 0.0, 1.0], - [1.0, 0.0, 1.0]]) + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) # size(y1 \inter y2) = [1, 2] # size(y1 \union y2) = [2, 2] - assert_equal(0.75, 
jaccard_similarity_score(y1, y2)) - assert_equal(1.0, jaccard_similarity_score(y1, y1)) - - assert_equal(1.0, jaccard_similarity_score(y2, y2)) - assert_equal(0.0, jaccard_similarity_score(y2, np.logical_not(y2))) - assert_equal(0.0, jaccard_similarity_score(y1, np.logical_not(y1))) - assert_equal(0.0, jaccard_similarity_score(y1, np.zeros(y1.shape))) - assert_equal(0.0, jaccard_similarity_score(y2, np.zeros(y1.shape))) + assert_equal(jaccard_similarity_score(y1, y2), 0.75) + assert_equal(jaccard_similarity_score(y1, y1), 1) + assert_equal(jaccard_similarity_score(y2, y2), 1) + assert_equal(jaccard_similarity_score(y2, np.logical_not(y2)), 0) + assert_equal(jaccard_similarity_score(y1, np.logical_not(y1)), 0) + assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0) + assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0) # With a given pos_label - assert_equal(0.75, jaccard_similarity_score(y1, y2, pos_label=0)) - assert_equal(0.5, jaccard_similarity_score(y2, np.zeros(y1.shape), - pos_label=0)) - assert_equal(1, jaccard_similarity_score(y1, y2, pos_label=10)) + assert_equal(jaccard_similarity_score(y1, y2, pos_label=0), 0.75) + assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape), + pos_label=0), 0.5) + assert_equal(jaccard_similarity_score(y1, y2, pos_label=10), 1) # List of tuple of label y1 = [(1, 2,), (0, 2,)] y2 = [(2,), (0, 2,)] - assert_equal(0.75, jaccard_similarity_score(y1, y2)) - assert_equal(1.0, jaccard_similarity_score(y1, y1)) - assert_equal(1.0, jaccard_similarity_score(y2, y2)) - assert_equal(0.0, jaccard_similarity_score(y2, [(), ()])) + assert_equal(jaccard_similarity_score(y1, y2), 0.75) + assert_equal(jaccard_similarity_score(y1, y1), 1) + assert_equal(jaccard_similarity_score(y2, y2), 1) + assert_equal(jaccard_similarity_score(y2, [(), ()]), 0) # |y3 inter y4 | = [0, 1, 1] # |y3 union y4 | = [2, 1, 3] y3 = [(0,), (1,), (3,)] y4 = [(4,), (4,), (5, 6)] - assert_almost_equal(0, jaccard_similarity_score(y3, y4)) + assert_almost_equal(jaccard_similarity_score(y3, y4), 0) # |y5 inter y6 | = [0, 1, 1] # |y5 union y6 | = [2, 1, 3] y5 = [(0,), (1,), (2, 3)] y6 = [(1,), (1,), (2, 0)] - assert_almost_equal((1 + 1. 
/ 3) / 3, jaccard_similarity_score(y5, y6)) + assert_almost_equal(jaccard_similarity_score(y5, y6), (1 + 1 / 3) / 3) def test_normalize_option_binary_classification(): @@ -1457,8 +1415,7 @@ def test_normalize_option_multilabel_classification(): msg="We failed to test correctly the normalize option") assert_almost_equal(metrics(y_true_binary_indicator, y_pred_binary_indicator, normalize=False) - / n_samples, - measure, + / n_samples, measure, err_msg="Failed with %s" % name) @@ -1489,18 +1446,18 @@ def test_precision_recall_f1_score_multilabel_1(): # Check macro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") - assert_almost_equal(1.5 / 4, p) - assert_almost_equal(0.5, r) - assert_almost_equal(2.5 / 1.5 * 0.25, f) - assert_equal(None, s) + assert_almost_equal(p, 1.5 / 4) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2.5 / 1.5 * 0.25) + assert_equal(s, None) # Check micro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") - assert_almost_equal(0.5, p) - assert_almost_equal(0.5, r) - assert_almost_equal(0.5, f) - assert_equal(None, s) + assert_almost_equal(p, 0.5) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 0.5) + assert_equal(s, None) # Check weigted # |h(x_i) inter y_i | = [0, 1, 1] @@ -1508,10 +1465,10 @@ def test_precision_recall_f1_score_multilabel_1(): # |h(x_i)| = [1, 1, 2] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") - assert_almost_equal(0.5, p) - assert_almost_equal(0.5, r) - assert_almost_equal(0.5, f) - assert_equal(None, s) + assert_almost_equal(p, 0.5) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 0.5) + assert_equal(s, None) def test_precision_recall_f1_score_multilabel_2(): @@ -1539,17 +1496,17 @@ def test_precision_recall_f1_score_multilabel_2(): p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") - assert_almost_equal(0.25, p) - assert_almost_equal(0.25, r) - assert_almost_equal(2 * 0.25 * 0.25 / 0.5, f) - assert_equal(None, s) + assert_almost_equal(p, 0.25) + assert_almost_equal(r, 0.25) + assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5) + assert_equal(s, None) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") - assert_almost_equal(0.25, p) - assert_almost_equal(0.125, r) - assert_almost_equal(2 / 12, f) - assert_equal(None, s) + assert_almost_equal(p, 0.25) + assert_almost_equal(r, 0.125) + assert_almost_equal(f, 2 / 12) + assert_equal(s, None) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1557,10 +1514,10 @@ def test_precision_recall_f1_score_multilabel_2(): # |h(x_i) inter y_i | = [0, 0, 1] # |y_i| = [1, 1, 2] # |h(x_i)| = [1, 1, 2] - assert_almost_equal(1 / 6, p) - assert_almost_equal(1 / 6, r) - assert_almost_equal(2 / 4 * 1 / 3, f) - assert_equal(None, s) + assert_almost_equal(p, 1 / 6) + assert_almost_equal(r, 1 / 6) + assert_almost_equal(f, 2 / 4 * 1 / 3) + assert_equal(s, None) def test_precision_recall_f1_score_with_an_empty_prediction(): @@ -1586,17 +1543,17 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") - assert_almost_equal(0.5, p) - assert_almost_equal(1.5 / 4, r) - assert_almost_equal(2.5 / (4 * 1.5), f) - assert_equal(None, s) + assert_almost_equal(p, 0.5) + assert_almost_equal(r, 1.5 / 4) + assert_almost_equal(f, 2.5 / (4 * 1.5)) + assert_equal(s, None) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") - assert_almost_equal(2 
/ 3, p) - assert_almost_equal(0.5, r) - assert_almost_equal(2 / 3 / (2 / 3 + 0.5), f) - assert_equal(None, s) + assert_almost_equal(p, 2 / 3) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5)) + assert_equal(s, None) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") @@ -1604,10 +1561,10 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] - assert_almost_equal(1 / 3, p) - assert_almost_equal(2 / 3, r) - assert_almost_equal(1 / 3, f) - assert_equal(None, s) + assert_almost_equal(p, 1 / 3) + assert_almost_equal(r, 2 / 3) + assert_almost_equal(f, 1 / 3) + assert_equal(s, None) def test_precision_recall_f1_no_labels(): @@ -1632,7 +1589,7 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(p, 0) assert_almost_equal(r, 0) assert_almost_equal(f, 0) - assert_equal(None, s) + assert_equal(s, None) # Check micro p, r, f, s = precision_recall_fscore_support(y_true, y_pred, @@ -1640,7 +1597,7 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(p, 0) assert_almost_equal(r, 0) assert_almost_equal(f, 0) - assert_equal(None, s) + assert_equal(s, None) # # Check weigted # |h(x_i) inter y_i | = [0, 0, 0] @@ -1651,7 +1608,7 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(p, 1) assert_almost_equal(r, 1) assert_almost_equal(f, 1) - assert_equal(None, s) + assert_equal(s, None) def test_multilabel_invariance_with_pos_labels(): @@ -1670,10 +1627,10 @@ def test_multilabel_invariance_with_pos_labels(): measure = metric(y1, y2) for pos_label in [1, 3]: - assert_almost_equal(measure, - metric(y1_binary_indicator * pos_label, + assert_almost_equal(metric(y1_binary_indicator * pos_label, y2_binary_indicator * pos_label, pos_label=pos_label), + measure, err_msg="%s is not representation invariant" "with pos_label=%s" % (metric, pos_label)) From 2b47f79473ea42ae2fa7c06de711ce2339e9cd09 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 8 May 2013 13:29:52 +0200 Subject: [PATCH 11/15] ENH faster tp, fp, fn count with list of list of labels --- sklearn/metrics/metrics.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 9064d7da88c4e..a9a817c401b27 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1419,13 +1419,17 @@ def _tp_tn_fp_fn(y_true, y_pred, labels=None, pos_label=1): y_pred != pos_label), axis=0) else: + idx_to_label = dict((label_i, i) + for i, label_i in enumerate(labels)) + for true, pred in zip(y_true, y_pred): - for i, label_i in enumerate(labels): - label_i_in_true = label_i in true - label_i_in_pred = label_i in pred - true_pos[i] += label_i_in_true and label_i_in_pred - false_pos[i] += not label_i_in_true and label_i_in_pred - false_neg[i] += label_i_in_true and not label_i_in_pred + true_set = np.array([idx_to_label[l] for l in set(true)], + dtype=np.int) + pred_set = np.array([idx_to_label[l] for l in set(pred)], + dtype=np.int) + true_pos[np.intersect1d(true_set, pred_set)] += 1 + false_pos[np.setdiff1d(pred_set, true_set)] += 1 + false_neg[np.setdiff1d(true_set, pred_set)] += 1 else: y_true, y_pred = check_arrays(y_true, y_pred) From 44a4c9dcf47e81b80cc88b53a8cf2759ceca59fe Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 8 May 2013 14:58:10 +0200 Subject: [PATCH 12/15] ENH remove redundant condition --- sklearn/metrics/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
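For reference, the per-label counting strategy introduced in the `_tp_tn_fp_fn` hunk of PATCH 11 above can be exercised on its own. The following standalone Python sketch is illustrative only (toy inputs and flush-left names chosen for this note, not part of the patch); it restates the new loop body outside the function so the index bookkeeping can be checked by hand:

import numpy as np

# Hypothetical toy data in the list-of-label-sets format handled by the hunk.
y_true = [(1, 2), (0, 2)]
y_pred = [(2,), (0, 2)]
labels = np.array([0, 1, 2])

# Map each label to its column index once, as the patched code does.
idx_to_label = dict((label_i, i) for i, label_i in enumerate(labels))

true_pos = np.zeros(len(labels), dtype=int)
false_pos = np.zeros(len(labels), dtype=int)
false_neg = np.zeros(len(labels), dtype=int)

for true, pred in zip(y_true, y_pred):
    # Convert each sample's label sets to index arrays ...
    true_set = np.array([idx_to_label[l] for l in set(true)], dtype=int)
    pred_set = np.array([idx_to_label[l] for l in set(pred)], dtype=int)
    # ... and let set operations on the indices update the per-label counters.
    true_pos[np.intersect1d(true_set, pred_set)] += 1
    false_pos[np.setdiff1d(pred_set, true_set)] += 1
    false_neg[np.setdiff1d(true_set, pred_set)] += 1

print(true_pos, false_pos, false_neg)
# For this toy input: tp = [1, 0, 2], fp = [0, 0, 0], fn = [0, 1, 0]

Mapping every label to an index once and using np.intersect1d / np.setdiff1d on small index arrays avoids the previous pass over all labels for every sample, which is what the PATCH 11 commit message refers to as the faster tp, fp, fn count.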
diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index a9a817c401b27..fed9cdf7ce572 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1717,7 +1717,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_recall = np.mean(recall) avg_fscore = np.mean(fscore) - elif average == 'weighted' and not is_multilabel(y_true): + elif average == 'weighted': avg_precision = np.average(precision, weights=support) avg_recall = np.average(recall, weights=support) avg_fscore = np.average(fscore, weights=support) From dbcb3cd1e2d74d07f9d76ca2b1ab730d6fdd3987 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Wed, 8 May 2013 17:01:47 +0200 Subject: [PATCH 13/15] ENH and FIX average weighted and example --- sklearn/metrics/metrics.py | 233 +++++++++++++++++--------- sklearn/metrics/tests/test_metrics.py | 67 +++++++- 2 files changed, 215 insertions(+), 85 deletions(-) diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index fed9cdf7ce572..04fc771d426d0 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1109,20 +1109,27 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: ``'macro'``: - Average over classes (does not take imbalance into account). + Average and aggregate over classes (does not take imbalance into + account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. In multilabel - classification, this is true only if every sample has a label. + Average over instances (takes imbalance into account) and aggregate + over classes. This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample has + a label. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average weighted by support (takes imbalance into account) and + aggregate over class. Can result in F-score that is not between + precision and recall. + ``'example'``: + Average and aggregate over instance. [3] Only meaningful and + available in multilabel classification. For binary or multiclass + classification, use the :func:`accuracy_score` function instead. Returns ------- @@ -1169,6 +1176,8 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.61... >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.65... + >>> f1_score(y_true, y_pred, average='example') # doctest: +ELLIPSIS 0.59... >>> f1_score(y_true, y_pred, average=None) array([ 0.5, 0.8, 0.5]) @@ -1183,6 +1192,8 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.80... >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.66... 
+ >>> f1_score(y_true, y_pred, average='example') # doctest: +ELLIPSIS 0.5 >>> f1_score(y_true, y_pred, average=None) array([ 1., 1., 0.]) @@ -1224,20 +1235,27 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: ``'macro'``: - Average over classes (does not take imbalance into account). + Average and aggregate over classes (does not take imbalance into + account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. In multilabel - classification, this is true only if every sample has a label. + Average over instances (takes imbalance into account) and aggregate + over classes. This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample has + a label. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average weighted by support (takes imbalance into account) and + aggregate over class. Can result in F-score that is not between + precision and recall. + ``'example'``: + Average and aggregate over instance. [3] Only meaningful and + available in multilabel classification. For binary or multiclass + classification, use the :func:`accuracy_score` function instead. Returns ------- @@ -1299,6 +1317,9 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, 0.5 >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) ... # doctest: +ELLIPSIS + 0.54... + >>> fbeta_score(y_true, y_pred, average='example', beta=0.5) + ... # doctest: +ELLIPSIS 0.66... >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) ... # doctest: +ELLIPSIS @@ -1317,6 +1338,9 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, 0.90... >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) ... # doctest: +ELLIPSIS + 0.66... + >>> fbeta_score(y_true, y_pred, average='example', beta=0.5) + ... # doctest: +ELLIPSIS 0.42... >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) array([ 1., 1., 0.]) @@ -1498,20 +1522,28 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None (default), 'micro', 'macro', 'weighted'] + average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: ``'macro'``: - Average over classes (does not take imbalance into account). + Average and aggregate over classes (does not take imbalance into + account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. In multilabel - classification, this is true only if every sample has a label. + Average over instances (takes imbalance into account) and aggregate + over classes. This implies that ``precision == recall == F1``. 
+ In multilabel classification, this is true only if every sample has + a label. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average weighted by support (takes imbalance into account) and + aggregate over class. Can result in F-score that is not between + precision and recall. + ``'example'``: + Average and aggregate over instance. [3] Only meaningful and + available in multilabel classification. For binary or multiclass + classification, use the :func:`accuracy_score` function instead. + Returns ------- @@ -1535,6 +1567,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, .. [2] `Wikipedia entry for the F1-score `_ + .. [3] `Discriminative Methods for Multi-labeled Classification Advances + in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu + Godbole, Sunita Sarawagi + ` Examples -------- @@ -1581,6 +1617,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, (0.44..., 1.0, 0.61..., None) >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS + (0.499..., 1.0, 0.65..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='example') + ... # doctest: +ELLIPSIS (1.0, 0.44..., 0.59..., None) and with a list of labels format: @@ -1596,6 +1635,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, (1.0, 0.66..., 0.80..., None) >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS + (0.66..., 0.66..., 0.66..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='example') + ... # doctest: +ELLIPSIS (0.5, 1.0, 0.5, None) """ @@ -1612,18 +1654,16 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, n_labels = labels.size - if is_multilabel(y_true): - # Handle mix representation - if type(y_true) != type(y_pred): - labels = unique_labels(y_true, y_pred) - lb = LabelBinarizer() - lb.fit([labels.tolist()]) - y_true = lb.transform(y_true) - y_pred = lb.transform(y_pred) + if average == "example": + if is_multilabel(y_true): + # Handle mix representation + if type(y_true) != type(y_pred): + labels = unique_labels(y_true, y_pred) + lb = LabelBinarizer() + lb.fit([labels.tolist()]) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) - # The weighted multilabel case must be handled separatly - # since it can't be computed using true/false negative/positive - if average == "weighted": if is_label_indicator_matrix(y_true): y_true_pos_label = y_true == pos_label y_pred_pos_label = y_pred == pos_label @@ -1642,28 +1682,32 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, size_inter[i] = len(true_set & pred_set) size_pred[i] = len(pred_set) size_true[i] = len(true_set) + else: + raise ValueError("Example-based precision, recall, fscore is " + "not meaning full outside multilabe" + "classification. 
See the accuracy_score instead.") - try: - # oddly, we may get an "invalid" rather than a "divide" error - # here - old_err_settings = np.seterr(divide='ignore', invalid='ignore') - - precision = size_inter / size_true - recall = size_inter / size_pred - f_score = ((1 + beta2 ** 2) * size_inter / - (beta2 * size_pred + size_true)) - finally: - np.seterr(**old_err_settings) + try: + # oddly, we may get an "invalid" rather than a "divide" error + # here + old_err_settings = np.seterr(divide='ignore', invalid='ignore') + + precision = size_inter / size_true + recall = size_inter / size_pred + f_score = ((1 + beta2 ** 2) * size_inter / + (beta2 * size_pred + size_true)) + finally: + np.seterr(**old_err_settings) - precision[size_true == 0] = 1.0 - recall[size_pred == 0] = 1.0 - f_score[(beta2 * size_pred + size_true) == 0] = 1.0 + precision[size_true == 0] = 1.0 + recall[size_pred == 0] = 1.0 + f_score[(beta2 * size_pred + size_true) == 0] = 1.0 - precision = np.mean(precision) - recall = np.mean(recall) - f_score = np.mean(f_score) + precision = np.mean(precision) + recall = np.mean(recall) + f_score = np.mean(f_score) - return precision, recall, f_score, None + return precision, recall, f_score, None true_pos, _, false_pos, false_neg = _tp_tn_fp_fn(y_true, y_pred, labels, pos_label) @@ -1701,7 +1745,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, return (precision[pos_label_idx], recall[pos_label_idx], fscore[pos_label_idx], support[pos_label_idx]) else: - average_options = (None, 'micro', 'macro', 'weighted') + average_options = (None, 'micro', 'macro', 'weighted', 'example') if average == 'micro': avg_precision = divide(true_pos.sum(), true_pos.sum() + false_pos.sum(), @@ -1712,32 +1756,35 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, dtype=np.double) + + if np.isnan(avg_precision): + avg_precision = 0. + + if np.isnan(avg_recall): + avg_recall = 0. + + if np.isnan(avg_fscore): + avg_fscore = 0. + elif average == 'macro': avg_precision = np.mean(precision) avg_recall = np.mean(recall) avg_fscore = np.mean(fscore) elif average == 'weighted': - avg_precision = np.average(precision, weights=support) - avg_recall = np.average(recall, weights=support) - avg_fscore = np.average(fscore, weights=support) + if np.all(support == 0): + avg_precision = 0. + avg_recall = 0. + avg_fscore = 0. + else: + avg_precision = np.average(precision, weights=support) + avg_recall = np.average(recall, weights=support) + avg_fscore = np.average(fscore, weights=support) else: raise ValueError('average has to be one of ' + str(average_options)) - avg_precision = (avg_precision - if not np.isnan(avg_precision) - else 0) - - avg_recall = (avg_recall - if not np.isnan(avg_recall) - else 0) - - avg_fscore = (avg_fscore - if not np.isnan(avg_fscore) - else 0) - return avg_precision, avg_recall, avg_fscore, None @@ -1769,20 +1816,27 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] If ``None``, the scores for each class are returned. 
Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: ``'macro'``: - Average over classes (does not take imbalance into account). + Average and aggregate over classes (does not take imbalance into + account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. In multilabel - classification, this is true only if every sample has a label. + Average over instances (takes imbalance into account) and aggregate + over classes. This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample has + a label. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average weighted by support (takes imbalance into account) and + aggregate over class. Can result in F-score that is not between + precision and recall. + ``'example'``: + Average and aggregate over instance. [3] Only meaningful and + available in multilabel classification. For binary or multiclass + classification, use the :func:`accuracy_score` function instead. Returns ------- @@ -1826,6 +1880,9 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.44... >>> precision_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + 0.49... + >>> precision_score(y_true, y_pred, average='example') 1.0 >>> precision_score(y_true, y_pred, average=None) ... # doctest: +ELLIPSIS @@ -1842,6 +1899,9 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, 1.0 >>> precision_score(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS + 0.66... + >>> precision_score(y_true, y_pred, average='example') + ... # doctest: +ELLIPSIS 0.5 >>> precision_score(y_true, y_pred, average=None) array([ 1., 1., 0.]) @@ -1881,20 +1941,27 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: ``'macro'``: - Average over classes (does not take imbalance into account). + Average and aggregate over classes (does not take imbalance into + account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. In multilabel - classification, this is true only if every sample has a label. + Average over instances (takes imbalance into account) and aggregate + over classes. This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample has + a label. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average weighted by support (takes imbalance into account) and + aggregate over class. Can result in F-score that is not between + precision and recall. + ``'example'``: + Average and aggregate over instance. [3] Only meaningful and + available in multilabel classification. 
For binary or multiclass + classification, use the :func:`accuracy_score` function instead. Returns ------- @@ -1937,6 +2004,8 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> recall_score(y_true, y_pred, average='micro') 1.0 >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 1.0 + >>> recall_score(y_true, y_pred, average='example') # doctest: +ELLIPSIS 0.44... >>> recall_score(y_true, y_pred, average=None) array([ 1., 1., 1.]) @@ -1950,7 +2019,9 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): 0.66... >>> recall_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.66... - >>> recall_score(y_true, y_pred, average='weighted') + >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.66... + >>> recall_score(y_true, y_pred, average='example') 1.0 >>> recall_score(y_true, y_pred, average=None) array([ 1., 1., 0.]) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 054f61294e1b0..30d1cdeb59784 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -145,6 +145,17 @@ "weighted_recall_score": lambda y1, y2: recall_score(y1, y2, average="weighted"), + "example_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="example", beta=0.5), + "example_f1_score": + lambda y1, y2: f1_score(y1, y2, average="example"), + "example_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="example", beta=2), + "example_precision_score": + lambda y1, y2: precision_score(y1, y2, average="example"), + "example_recall_score": + lambda y1, y2: recall_score(y1, y2, average="example"), + "micro_f0.5_score": lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), "micro_f1_score": @@ -184,6 +195,17 @@ "weighted_recall_score": lambda y1, y2, pos_label=1: recall_score(y1, y2, pos_label=pos_label, average="weighted"), + "example_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="example", beta=0.5), + "example_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="example"), + "example_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="example", beta=2), + "example_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="example"), + "example_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="example"), + "micro_f0.5_score": lambda y1, y2, pos_label=1: fbeta_score(y1, y2, pos_label=pos_label, average="micro", beta=0.5), "micro_f1_score": lambda y1, y2, pos_label=1: @@ -635,6 +657,13 @@ def test_precision_recall_f1_score_multiclass(): fs = f1_score(y_true, y_pred, average='weighted') assert_array_almost_equal(fs, 0.47, 2) + assert_raises(ValueError, precision_score, y_true, y_pred, + average="example") + assert_raises(ValueError, recall_score, y_true, y_pred, average="example") + assert_raises(ValueError, f1_score, y_true, y_pred, average="example") + assert_raises(ValueError, fbeta_score, y_true, y_pred, average="example", + beta=0.5) + # same prediction but with and explicit label ordering p, r, f, s = precision_recall_fscore_support( y_true, y_pred, labels=[0, 2, 1], average=None) @@ -1459,12 +1488,20 @@ def test_precision_recall_f1_score_multilabel_1(): assert_almost_equal(f, 0.5) assert_equal(s, None) + # Check weigted + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + 
average="weighted") + assert_almost_equal(p, 1.5 / 4) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2.5 / 1.5 * 0.25) + assert_equal(s, None) + # Check weigted # |h(x_i) inter y_i | = [0, 1, 1] # |y_i| = [1, 1, 2] # |h(x_i)| = [1, 1, 2] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="weighted") + average="example") assert_almost_equal(p, 0.5) assert_almost_equal(r, 0.5) assert_almost_equal(f, 0.5) @@ -1510,6 +1547,13 @@ def test_precision_recall_f1_score_multilabel_2(): p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") + assert_almost_equal(p, 2 / 4) + assert_almost_equal(r, 1 / 4) + assert_almost_equal(f, 2 / 3 * 2 / 4) + assert_equal(s, None) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="example") # Check weigted # |h(x_i) inter y_i | = [0, 0, 1] # |y_i| = [1, 1, 2] @@ -1557,7 +1601,13 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") - # Check weigted + assert_almost_equal(p, 3 / 4) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, (2 / 1.5 + 1) / 4) + assert_equal(s, None) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="example") # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] @@ -1576,6 +1626,7 @@ def test_precision_recall_f1_no_labels(): #tp = [0, 0, 0] #fn = [0, 0, 0] #fp = [0, 0, 0] + #support = [0, 0, 0] # Check per class assert_array_almost_equal(p, [0, 0, 0], 2) @@ -1599,12 +1650,20 @@ def test_precision_recall_f1_no_labels(): assert_almost_equal(f, 0) assert_equal(s, None) - # # Check weigted + # Check weighted + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, + average="weighted") + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + # # Check example # |h(x_i) inter y_i | = [0, 0, 0] # |y_i| = [0, 0, 0] # |h(x_i)| = [1, 1, 2] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="weighted") + average="example") assert_almost_equal(p, 1) assert_almost_equal(r, 1) assert_almost_equal(f, 1) From eafa3e7db8a81fdf3d52aa94b2ef22e44b38b90c Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Fri, 10 May 2013 09:10:09 +0200 Subject: [PATCH 14/15] FIX confusion between example-base and weighted-based precision recall f-score --- doc/modules/model_evaluation.rst | 60 ++++++++++----- sklearn/metrics/metrics.py | 121 ++++++++++++++----------------- 2 files changed, 99 insertions(+), 82 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 664fdd7806529..5aec4461f581f 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -460,13 +460,19 @@ Moreover, these notions can be further extended. The functions :func:`precision_score` and :func:`recall_score` support an argument called ``average`` which defines the type of averaging: - * ``"macro"``: average over classes (does not take imbalance into account). - * ``"micro"``: average over instances (takes imbalance into account). - * ``"weighted"``: average weighted by support (takes imbalance into account). - It can result in F1 score that is not between precision and recall. + * ``"macro"``: average over classes (does not take imbalance + into account). + * ``"micro"``: aggregate classes and average over instances + (takes imbalance into account). This implies that + ``precision == recall == F1``. 
+ In multilabel classification, this is true only if every sample has a label. + * ``"weighted"``: average over classes weighted by support (takes imbalance + into account). Can result in F-score that is not between + precision and recall. + * ``'example'``: average over instances. Only available and + meaningful with multilabel data. * ``None``: no averaging is performed. - Let's define some notations: * :math:`n_\text{labels}` and :math:`n_\text{samples}` denotes respectively @@ -477,11 +483,13 @@ Let's define some notations: * :math:`tp_j`, :math:`fp_j` and :math:`fn_j` respectively the number of true positives, false positives and false negatives for the :math:`j`-th label; + * :math:`w_j = \frac{tp_j + fn_j}{\sum_{k=0}^{n_\text{labels} - 1} tp_k + f + n_k}` is the weighted support associated to the :math:`j`-th label; * :math:`y_i` is the set of true label and :math:`\hat{y}_i` is the set of predicted for the :math:`i`-th sample; -The macro precision, recall and :math:`F_\beta` are averaged over all labels +The macro precision, recall and :math:`F_\beta` is defined as .. math:: @@ -495,7 +503,7 @@ The macro precision, recall and :math:`F_\beta` are averaged over all labels \texttt{macro\_{}F\_{}beta} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} {F_\beta}_j. -The micro precision, recall and :math:`F_\beta` are averaged over all instances +The micro precision, recall and :math:`F_\beta` is defined as .. math:: @@ -509,21 +517,34 @@ The micro precision, recall and :math:`F_\beta` are averaged over all instances \texttt{micro\_{}F\_{}beta} = (1 + \beta^2) \frac{\texttt{micro\_{}precision} \times \texttt{micro\_{}recall}}{\beta^2 \texttt{micro\_{}precision} + \texttt{micro\_{}recall}}. +The weighted precision, recall and :math:`F_\beta` is defined as + +.. math:: + + \texttt{weighted\_{}precision} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} w_j \texttt{precision}_j, + +.. math:: + + \texttt{weighted\_{}recall} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} w_j \texttt{recall}_j, + +.. math:: + + \texttt{weighted\_{}F\_{}beta} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} w_j {F_\beta}_j. -The weighted precision, recall and :math:`F_\beta` are averaged weighted by -their support + +The example precision, recall and :math:`F_\beta` is defined as .. math:: - \texttt{weighted\_{}precision}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|y_i|}, + \texttt{example\_{}precision}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|y_i|}, .. math:: - \texttt{weighted\_{}recall}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|\hat{y}_i|}, + \texttt{example\_{}recall}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|\hat{y}_i|}, .. math:: - \texttt{weighted\_{}F\_{}beta}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} (1 + \beta^2)\frac{|y_i \cap \hat{y}_i|}{\beta^2 |\hat{y}_i| + |y_i|}. + \texttt{example\_{}F\_{}beta}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} (1 + \beta^2)\frac{|y_i \cap \hat{y}_i|}{\beta^2 |\hat{y}_i| + |y_i|}. 
Here an example where ``average`` is set to ``average`` to ``macro``:: @@ -546,15 +567,20 @@ Here an example where ``average`` is set to to ``micro``:: >>> from sklearn import metrics >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] - >>> metrics.precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + >>> metrics.precision_score(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS 0.33... - >>> metrics.recall_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + >>> metrics.recall_score(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS 0.33... - >>> metrics.f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + >>> metrics.f1_score(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS 0.33... - >>> metrics.fbeta_score(y_true, y_pred, average='micro', beta=0.5) # doctest: +ELLIPSIS + >>> metrics.fbeta_score(y_true, y_pred, average='micro', beta=0.5) + ... # doctest: +ELLIPSIS 0.33... - >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS (0.33..., 0.33..., 0.33..., None) Here an example where ``average`` is set to to ``weighted``:: diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 04fc771d426d0..38b3fb9b339fd 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1109,27 +1109,26 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] + average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: + ``'example'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'macro'``: - Average and aggregate over classes (does not take imbalance into - account). + Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account) and aggregate - over classes. This implies that ``precision == recall == F1``. - In multilabel classification, this is true only if every sample has - a label. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. ``'weighted'``: - Average weighted by support (takes imbalance into account) and - aggregate over class. Can result in F-score that is not between + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between precision and recall. - ``'example'``: - Average and aggregate over instance. [3] Only meaningful and - available in multilabel classification. For binary or multiclass - classification, use the :func:`accuracy_score` function instead. + Returns ------- @@ -1235,27 +1234,25 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, classification, it is used to infer what is a positive label in the label indicator matrix format. 
- average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] + average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: + ``'example'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'macro'``: - Average and aggregate over classes (does not take imbalance into - account). + Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account) and aggregate - over classes. This implies that ``precision == recall == F1``. - In multilabel classification, this is true only if every sample has - a label. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. ``'weighted'``: - Average weighted by support (takes imbalance into account) and - aggregate over class. Can result in F-score that is not between + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between precision and recall. - ``'example'``: - Average and aggregate over instance. [3] Only meaningful and - available in multilabel classification. For binary or multiclass - classification, use the :func:`accuracy_score` function instead. Returns ------- @@ -1522,27 +1519,25 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] + average : string, [None (default), 'example', 'micro', 'macro', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: + ``'example'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'macro'``: - Average and aggregate over classes (does not take imbalance into - account). + Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account) and aggregate - over classes. This implies that ``precision == recall == F1``. - In multilabel classification, this is true only if every sample has - a label. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. ``'weighted'``: - Average weighted by support (takes imbalance into account) and - aggregate over class. Can result in F-score that is not between + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between precision and recall. - ``'example'``: - Average and aggregate over instance. [3] Only meaningful and - available in multilabel classification. For binary or multiclass - classification, use the :func:`accuracy_score` function instead. Returns @@ -1816,27 +1811,25 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, classification, it is used to infer what is a positive label in the label indicator matrix format. 
- average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] + average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: + ``'example'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'macro'``: - Average and aggregate over classes (does not take imbalance into - account). + Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account) and aggregate - over classes. This implies that ``precision == recall == F1``. - In multilabel classification, this is true only if every sample has - a label. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. ``'weighted'``: - Average weighted by support (takes imbalance into account) and - aggregate over class. Can result in F-score that is not between + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between precision and recall. - ``'example'``: - Average and aggregate over instance. [3] Only meaningful and - available in multilabel classification. For binary or multiclass - classification, use the :func:`accuracy_score` function instead. Returns ------- @@ -1941,27 +1934,25 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None (default), 'micro', 'macro', 'weighted', 'example'] + average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: + ``'example'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'macro'``: - Average and aggregate over classes (does not take imbalance into - account). + Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account) and aggregate - over classes. This implies that ``precision == recall == F1``. - In multilabel classification, this is true only if every sample has - a label. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. ``'weighted'``: - Average weighted by support (takes imbalance into account) and - aggregate over class. Can result in F-score that is not between + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between precision and recall. - ``'example'``: - Average and aggregate over instance. [3] Only meaningful and - available in multilabel classification. For binary or multiclass - classification, use the :func:`accuracy_score` function instead. 
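The caveat repeated in these docstrings (micro averaging makes precision,
recall and F1 coincide, but in multilabel data only when every sample has a
label) can be illustrated with a small hypothetical case: a sample with no
true label contributes false positives without matching false negatives, so
the pooled scores diverge. The data below is made up for illustration only::

    import numpy as np
    from sklearn.metrics import f1_score, precision_score, recall_score

    # Hypothetical indicator matrices; the second sample has no true label.
    y_true = np.array([[1, 0, 0],
                       [0, 0, 0]])
    y_pred = np.array([[1, 0, 0],
                       [1, 0, 0]])

    # Pooled counts are tp=1, fp=1, fn=0, so the micro scores no longer agree.
    print(precision_score(y_true, y_pred, average='micro'))  # 0.5
    print(recall_score(y_true, y_pred, average='micro'))     # 1.0
    print(f1_score(y_true, y_pred, average='micro'))         # 0.66...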
Returns ------- From f1b2d6878ce2b2e1b6d77eed7a9a1c92c6be11a3 Mon Sep 17 00:00:00 2001 From: Arnaud Joly Date: Sun, 19 May 2013 19:33:30 +0200 Subject: [PATCH 15/15] ENH rename average='example' to average='samples' --- doc/modules/model_evaluation.rst | 14 +++---- sklearn/metrics/metrics.py | 58 +++++++++++++-------------- sklearn/metrics/tests/test_metrics.py | 56 +++++++++++++------------- 3 files changed, 64 insertions(+), 64 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5aec4461f581f..843fe96e7902e 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -466,11 +466,11 @@ Moreover, these notions can be further extended. The functions (takes imbalance into account). This implies that ``precision == recall == F1``. In multilabel classification, this is true only if every sample has a label. + * ``'samples'``: average over instances. Only available and + meaningful with multilabel data. * ``"weighted"``: average over classes weighted by support (takes imbalance into account). Can result in F-score that is not between precision and recall. - * ``'example'``: average over instances. Only available and - meaningful with multilabel data. * ``None``: no averaging is performed. Let's define some notations: @@ -532,7 +532,7 @@ The weighted precision, recall and :math:`F_\beta` is defined as \texttt{weighted\_{}F\_{}beta} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} w_j {F_\beta}_j. -The example precision, recall and :math:`F_\beta` is defined as +The sample-based precision, recall and :math:`F_\beta` is defined as .. math:: @@ -546,7 +546,7 @@ The example precision, recall and :math:`F_\beta` is defined as \texttt{example\_{}F\_{}beta}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} (1 + \beta^2)\frac{|y_i \cap \hat{y}_i|}{\beta^2 |\hat{y}_i| + |y_i|}. -Here an example where ``average`` is set to ``average`` to ``macro``:: +Here is an example where ``average`` is set to ``average`` to ``macro``:: >>> from sklearn import metrics >>> y_true = [0, 1, 2, 0, 1, 2] @@ -562,7 +562,7 @@ Here an example where ``average`` is set to ``average`` to ``macro``:: >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='macro') # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) -Here an example where ``average`` is set to to ``micro``:: +Here is an example where ``average`` is set to to ``micro``:: >>> from sklearn import metrics >>> y_true = [0, 1, 2, 0, 1, 2] @@ -583,7 +583,7 @@ Here an example where ``average`` is set to to ``micro``:: ... # doctest: +ELLIPSIS (0.33..., 0.33..., 0.33..., None) -Here an example where ``average`` is set to to ``weighted``:: +Here is an example where ``average`` is set to to ``weighted``:: >>> from sklearn import metrics >>> y_true = [0, 1, 2, 0, 1, 2] @@ -603,7 +603,7 @@ Here an example where ``average`` is set to to ``weighted``:: ... 
average='weighted') # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) -Here an example where ``average`` is set to ``None``:: +Here is an example where ``average`` is set to ``None``:: >>> from sklearn import metrics >>> y_true = [0, 1, 2, 0, 1, 2] diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index 38b3fb9b339fd..5eef2868b12f5 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -1109,12 +1109,12 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: - ``'example'``: + ``'samples'``: Average over instance. Only meaningful and available in multilabel classification. ``'macro'``: @@ -1176,7 +1176,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): 0.61... >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS 0.65... - >>> f1_score(y_true, y_pred, average='example') # doctest: +ELLIPSIS + >>> f1_score(y_true, y_pred, average='samples') # doctest: +ELLIPSIS 0.59... >>> f1_score(y_true, y_pred, average=None) array([ 0.5, 0.8, 0.5]) @@ -1192,7 +1192,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): 0.80... >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS 0.66... - >>> f1_score(y_true, y_pred, average='example') # doctest: +ELLIPSIS + >>> f1_score(y_true, y_pred, average='samples') # doctest: +ELLIPSIS 0.5 >>> f1_score(y_true, y_pred, average=None) array([ 1., 1., 0.]) @@ -1234,12 +1234,12 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: - ``'example'``: + ``'samples'``: Average over instance. Only meaningful and available in multilabel classification. ``'macro'``: @@ -1315,7 +1315,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) ... # doctest: +ELLIPSIS 0.54... - >>> fbeta_score(y_true, y_pred, average='example', beta=0.5) + >>> fbeta_score(y_true, y_pred, average='samples', beta=0.5) ... # doctest: +ELLIPSIS 0.66... >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) @@ -1336,7 +1336,7 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) ... # doctest: +ELLIPSIS 0.66... - >>> fbeta_score(y_true, y_pred, average='example', beta=0.5) + >>> fbeta_score(y_true, y_pred, average='samples', beta=0.5) ... # doctest: +ELLIPSIS 0.42... >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) @@ -1519,14 +1519,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, classification, it is used to infer what is a positive label in the label indicator matrix format. 
- average : string, [None (default), 'example', 'micro', 'macro', 'weighted'] + average : string, [None (default), 'micro', 'macro', 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: - ``'example'``: - Average over instance. Only meaningful and available in multilabel - classification. ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: @@ -1534,7 +1531,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, account). This implies that ``precision == recall == F1``. In multilabel classification, this is true only if every sample has a label. - ``'weighted'``: + ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. + ``'weighted'``: Average over classes weighted by support (takes imbalance into account). Can result in F-score that is not between precision and recall. @@ -1613,7 +1613,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS (0.499..., 1.0, 0.65..., None) - >>> precision_recall_fscore_support(y_true, y_pred, average='example') + >>> precision_recall_fscore_support(y_true, y_pred, average='samples') ... # doctest: +ELLIPSIS (1.0, 0.44..., 0.59..., None) @@ -1631,7 +1631,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS (0.66..., 0.66..., 0.66..., None) - >>> precision_recall_fscore_support(y_true, y_pred, average='example') + >>> precision_recall_fscore_support(y_true, y_pred, average='samples') ... # doctest: +ELLIPSIS (0.5, 1.0, 0.5, None) @@ -1649,7 +1649,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, n_labels = labels.size - if average == "example": + if average == "samples": if is_multilabel(y_true): # Handle mix representation if type(y_true) != type(y_pred): @@ -1740,7 +1740,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, return (precision[pos_label_idx], recall[pos_label_idx], fscore[pos_label_idx], support[pos_label_idx]) else: - average_options = (None, 'micro', 'macro', 'weighted', 'example') + average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': avg_precision = divide(true_pos.sum(), true_pos.sum() + false_pos.sum(), @@ -1811,14 +1811,11 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: - ``'example'``: - Average over instance. Only meaningful and available in multilabel - classification. ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: @@ -1826,6 +1823,9 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, account). This implies that ``precision == recall == F1``. In multilabel classification, this is true only if every sample has a label. 
+ ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'weighted'``: Average over classes weighted by support (takes imbalance into account). Can result in F-score that is not between @@ -1875,7 +1875,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, >>> precision_score(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS 0.49... - >>> precision_score(y_true, y_pred, average='example') + >>> precision_score(y_true, y_pred, average='samples') 1.0 >>> precision_score(y_true, y_pred, average=None) ... # doctest: +ELLIPSIS @@ -1893,7 +1893,7 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, >>> precision_score(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS 0.66... - >>> precision_score(y_true, y_pred, average='example') + >>> precision_score(y_true, y_pred, average='samples') ... # doctest: +ELLIPSIS 0.5 >>> precision_score(y_true, y_pred, average=None) @@ -1934,14 +1934,11 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): classification, it is used to infer what is a positive label in the label indicator matrix format. - average : string, [None, 'example', 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: - ``'example'``: - Average over instance. Only meaningful and available in multilabel - classification. ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: @@ -1949,6 +1946,9 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): account). This implies that ``precision == recall == F1``. In multilabel classification, this is true only if every sample has a label. + ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'weighted'``: Average over classes weighted by support (takes imbalance into account). Can result in F-score that is not between @@ -1996,7 +1996,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): 1.0 >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS 1.0 - >>> recall_score(y_true, y_pred, average='example') # doctest: +ELLIPSIS + >>> recall_score(y_true, y_pred, average='samples') # doctest: +ELLIPSIS 0.44... >>> recall_score(y_true, y_pred, average=None) array([ 1., 1., 1.]) @@ -2012,7 +2012,7 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): 0.66... >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS 0.66... 
- >>> recall_score(y_true, y_pred, average='example') + >>> recall_score(y_true, y_pred, average='samples') 1.0 >>> recall_score(y_true, y_pred, average=None) array([ 1., 1., 0.]) diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 30d1cdeb59784..bb7dc4b5539cc 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -145,16 +145,16 @@ "weighted_recall_score": lambda y1, y2: recall_score(y1, y2, average="weighted"), - "example_f0.5_score": - lambda y1, y2: fbeta_score(y1, y2, average="example", beta=0.5), - "example_f1_score": - lambda y1, y2: f1_score(y1, y2, average="example"), - "example_f2_score": - lambda y1, y2: fbeta_score(y1, y2, average="example", beta=2), - "example_precision_score": - lambda y1, y2: precision_score(y1, y2, average="example"), - "example_recall_score": - lambda y1, y2: recall_score(y1, y2, average="example"), + "samples_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="samples", beta=0.5), + "samples_f1_score": + lambda y1, y2: f1_score(y1, y2, average="samples"), + "samples_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="samples", beta=2), + "samples_precision_score": + lambda y1, y2: precision_score(y1, y2, average="samples"), + "samples_recall_score": + lambda y1, y2: recall_score(y1, y2, average="samples"), "micro_f0.5_score": lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), @@ -195,16 +195,16 @@ "weighted_recall_score": lambda y1, y2, pos_label=1: recall_score(y1, y2, pos_label=pos_label, average="weighted"), - "example_f0.5_score": lambda y1, y2, pos_label=1: - fbeta_score(y1, y2, pos_label=pos_label, average="example", beta=0.5), - "example_f1_score": lambda y1, y2, pos_label=1: - f1_score(y1, y2, pos_label=pos_label, average="example"), - "example_f2_score": lambda y1, y2, pos_label=1: - fbeta_score(y1, y2, pos_label=pos_label, average="example", beta=2), - "example_precision_score": lambda y1, y2, pos_label=1: - precision_score(y1, y2, pos_label=pos_label, average="example"), - "example_recall_score": lambda y1, y2, pos_label=1: - recall_score(y1, y2, pos_label=pos_label, average="example"), + "samples_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="samples", beta=0.5), + "samples_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="samples"), + "samples_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="samples", beta=2), + "samples_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="samples"), + "samples_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="samples"), "micro_f0.5_score": lambda y1, y2, pos_label=1: fbeta_score(y1, y2, pos_label=pos_label, average="micro", beta=0.5), @@ -658,10 +658,10 @@ def test_precision_recall_f1_score_multiclass(): assert_array_almost_equal(fs, 0.47, 2) assert_raises(ValueError, precision_score, y_true, y_pred, - average="example") - assert_raises(ValueError, recall_score, y_true, y_pred, average="example") - assert_raises(ValueError, f1_score, y_true, y_pred, average="example") - assert_raises(ValueError, fbeta_score, y_true, y_pred, average="example", + average="samples") + assert_raises(ValueError, recall_score, y_true, y_pred, average="samples") + assert_raises(ValueError, f1_score, y_true, y_pred, average="samples") + assert_raises(ValueError, fbeta_score, y_true, y_pred, 
average="samples", beta=0.5) # same prediction but with and explicit label ordering @@ -1501,7 +1501,7 @@ def test_precision_recall_f1_score_multilabel_1(): # |y_i| = [1, 1, 2] # |h(x_i)| = [1, 1, 2] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="example") + average="samples") assert_almost_equal(p, 0.5) assert_almost_equal(r, 0.5) assert_almost_equal(f, 0.5) @@ -1553,7 +1553,7 @@ def test_precision_recall_f1_score_multilabel_2(): assert_equal(s, None) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="example") + average="samples") # Check weigted # |h(x_i) inter y_i | = [0, 0, 1] # |y_i| = [1, 1, 2] @@ -1607,7 +1607,7 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): assert_equal(s, None) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="example") + average="samples") # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] @@ -1663,7 +1663,7 @@ def test_precision_recall_f1_no_labels(): # |y_i| = [0, 0, 0] # |h(x_i)| = [1, 1, 2] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, - average="example") + average="samples") assert_almost_equal(p, 1) assert_almost_equal(r, 1) assert_almost_equal(f, 1)