diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5c8d41c033f69..843fe96e7902e 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -45,23 +45,24 @@ Others also work in the multiclass case: .. autosummary:: :template: function.rst - classification_report confusion_matrix - f1_score - fbeta_score - precision_recall_fscore_support - precision_score - recall_score + And some also work in the multilabel case: .. autosummary:: :template: function.rst - accuracy_score - hamming_loss - jaccard_similarity_score - zero_one_loss + accuracy_score + classification_report + f1_score + fbeta_score + hamming_loss + jaccard_similarity_score + precision_recall_fscore_support + precision_score + recall_score + zero_one_loss Some metrics might require probability estimates of the positive class, @@ -459,18 +460,19 @@ Moreover, these notions can be further extended. The functions :func:`precision_score` and :func:`recall_score` support an argument called ``average`` which defines the type of averaging: - * ``"macro"``: average over classes (does not take imbalance into account). - * ``"micro"``: average over instances (takes imbalance into account). - * ``"weighted"``: average weighted by support (takes imbalance into account). - It can result in F1 score that is not between precision and recall. + * ``"macro"``: average over classes (does not take imbalance + into account). + * ``"micro"``: aggregate classes and average over instances + (takes imbalance into account). This implies that + ``precision == recall == F1``. + In multilabel classification, this is true only if every sample has a label. + * ``'samples'``: average over instances. Only available and + meaningful with multilabel data. + * ``"weighted"``: average over classes weighted by support (takes imbalance + into account). Can result in F-score that is not between + precision and recall. * ``None``: no averaging is performed. -.. warning:: - - Currently those functions support only the multiclass case. However the - following definitions are general and remain valid in the multilabel - case. - Let's define some notations: * :math:`n_\text{labels}` and :math:`n_\text{samples}` denotes respectively @@ -481,11 +483,13 @@ Let's define some notations: * :math:`tp_j`, :math:`fp_j` and :math:`fn_j` respectively the number of true positives, false positives and false negatives for the :math:`j`-th label; + * :math:`w_j = \frac{tp_j + fn_j}{\sum_{k=0}^{n_\text{labels} - 1} tp_k + f + n_k}` is the weighted support associated to the :math:`j`-th label; * :math:`y_i` is the set of true label and :math:`\hat{y}_i` is the set of predicted for the :math:`i`-th sample; -The macro precision, recall and :math:`F_\beta` are averaged over all labels +The macro precision, recall and :math:`F_\beta` is defined as .. math:: @@ -499,7 +503,7 @@ The macro precision, recall and :math:`F_\beta` are averaged over all labels \texttt{macro\_{}F\_{}beta} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} {F_\beta}_j. -The micro precision, recall and :math:`F_\beta` are averaged over all instances +The micro precision, recall and :math:`F_\beta` is defined as .. math:: @@ -513,23 +517,36 @@ The micro precision, recall and :math:`F_\beta` are averaged over all instances \texttt{micro\_{}F\_{}beta} = (1 + \beta^2) \frac{\texttt{micro\_{}precision} \times \texttt{micro\_{}recall}}{\beta^2 \texttt{micro\_{}precision} + \texttt{micro\_{}recall}}. 
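To make the difference between the averaging strategies concrete, the macro and micro quantities above can be recomputed by hand from per-label counts. The following is a minimal NumPy sketch (illustrative only, not part of this patch or of the scikit-learn API); the counts correspond to the ``y_true = [0, 1, 2, 0, 1, 2]``, ``y_pred = [0, 2, 1, 0, 0, 1]`` pair used in the examples below::

    import numpy as np

    # per-label counts for y_true = [0, 1, 2, 0, 1, 2], y_pred = [0, 2, 1, 0, 0, 1]
    tp = np.array([2., 0., 0.])   # true positives per label
    fp = np.array([1., 2., 1.])   # false positives per label
    fn = np.array([0., 2., 2.])   # false negatives per label

    # macro: compute the metric per label, then take the unweighted mean
    macro_precision = np.mean(tp / (tp + fp))            # (2/3 + 0 + 0) / 3 = 0.22...
    macro_recall = np.mean(tp / (tp + fn))               # (1 + 0 + 0) / 3 = 0.33...

    # micro: pool the counts over all labels, then compute the metric once
    micro_precision = tp.sum() / (tp.sum() + fp.sum())   # 2 / 6 = 0.33...
    micro_recall = tp.sum() / (tp.sum() + fn.sum())      # 2 / 6 = 0.33...

These hand-computed values match the ``precision_score`` and ``recall_score`` outputs shown in the examples that follow.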
+The weighted precision, recall and :math:`F_\beta` is defined as
+
+.. math::
 
-The weighted precision, recall and :math:`F_\beta` are averaged weighted by
-their support
+  \texttt{weighted\_{}precision} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} w_j \texttt{precision}_j,
 
 .. math::
 
-  \texttt{weighted\_{}precision}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|y_i|},
+  \texttt{weighted\_{}recall} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} w_j \texttt{recall}_j,
 
 .. math::
 
-  \texttt{weighted\_{}recall}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|\hat{y}_i|},
+  \texttt{weighted\_{}F\_{}beta} = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} w_j {F_\beta}_j.
+
+
+The sample-based precision, recall and :math:`F_\beta` is defined as
 
 .. math::
 
-  \texttt{weighted\_{}F\_{}beta}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} (1 + \beta^2)\frac{|y_i \cap \hat{y}_i|}{\beta^2 |\hat{y}_i| + |y_i|}.
+  \texttt{example\_{}precision}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|y_i|},
+
+.. math::
 
-Here an example where ``average`` is set to ``average`` to ``macro``::
+  \texttt{example\_{}recall}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \frac{|y_i \cap \hat{y}_i|}{|\hat{y}_i|},
+
+.. math::
+
+  \texttt{example\_{}F\_{}beta}(y,\hat{y}) &= \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} (1 + \beta^2)\frac{|y_i \cap \hat{y}_i|}{\beta^2 |\hat{y}_i| + |y_i|}.
+
+Here is an example where ``average`` is set to ``macro``::
 
  >>> from sklearn import metrics
  >>> y_true = [0, 1, 2, 0, 1, 2]
 >>> y_pred = [0, 2, 1, 0, 0, 1]
@@ -545,55 +562,66 @@ Here an example where ``average`` is set to ``average`` to ``macro``::
 >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='macro') # doctest: +ELLIPSIS
 (0.22..., 0.33..., 0.26..., None)
 
-Here an example where ``average`` is set to to ``micro``::
+Here is an example where ``average`` is set to ``micro``::
 
 >>> from sklearn import metrics
 >>> y_true = [0, 1, 2, 0, 1, 2]
 >>> y_pred = [0, 2, 1, 0, 0, 1]
- >>> metrics.precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS
+ >>> metrics.precision_score(y_true, y_pred, average='micro')
+ ... # doctest: +ELLIPSIS
 0.33...
- >>> metrics.recall_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS
+ >>> metrics.recall_score(y_true, y_pred, average='micro')
+ ... # doctest: +ELLIPSIS
 0.33...
- >>> metrics.f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS
+ >>> metrics.f1_score(y_true, y_pred, average='micro')
+ ... # doctest: +ELLIPSIS
 0.33...
- >>> metrics.fbeta_score(y_true, y_pred, average='micro', beta=0.5) # doctest: +ELLIPSIS
+ >>> metrics.fbeta_score(y_true, y_pred, average='micro', beta=0.5)
+ ... # doctest: +ELLIPSIS
 0.33...
- >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='micro') # doctest: +ELLIPSIS
+ >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='micro')
+ ... # doctest: +ELLIPSIS
 (0.33..., 0.33..., 0.33..., None)
 
-Here an example where ``average`` is set to to ``weighted``::
+Here is an example where ``average`` is set to ``weighted``::
 
 >>> from sklearn import metrics
 >>> y_true = [0, 1, 2, 0, 1, 2]
 >>> y_pred = [0, 2, 1, 0, 0, 1]
- >>> metrics.precision_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS
+ >>> metrics.precision_score(y_true, y_pred, average='weighted')
+ ... # doctest: +ELLIPSIS
 0.22...
- >>> metrics.recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS
+ >>> metrics.recall_score(y_true, y_pred, average='weighted')
+ ... # doctest: +ELLIPSIS
 0.33...
- >>> metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) # doctest: +ELLIPSIS
+ >>> metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
+ ... # doctest: +ELLIPSIS
 0.23...
 >>> metrics.f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS
 0.26...
- >>> metrics.precision_recall_fscore_support(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS
+ >>> metrics.precision_recall_fscore_support(y_true, y_pred,
+ ...     average='weighted') # doctest: +ELLIPSIS
 (0.22..., 0.33..., 0.26..., None)
 
-Here an example where ``average`` is set to ``None``::
+Here is an example where ``average`` is set to ``None``::
 
 >>> from sklearn import metrics
 >>> y_true = [0, 1, 2, 0, 1, 2]
 >>> y_pred = [0, 2, 1, 0, 0, 1]
- >>> metrics.precision_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS
+ >>> metrics.precision_score(y_true, y_pred, average=None)
+ ... # doctest: +ELLIPSIS
 array([ 0.66..., 0. , 0. ])
 >>> metrics.recall_score(y_true, y_pred, average=None)
 array([ 1., 0., 0.])
 >>> metrics.f1_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS
 array([ 0.8, 0. , 0. ])
- >>> metrics.fbeta_score(y_true, y_pred, average=None, beta=0.5) # doctest: +ELLIPSIS
+ >>> metrics.fbeta_score(y_true, y_pred, average=None, beta=0.5)
+ ... # doctest: +ELLIPSIS
 array([ 0.71..., 0. , 0. ])
- >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5) # doctest: +ELLIPSIS
+ >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5)
+ ... # doctest: +ELLIPSIS
 (array([ 0.66..., 0. , 0. ]), array([ 1., 0., 0.]), array([ 0.71..., 0. , 0. ]), array([2, 2, 2]...))
-
 Hinge loss
 ----------
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 5f9aac42242ef..b9a6a42540cd8 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -26,9 +26,15 @@ Changelog
     `Gilles Louppe`_. See the :ref:`AdaBoost ` section of the user
     guide for details and examples.
 
-   - :func:`metrics.accuracy_score`, :func:`metrics.zero_one_loss` support
-     multi-label classification and two new metrics :func:`metrics.hamming_loss`
-     and :func:`metrics.jaccard_similarity_score`
+   - Multi-label classification support was added to
+     :func:`metrics.accuracy_score`, :func:`metrics.zero_one_loss`,
+     :func:`metrics.f1_score`, :func:`metrics.fbeta_score`,
+     :func:`metrics.classification_report`,
+     :func:`metrics.precision_score` and :func:`metrics.recall_score`
+     by `Arnaud Joly`_.
+
+   - Two new metrics :func:`metrics.hamming_loss` and
+     :func:`metrics.jaccard_similarity_score`
+     are added with multi-label support by `Arnaud Joly`_.
- Speed and memory usage improvements in diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index b88d37422cfe2..5eef2868b12f5 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -114,7 +114,6 @@ def _check_1d_array(y1, y2, ravel=False): Examples -------- - >>> from numpy import array >>> from sklearn.metrics.metrics import _check_1d_array >>> _check_1d_array([1, 2], [[3, 4]]) (array([1, 2]), array([3, 4])) @@ -964,7 +963,7 @@ def jaccard_similarity_score(y_true, y_pred, normalize=True, pos_label=1): np.seterr(**old_err_settings) else: - score = np.empty(len(y_true)) + score = np.empty(len(y_true), dtype=np.float) for i, (true, pred) in enumerate(zip(y_pred, y_true)): true_set = set(true) pred_set = set(pred) @@ -1090,15 +1089,15 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): F1 = 2 * (precision * recall) / (precision + recall) - In the multi-class case, this is the weighted average of the F1 score of - each class. + In the multi-class and multi-label case, this is the weighted average of + the F1 score of each class. Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. labels : array @@ -1106,21 +1105,30 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: + ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between + precision and recall. + Returns ------- @@ -1157,6 +1165,38 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> f1_score(y_true, y_pred, average=None) array([ 0.8, 0. , 0. ]) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import f1_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.59... + >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.61... 
+ >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.65... + >>> f1_score(y_true, y_pred, average='samples') # doctest: +ELLIPSIS + 0.59... + >>> f1_score(y_true, y_pred, average=None) + array([ 0.5, 0.8, 0.5]) + + and with a list of labels format: + + >>> from sklearn.metrics import f1_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.66... + >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.80... + >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.66... + >>> f1_score(y_true, y_pred, average='samples') # doctest: +ELLIPSIS + 0.5 + >>> f1_score(y_true, y_pred, average=None) + array([ 1., 1., 0.]) + """ return fbeta_score(y_true, y_pred, 1, labels=labels, pos_label=pos_label, average=average) @@ -1176,10 +1216,10 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. beta: float @@ -1190,21 +1230,29 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: + ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between + precision and recall. Returns ------- @@ -1240,19 +1288,60 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, >>> from sklearn.metrics import fbeta_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] - >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + ... # doctest: +ELLIPSIS 0.23... - >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + ... # doctest: +ELLIPSIS 0.33... - >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + ... 
# doctest: +ELLIPSIS 0.23... - >>> fbeta_score(y_true, y_pred, average=None, beta=0.5)\ - # doctest: +ELLIPSIS + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + ... # doctest: +ELLIPSIS array([ 0.71..., 0. , 0. ]) + + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import fbeta_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + ... # doctest: +ELLIPSIS + 0.49... + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + 0.5 + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + ... # doctest: +ELLIPSIS + 0.54... + >>> fbeta_score(y_true, y_pred, average='samples', beta=0.5) + ... # doctest: +ELLIPSIS + 0.66... + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + ... # doctest: +ELLIPSIS + array([ 0.38..., 0.71..., 0.38...]) + + and with a list of labels format: + + >>> from sklearn.metrics import fbeta_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + ... # doctest: +ELLIPSIS + 0.66... + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + ... # doctest: +ELLIPSIS + 0.90... + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + ... # doctest: +ELLIPSIS + 0.66... + >>> fbeta_score(y_true, y_pred, average='samples', beta=0.5) + ... # doctest: +ELLIPSIS + 0.42... + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + array([ 1., 1., 0.]) + """ _, _, f, _ = precision_recall_fscore_support(y_true, y_pred, beta=beta, @@ -1262,6 +1351,128 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, return f +def _tp_tn_fp_fn(y_true, y_pred, labels=None, pos_label=1): + """Compute the number of true/false positives/negative for each class + + Parameters + ---------- + y_true : array-like or list of labels or label indicator matrix + Ground truth (correct) labels. + + y_pred : array-like or list of labels or label indicator matrix + Predicted labels, as returned by a classifier. + + labels : array, shape = [n_labels], optional + Integer array of labels. + + pos_label : int, 1 by default + In multilabel classification, it is used to infer what is a positive + label in the label indicator matrix format. 
+ + Returns + ------- + true_pos : array of int, shape = [n_unique_labels] + Number of true positives + + true_neg : array of int, shape = [n_unique_labels] + Number of true negative + + false_pos : array of int, shape = [n_unique_labels] + Number of false positives + + false_pos : array of int, shape = [n_unique_labels] + Number of false positives + + Examples + -------- + In the binary case: + + >>> from sklearn.metrics.metrics import _tp_tn_fp_fn + >>> y_pred = [0, 1, 0, 0] + >>> y_true = [0, 1, 0, 1] + >>> _tp_tn_fp_fn(y_true, y_pred) + (array([2, 1]), array([1, 2]), array([1, 0]), array([0, 1])) + + In the multiclass case: + >>> y_true = np.array([0, 1, 2, 0, 1, 2]) + >>> y_pred = np.array([0, 2, 1, 0, 0, 1]) + >>> _tp_tn_fp_fn(y_true, y_pred) + (array([2, 0, 0]), array([3, 2, 3]), array([1, 2, 1]), array([0, 2, 2])) + + In the multilabel case with binary indicator format: + + >>> _tp_tn_fp_fn(np.array([[0.0, 1.0], [1.0, 1.0]]), np.zeros((2, 2))) + (array([0, 0]), array([1, 0]), array([0, 0]), array([1, 2])) + + and with a list of labels format: + + >>> _tp_tn_fp_fn([(1, 2), (3,)], [(1, 2), tuple()]) # doctest: +ELLIPSIS + (array([1, 1, 0]), array([1, 1, 1]), array([0, 0, 0]), array([0, 0, 1])) + + """ + y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True) + + if labels is None: + labels = unique_labels(y_true, y_pred) + else: + labels = np.asarray(labels, dtype=np.int) + + n_labels = labels.size + true_pos = np.zeros((n_labels), dtype=np.int) + false_pos = np.zeros((n_labels), dtype=np.int) + false_neg = np.zeros((n_labels), dtype=np.int) + + if is_multilabel(y_true): + # Handle mix representation + if type(y_true) != type(y_pred): + labels = unique_labels(y_true, y_pred) + lb = LabelBinarizer() + lb.fit([labels.tolist()]) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + + if is_label_indicator_matrix(y_true): + true_pos = np.sum(np.logical_and(y_true == pos_label, + y_pred == pos_label), axis=0) + false_pos = np.sum(np.logical_and(y_true != pos_label, + y_pred == pos_label), axis=0) + false_neg = np.sum(np.logical_and(y_true == pos_label, + y_pred != pos_label), axis=0) + + else: + idx_to_label = dict((label_i, i) + for i, label_i in enumerate(labels)) + + for true, pred in zip(y_true, y_pred): + true_set = np.array([idx_to_label[l] for l in set(true)], + dtype=np.int) + pred_set = np.array([idx_to_label[l] for l in set(pred)], + dtype=np.int) + true_pos[np.intersect1d(true_set, pred_set)] += 1 + false_pos[np.setdiff1d(pred_set, true_set)] += 1 + false_neg[np.setdiff1d(true_set, pred_set)] += 1 + + else: + y_true, y_pred = check_arrays(y_true, y_pred) + y_true, y_pred = _check_1d_array(y_true, y_pred) + + for i, label_i in enumerate(labels): + true_pos[i] = np.sum(y_pred[y_true == label_i] == label_i) + false_pos[i] = np.sum(y_pred[y_true != label_i] == label_i) + false_neg[i] = np.sum(y_pred[y_true == label_i] != label_i) + + # Compute the true_neg using the tp, fp and fn + if hasattr(y_true, "shape"): + n_samples = (np.max(y_true.shape) if _is_1d(y_true) + else y_true.shape[0]) + else: + n_samples = len(y_true) + + true_neg = n_samples - true_pos - false_pos - false_neg + + return true_pos, true_neg, false_pos, false_neg + + def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1, average=None): """Compute precision, recall, F-measure and support for each class @@ -1284,16 +1495,16 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, The support is the number of occurrences of each class in 
``y_true``. - If ``pos_label is None``, this function returns the average precision, - recall and F-measure if ``average`` is one of ``'micro'``, ``'macro'``, - ``'weighted'``. + If ``pos_label is None`` and in binary classification, this function + returns the average precision, recall and F-measure if ``average`` + is one of ``'micro'``, ``'macro'``, ``'weighted'``. Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. beta : float, 1.0 by default @@ -1304,9 +1515,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. - average : string, [None (default), 'micro', 'macro', 'weighted'] + average : string, [None (default), 'micro', 'macro', 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: @@ -1314,11 +1527,18 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. - ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. + ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. + ``'weighted'``: + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between + precision and recall. + Returns ------- @@ -1342,6 +1562,10 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, .. [2] `Wikipedia entry for the F1-score `_ + .. [3] `Discriminative Methods for Multi-labeled Classification Advances + in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu + Godbole, Sunita Sarawagi + ` Examples -------- @@ -1365,22 +1589,58 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, >>> from sklearn.metrics import precision_recall_fscore_support >>> y_true = np.array([0, 1, 2, 0, 1, 2]) >>> y_pred = np.array([0, 2, 1, 0, 0, 1]) - >>> precision_recall_fscore_support(y_true, y_pred, average='macro')\ - # doctest: +ELLIPSIS + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + ... # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) - >>> precision_recall_fscore_support(y_true, y_pred, average='micro')\ - # doctest: +ELLIPSIS + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + ... 
# doctest: +ELLIPSIS (0.33..., 0.33..., 0.33..., None) - >>> precision_recall_fscore_support(y_true, y_pred, average='weighted')\ - # doctest: +ELLIPSIS + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import precision_recall_fscore_support + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + ... # doctest: +ELLIPSIS + (0.44..., 1.0, 0.59..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS + (0.44..., 1.0, 0.61..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + (0.499..., 1.0, 0.65..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='samples') + ... # doctest: +ELLIPSIS + (1.0, 0.44..., 0.59..., None) + + and with a list of labels format: + + >>> from sklearn.metrics import precision_recall_fscore_support + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + ... # doctest: +ELLIPSIS + (0.66..., 0.66..., 0.66..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + ... # doctest: +ELLIPSIS + (1.0, 0.66..., 0.80..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + (0.66..., 0.66..., 0.66..., None) + >>> precision_recall_fscore_support(y_true, y_pred, average='samples') + ... # doctest: +ELLIPSIS + (0.5, 1.0, 0.5, None) + """ if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") + beta2 = beta ** 2 - y_true, y_pred = check_arrays(y_true, y_pred) - y_true, y_pred = _check_1d_array(y_true, y_pred) + y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True) if labels is None: labels = unique_labels(y_true, y_pred) @@ -1388,16 +1648,65 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, labels = np.asarray(labels, dtype=np.int) n_labels = labels.size - true_pos = np.empty(n_labels, dtype=np.long) - false_pos = np.empty(n_labels, dtype=np.long) - false_neg = np.empty(n_labels, dtype=np.long) - support = np.empty(n_labels, dtype=np.long) - for i, label_i in enumerate(labels): - true_pos[i] = np.sum(y_pred[y_true == label_i] == label_i) - false_pos[i] = np.sum(y_pred[y_true != label_i] == label_i) - false_neg[i] = np.sum(y_pred[y_true == label_i] != label_i) - support[i] = np.sum(y_true == label_i) + if average == "samples": + if is_multilabel(y_true): + # Handle mix representation + if type(y_true) != type(y_pred): + labels = unique_labels(y_true, y_pred) + lb = LabelBinarizer() + lb.fit([labels.tolist()]) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + + if is_label_indicator_matrix(y_true): + y_true_pos_label = y_true == pos_label + y_pred_pos_label = y_pred == pos_label + size_inter = np.sum(np.logical_and(y_true_pos_label, + y_pred_pos_label), axis=1) + size_true = np.sum(y_true_pos_label, axis=1) + size_pred = np.sum(y_pred_pos_label, axis=1) + + else: + size_inter = np.empty(len(y_true), dtype=np.int) + size_true = np.empty(len(y_true), dtype=np.int) + size_pred = np.empty(len(y_true), dtype=np.int) + for i, (true, pred) in enumerate(zip(y_true, y_pred)): + true_set = set(true) + pred_set = set(pred) + size_inter[i] = len(true_set & 
pred_set) + size_pred[i] = len(pred_set) + size_true[i] = len(true_set) + else: + raise ValueError("Example-based precision, recall, fscore is " + "not meaning full outside multilabe" + "classification. See the accuracy_score instead.") + + try: + # oddly, we may get an "invalid" rather than a "divide" error + # here + old_err_settings = np.seterr(divide='ignore', invalid='ignore') + + precision = size_inter / size_true + recall = size_inter / size_pred + f_score = ((1 + beta2 ** 2) * size_inter / + (beta2 * size_pred + size_true)) + finally: + np.seterr(**old_err_settings) + + precision[size_true == 0] = 1.0 + recall[size_pred == 0] = 1.0 + f_score[(beta2 * size_pred + size_true) == 0] = 1.0 + + precision = np.mean(precision) + recall = np.mean(recall) + f_score = np.mean(f_score) + + return precision, recall, f_score, None + + true_pos, _, false_pos, false_neg = _tp_tn_fp_fn(y_true, y_pred, labels, + pos_label) + support = true_pos + false_neg try: # oddly, we may get an "invalid" rather than a "divide" error here @@ -1412,13 +1721,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, recall[(true_pos + false_neg) == 0] = 0.0 # fbeta score - beta2 = beta ** 2 fscore = divide((1 + beta2) * precision * recall, - beta2 * precision + recall, - dtype=np.double) + beta2 * precision + recall) # handle division by 0 in fscore - fscore[(precision + recall) == 0] = 0.0 + fscore[(beta2 * precision + recall) == 0] = 0.0 finally: np.seterr(**old_err_settings) @@ -1433,25 +1740,42 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, return (precision[pos_label_idx], recall[pos_label_idx], fscore[pos_label_idx], support[pos_label_idx]) else: - average_options = (None, 'micro', 'macro', 'weighted') + average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average == 'micro': avg_precision = divide(true_pos.sum(), - true_pos.sum() + false_pos.sum(), - dtype=np.double) + true_pos.sum() + false_pos.sum(), + dtype=np.double) avg_recall = divide(true_pos.sum(), true_pos.sum() + false_neg.sum(), dtype=np.double) avg_fscore = divide((1 + beta2) * (avg_precision * avg_recall), beta2 * avg_precision + avg_recall, dtype=np.double) + + if np.isnan(avg_precision): + avg_precision = 0. + + if np.isnan(avg_recall): + avg_recall = 0. + + if np.isnan(avg_fscore): + avg_fscore = 0. + elif average == 'macro': avg_precision = np.mean(precision) avg_recall = np.mean(recall) avg_fscore = np.mean(fscore) + elif average == 'weighted': - avg_precision = np.average(precision, weights=support) - avg_recall = np.average(recall, weights=support) - avg_fscore = np.average(fscore, weights=support) + if np.all(support == 0): + avg_precision = 0. + avg_recall = 0. + avg_fscore = 0. + else: + avg_precision = np.average(precision, weights=support) + avg_recall = np.average(recall, weights=support) + avg_fscore = np.average(fscore, weights=support) + else: raise ValueError('average has to be one of ' + str(average_options)) @@ -1472,10 +1796,10 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. 
labels : array @@ -1483,9 +1807,11 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: @@ -1493,11 +1819,17 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. + ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between + precision and recall. Returns ------- @@ -1525,12 +1857,49 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, 0.22... >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.33... - >>> precision_score(y_true, y_pred, average='weighted')\ - # doctest: +ELLIPSIS + >>> precision_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS 0.22... >>> precision_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS array([ 0.66..., 0. , 0. ]) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import precision_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> precision_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.44... + >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.44... + >>> precision_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + 0.49... + >>> precision_score(y_true, y_pred, average='samples') + 1.0 + >>> precision_score(y_true, y_pred, average=None) + ... # doctest: +ELLIPSIS + array([ 0.33..., 0.66..., 0.33...]) + + and with a list of labels format: + + >>> from sklearn.metrics import precision_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> precision_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.66... + >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 1.0 + >>> precision_score(y_true, y_pred, average='weighted') + ... # doctest: +ELLIPSIS + 0.66... + >>> precision_score(y_true, y_pred, average='samples') + ... 
# doctest: +ELLIPSIS + 0.5 + >>> precision_score(y_true, y_pred, average=None) + array([ 1., 1., 0.]) + + """ p, _, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, @@ -1550,10 +1919,10 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. labels : array @@ -1561,9 +1930,11 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): pos_label : int, 1 by default If ``average`` is not ``None`` and the classification target is binary, - only this class's scores will be returned. + only this class's scores will be returned. In multilabel + classification, it is used to infer what is a positive label in the + label indicator matrix format. - average : string, [None, 'micro', 'macro', 'weighted' (default)] + average : string, [None, 'micro', 'macro', 'samples', 'weighted' (default)] If ``None``, the scores for each class are returned. Otherwise, unless ``pos_label`` is given in binary classification, this determines the type of averaging performed on the data: @@ -1571,11 +1942,17 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): ``'macro'``: Average over classes (does not take imbalance into account). ``'micro'``: - Average over instances (takes imbalance into account). This - implies that ``precision == recall == F1``. + Aggregate classes and average over instances (takes imbalance into + account). This implies that ``precision == recall == F1``. + In multilabel classification, this is true only if every sample + has a label. + ``'samples'``: + Average over instance. Only meaningful and available in multilabel + classification. ``'weighted'``: - Average weighted by support (takes imbalance into account). Can - result in F-score that is not between precision and recall. + Average over classes weighted by support (takes imbalance into + account). Can result in F-score that is not between + precision and recall. Returns ------- @@ -1608,6 +1985,37 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): >>> recall_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) + In the multilabel case with binary indicator format: + + >>> from sklearn.metrics import recall_score + >>> y_true = np.array([[0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + >>> y_pred = np.ones((3, 3)) + >>> recall_score(y_true, y_pred, average='macro') + 1.0 + >>> recall_score(y_true, y_pred, average='micro') + 1.0 + >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 1.0 + >>> recall_score(y_true, y_pred, average='samples') # doctest: +ELLIPSIS + 0.44... + >>> recall_score(y_true, y_pred, average=None) + array([ 1., 1., 1.]) + + and with a list of labels format: + + >>> from sklearn.metrics import recall_score + >>> y_true = [(1, 2), (3,)] + >>> y_pred = [(1, 2), tuple()] + >>> recall_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS + 0.66... + >>> recall_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS + 0.66... + >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS + 0.66... 
+ >>> recall_score(y_true, y_pred, average='samples') + 1.0 + >>> recall_score(y_true, y_pred, average=None) + array([ 1., 1., 0.]) """ _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, @@ -1642,15 +2050,16 @@ def zero_one_score(y_true, y_pred): ############################################################################### # Multiclass utility function ############################################################################### -def classification_report(y_true, y_pred, labels=None, target_names=None): +def classification_report(y_true, y_pred, labels=None, target_names=None, + pos_label=1): """Build a text report showing the main classification metrics Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Ground truth (correct) target values. - y_pred : array, shape = [n_samples] + y_true : array-like or list of labels or label indicator matrix Estimated targets as returned by a classifier. labels : array, shape = [n_labels] @@ -1659,6 +2068,10 @@ def classification_report(y_true, y_pred, labels=None, target_names=None): target_names : list of strings Optional display names matching the labels (same order). + pos_label : int, 1 by default + In multilabel classification, it is used to infer what is a + positive label in the label indicator matrix format. + Returns ------- report : string @@ -1708,6 +2121,7 @@ class 2 1.00 1.00 1.00 2 p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels, + pos_label=pos_label, average=None) for i, label in enumerate(labels): diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 6d36434f54a72..bb7dc4b5539cc 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -68,14 +68,47 @@ "f2_score": lambda y1, y2: fbeta_score(y1, y2, beta=2), "f0.5_score": lambda y1, y2: fbeta_score(y1, y2, beta=0.5), "matthews_corrcoef_score": matthews_corrcoef, - "auc_score": auc_score, "average_precision_score": average_precision_score, + "weighted_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=0.5), + "weighted_f1_score": + lambda y1, y2: f1_score(y1, y2, average="weighted"), + "weighted_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=2), + "weighted_precision_score": + lambda y1, y2: precision_score(y1, y2, average="weighted"), + "weighted_recall_score": + lambda y1, y2: recall_score(y1, y2, average="weighted"), + + "micro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), + "micro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="micro"), + "micro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=2), + "micro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="micro"), + "micro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="micro"), + + "macro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=0.5), + "macro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="macro"), + "macro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=2), + "macro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="macro"), + "macro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="macro"), + "mean_absolute_error": mean_absolute_error, "mean_squared_error": mean_squared_error, "explained_variance_score": explained_variance_score, - "r2_score": r2_score} + "r2_score": r2_score +} 
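The dictionaries above make it easy to sweep every averaged variant through one generic check. The following is a standalone sketch of that pattern, with hypothetical names rather than the project's actual test helpers::

    from sklearn.metrics import f1_score, precision_score, recall_score

    # a small registry in the same spirit as the dictionaries above
    AVERAGED_METRICS = {
        "macro_f1": lambda y1, y2: f1_score(y1, y2, average="macro"),
        "micro_precision": lambda y1, y2: precision_score(y1, y2, average="micro"),
        "weighted_recall": lambda y1, y2: recall_score(y1, y2, average="weighted"),
    }

    def check_symmetry(metrics, y1, y2, tol=1e-9):
        # report which metrics keep the same value when y_true and y_pred are swapped
        return dict((name, abs(m(y1, y2) - m(y2, y1)) < tol)
                    for name, m in metrics.items())

    y_true = [0, 1, 2, 0, 1, 2]
    y_pred = [0, 2, 1, 0, 0, 1]
    print(check_symmetry(AVERAGED_METRICS, y_true, y_pred))

Registering the variants once and looping over the registry keeps the symmetry, normalization and invariance tests below from repeating the same boilerplate for every averaging mode.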
METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score ": lambda y1, y2, normalize: @@ -101,6 +134,99 @@ "unnormalized_zero_one_loss": lambda y1, y2: zero_one_loss(y1, y2, normalize=False), + "weighted_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=0.5), + "weighted_f1_score": + lambda y1, y2: f1_score(y1, y2, average="weighted"), + "weighted_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=2), + "weighted_precision_score": + lambda y1, y2: precision_score(y1, y2, average="weighted"), + "weighted_recall_score": + lambda y1, y2: recall_score(y1, y2, average="weighted"), + + "samples_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="samples", beta=0.5), + "samples_f1_score": + lambda y1, y2: f1_score(y1, y2, average="samples"), + "samples_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="samples", beta=2), + "samples_precision_score": + lambda y1, y2: precision_score(y1, y2, average="samples"), + "samples_recall_score": + lambda y1, y2: recall_score(y1, y2, average="samples"), + + "micro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), + "micro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="micro"), + "micro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=2), + "micro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="micro"), + "micro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="micro"), + + "macro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=0.5), + "macro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="macro"), + "macro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=2), + "macro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="macro"), + "macro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="macro"), +} + +MULTILABELS_METRICS_WITH_POS_LABELS = { + "jaccard_similarity_score": jaccard_similarity_score, + "unormalized_jaccard_similarity_score": lambda y1, y2, pos_label=1: + jaccard_similarity_score(y1, y2, pos_label=pos_label, normalize=False), + + "weighted_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="weighted", beta=0.5), + "weighted_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="weighted"), + "weighted_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="weighted", beta=2), + "weighted_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="weighted"), + "weighted_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="weighted"), + + "samples_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="samples", beta=0.5), + "samples_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="samples"), + "samples_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="samples", beta=2), + "samples_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="samples"), + "samples_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="samples"), + + "micro_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="micro", beta=0.5), + "micro_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, 
average="micro"), + "micro_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="micro", beta=2), + "micro_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="micro"), + "micro_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="micro"), + + "macro_f0.5_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="macro", beta=0.5), + "macro_f1_score": lambda y1, y2, pos_label=1: + f1_score(y1, y2, pos_label=pos_label, average="macro"), + "macro_f2_score": lambda y1, y2, pos_label=1: + fbeta_score(y1, y2, pos_label=pos_label, average="macro", beta=2), + "macro_precision_score": lambda y1, y2, pos_label=1: + precision_score(y1, y2, pos_label=pos_label, average="macro"), + "macro_recall_score": lambda y1, y2, pos_label=1: + recall_score(y1, y2, pos_label=pos_label, average="macro"), } SYMETRIC_METRICS = { @@ -119,17 +245,54 @@ lambda y1, y2: zero_one_loss(y1, y2, normalize=False), "f1_score": f1_score, + "weighted_f1_score": + lambda y1, y2: f1_score(y1, y2, average="weighted"), + "micro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="micro"), + "macro_f1_score": + lambda y1, y2: f1_score(y1, y2, average="macro"), + "matthews_corrcoef_score": matthews_corrcoef, "mean_absolute_error": mean_absolute_error, - "mean_squared_error": mean_squared_error} + "mean_squared_error": mean_squared_error +} NOT_SYMETRIC_METRICS = { + "explained_variance_score": explained_variance_score, + "r2_score": r2_score, + "precision_score": precision_score, "recall_score": recall_score, "f2_score": lambda y1, y2: fbeta_score(y1, y2, beta=2), "f0.5_score": lambda y1, y2: fbeta_score(y1, y2, beta=0.5), - "explained_variance_score": explained_variance_score, - "r2_score": r2_score} + + "weighted_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=0.5), + "weighted_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="weighted", beta=2), + "weighted_precision_score": + lambda y1, y2: precision_score(y1, y2, average="weighted"), + "weighted_recall_score": + lambda y1, y2: recall_score(y1, y2, average="weighted"), + + "micro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=0.5), + "micro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="micro", beta=2), + "micro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="micro"), + "micro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="micro"), + + "macro_f0.5_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=0.5), + "macro_f2_score": + lambda y1, y2: fbeta_score(y1, y2, average="macro", beta=2), + "macro_precision_score": + lambda y1, y2: precision_score(y1, y2, average="macro"), + "macro_recall_score": + lambda y1, y2: recall_score(y1, y2, average="macro"), +} THRESHOLDED_METRICS = { "auc_score": auc_score, @@ -494,6 +657,13 @@ def test_precision_recall_f1_score_multiclass(): fs = f1_score(y_true, y_pred, average='weighted') assert_array_almost_equal(fs, 0.47, 2) + assert_raises(ValueError, precision_score, y_true, y_pred, + average="samples") + assert_raises(ValueError, recall_score, y_true, y_pred, average="samples") + assert_raises(ValueError, f1_score, y_true, y_pred, average="samples") + assert_raises(ValueError, fbeta_score, y_true, y_pred, average="samples", + beta=0.5) + # same prediction but with and explicit label ordering p, r, f, s = precision_recall_fscore_support( y_true, y_pred, labels=[0, 2, 1], 
average=None) @@ -578,7 +748,25 @@ def test_confusion_matrix_multiclass_subset_labels(): [24, 3]]) -def test_classification_report(): +def test_classification_report_binary_classification_with_pos_label(): + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=True) + + print y_true + expected_report = """\ + precision recall f1-score support + + 0 0.73 0.88 0.80 25 + 1 0.85 0.68 0.76 25 + +avg / total 0.79 0.78 0.78 50 +""" + for pos_label in [0, 1]: + report = classification_report(y_true, y_pred, pos_label=pos_label) + assert_equal(report, expected_report) + + +def test_classification_report_multiclass(): """Test performance report""" iris = datasets.load_iris() y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) @@ -621,6 +809,58 @@ def test_classification_report(): assert_equal(report, expected_report) +def test_multilabel_classification_report(): + + n_classes = 4 + n_samples = 50 + _, y_true_ll = make_multilabel_classification(n_features=1, + n_classes=n_classes, + random_state=0, + n_samples=n_samples) + _, y_pred_ll = make_multilabel_classification(n_features=1, + n_classes=n_classes, + random_state=1, + n_samples=n_samples) + + expected_report = """\ + precision recall f1-score support + + 0 0.39 0.73 0.51 15 + 1 0.57 0.75 0.65 28 + 2 0.33 0.11 0.17 18 + 3 0.44 0.50 0.47 24 + +avg / total 0.45 0.54 0.47 85 +""" + + lb = LabelBinarizer() + lb.fit([range(4)]) + y_true_bi = lb.transform(y_true_ll) + y_pred_bi = lb.transform(y_pred_ll) + + for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: + report = classification_report(y_true, y_pred) + assert_equal(report, expected_report) + + # With a given pos_label + pos_label = 5 + y_true_bi = y_true_bi * pos_label + y_pred_bi = y_pred_bi * pos_label + + expected_report = """\ + precision recall f1-score support + + 0 0.39 0.73 0.51 15 + 1 0.57 0.75 0.65 28 + 2 0.33 0.11 0.17 18 + 3 0.44 0.50 0.47 24 + +avg / total 0.45 0.54 0.47 85 +""" + report = classification_report(y_true_bi, y_pred_bi, pos_label=pos_label) + assert_equal(report, expected_report) + + def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) _test_precision_recall_curve(y_true, probas_pred) @@ -815,60 +1055,50 @@ def test_format_invariance_with_1d_vectors(): measure = metric(y1, y2) - assert_almost_equal(measure, - metric(y1_list, y2_list), + assert_almost_equal(metric(y1_list, y2_list), measure, err_msg="%s is not representation invariant" - "with list" % metric) + "with list" % name) - assert_almost_equal(measure, - metric(y1_1d, y2_1d), + assert_almost_equal(metric(y1_1d, y2_1d), measure, err_msg="%s is not representation invariant" - "with np-array-1d" % metric) + "with np-array-1d" % name) - assert_almost_equal(measure, - metric(y1_column, y2_column), + assert_almost_equal(metric(y1_column, y2_column), measure, err_msg="%s is not representation invariant " - "with np-array-column" % metric) + "with np-array-column" % name) - assert_almost_equal(measure, - metric(y1_row, y2_row), + assert_almost_equal(metric(y1_row, y2_row), measure, err_msg="%s is not representation invariant " - "with np-array-row" % metric) + "with np-array-row" % name) # Mix format support - assert_almost_equal(measure, - metric(y1_1d, y2_list), + assert_almost_equal(metric(y1_1d, y2_list), measure, err_msg="%s is not representation invariant " - "with mix np-array-1d and list" % metric) + "with mix np-array-1d and list" % name) - assert_almost_equal(measure, - metric(y1_list, y2_1d), + 
assert_almost_equal(metric(y1_list, y2_1d), measure, err_msg="%s is not representation invariant " - "with mix np-array-1d and list" % metric) + "with mix np-array-1d and list" % name) - assert_almost_equal(measure, - metric(y1_1d, y2_column), + assert_almost_equal(metric(y1_1d, y2_column), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and np-array-column" - % metric) + % name) - assert_almost_equal(measure, - metric(y1_column, y2_1d), + assert_almost_equal(metric(y1_column, y2_1d), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and np-array-column" - % metric) + % name) - assert_almost_equal(measure, - metric(y1_list, y2_column), + assert_almost_equal(metric(y1_list, y2_column), measure, err_msg="%s is not representation invariant" "with mix list and np-array-column" - % metric) + % name) - assert_almost_equal(measure, - metric(y1_column, y2_list), + assert_almost_equal(metric(y1_column, y2_list), measure, err_msg="%s is not representation invariant" "with mix list and np-array-column" - % metric) + % name) # At the moment, these mix representations aren't allowed assert_raises(ValueError, metric, y1_1d, y2_row) @@ -882,7 +1112,7 @@ def test_format_invariance_with_1d_vectors(): def test_hinge_loss_binary(): y_true = np.array([-1, 1, 1, -1]) pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) - assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision)) + assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4) with warnings.catch_warnings(): # Test deprecated pos_label @@ -892,23 +1122,17 @@ def test_hinge_loss_binary(): y_true = np.array([0, 2, 2, 0]) pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) - assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision)) + + assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4) with warnings.catch_warnings(): # Test deprecated pos_label - assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision, - pos_label=2, neg_label=0)) + assert_equal(hinge_loss(y_true, pred_decision, pos_label=2, neg_label=0), + 1.2 / 4) def test_multioutput_regression(): - y_true = np.array([[1, 0, 0, 1], - [0, 1, 1, 1], - [1, 1, 0, 1], - ]) - - y_pred = np.array([[0, 0, 0, 1], - [1, 0, 1, 1], - [0, 0, 0, 1], - ]) + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) error = mean_squared_error(y_true, y_pred) assert_almost_equal(error, (1. / 3 + 2. / 3 + 2. / 3) / 4.) 
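The expected value in this multioutput regression test can be checked by hand: with rows of equal length, averaging the squared error over outputs and then over samples is the same as averaging over all entries. A short standalone check (plain NumPy, independent of the test helpers)::

    import numpy as np
    from sklearn.metrics import mean_squared_error

    y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]])
    y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]])

    # 5 mismatched entries out of 12, so the mean squared error is 5/12
    manual = np.mean((y_true - y_pred) ** 2)
    assert abs(manual - mean_squared_error(y_true, y_pred)) < 1e-12
    assert abs(manual - (1. / 3 + 2. / 3 + 2. / 3) / 4.) < 1e-12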
@@ -923,15 +1147,8 @@ def test_multioutput_regression(): def test_multioutput_number_of_output_differ(): - y_true = np.array([[1, 0, 0, 1], - [0, 1, 1, 1], - [1, 1, 0, 1], - ]) - - y_pred = np.array([[0, 0], - [1, 0], - [0, 0], - ]) + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0], [1, 0], [0, 0]]) assert_raises(ValueError, mean_squared_error, y_true, y_pred) assert_raises(ValueError, mean_absolute_error, y_true, y_pred) @@ -951,11 +1168,12 @@ def test_multioutput_regression_invariance_to_dimension_shuffling(): for _ in xrange(3): perm = rng.permutation(n_dims) - assert_almost_equal(error, - metric(y_true[:, perm], y_pred[:, perm])) + assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]), + error) def test_multilabel_representation_invariance(): + # Generate some data n_classes = 4 n_samples = 50 @@ -991,57 +1209,48 @@ def test_multilabel_representation_invariance(): measure = metric(y1, y2) # Check representation invariance - assert_almost_equal(measure, - metric(y1_binary_indicator, y2_binary_indicator), + assert_almost_equal(metric(y1_binary_indicator, y2_binary_indicator), + measure, err_msg="%s failed representation invariance " "between list of list of labels format " "and dense binary indicator format." % name) # Check invariance with redundant labels with list of labels - assert_almost_equal(measure, - metric(y1, y2_redundant), + assert_almost_equal(metric(y1, y2_redundant), measure, err_msg="%s failed rendundant label invariance" % name) - assert_almost_equal(measure, - metric(y1_redundant, y2_redundant), + assert_almost_equal(metric(y1_redundant, y2_redundant), measure, err_msg="%s failed rendundant label invariance" % name) - assert_almost_equal(measure, - metric(y1_redundant, y2), + assert_almost_equal(metric(y1_redundant, y2), measure, err_msg="%s failed rendundant label invariance" % name) # Check shuffling invariance with list of labels - assert_almost_equal(measure, - metric(y1_shuffle, y2_shuffle), + assert_almost_equal(metric(y1_shuffle, y2_shuffle), measure, err_msg="%s failed shuffling invariance " "with list of list of labels format." % name) # Check shuffling invariance with dense binary indicator matrix - assert_almost_equal(measure, - metric(y1_shuffle_binary_indicator, - y2_shuffle_binary_indicator), + assert_almost_equal(metric(y1_shuffle_binary_indicator, + y2_shuffle_binary_indicator), measure, err_msg="%s failed shuffling invariance " " with dense binary indicator format." 
                                    % name)
 
         # Check invariance with mix input representation
-        assert_almost_equal(measure,
-                            metric(y1,
-                                   y2_binary_indicator),
+        assert_almost_equal(metric(y1, y2_binary_indicator), measure,
                             err_msg="%s failed mix input representation"
                                     "invariance: y_true in list of list of "
                                     "labels format and y_pred in dense binary"
                                     "indicator format" % name)
 
-        assert_almost_equal(measure,
-                            metric(y1_binary_indicator,
-                                   y2),
+        assert_almost_equal(metric(y1_binary_indicator, y2), measure,
                             err_msg="%s failed mix input representation"
                                     "invariance: y_true in dense binary "
                                     "indicator format and y_pred in list of "
@@ -1051,130 +1260,124 @@ def test_multilabel_representation_invariance():
 
 def test_multilabel_zero_one_loss_subset():
     # Dense label indicator matrix format
-    y1 = np.array([[0, 1, 1],
-                   [1, 0, 1]])
-    y2 = np.array([[0, 0, 1],
-                   [1, 0, 1]])
-
-    assert_equal(0.5, zero_one_loss(y1, y2))
-    assert_equal(0.0, zero_one_loss(y1, y1))
-    assert_equal(0.0, zero_one_loss(y2, y2))
-    assert_equal(1.0, zero_one_loss(y2, np.logical_not(y2)))
-    assert_equal(1.0, zero_one_loss(y1, np.logical_not(y1)))
-    assert_equal(1.0, zero_one_loss(y1, np.zeros(y1.shape)))
-    assert_equal(1.0, zero_one_loss(y2, np.zeros(y1.shape)))
+    y1 = np.array([[0, 1, 1], [1, 0, 1]])
+    y2 = np.array([[0, 0, 1], [1, 0, 1]])
+
+    assert_equal(zero_one_loss(y1, y2), 0.5)
+    assert_equal(zero_one_loss(y1, y1), 0)
+    assert_equal(zero_one_loss(y2, y2), 0)
+    assert_equal(zero_one_loss(y2, np.logical_not(y2)), 1)
+    assert_equal(zero_one_loss(y1, np.logical_not(y1)), 1)
+    assert_equal(zero_one_loss(y1, np.zeros(y1.shape)), 1)
+    assert_equal(zero_one_loss(y2, np.zeros(y1.shape)), 1)
 
     # List of tuple of label
     y1 = [(1, 2,), (0, 2,)]
     y2 = [(2,), (0, 2,)]
 
-    assert_equal(0.5, zero_one_loss(y1, y2))
-    assert_equal(0.0, zero_one_loss(y1, y1))
-    assert_equal(0.0, zero_one_loss(y2, y2))
-    assert_equal(1.0, zero_one_loss(y2, [(), ()]))
-    assert_equal(1.0, zero_one_loss(y2, [tuple(), (10, )]))
+    assert_equal(zero_one_loss(y1, y2), 0.5)
+    assert_equal(zero_one_loss(y1, y1), 0)
+    assert_equal(zero_one_loss(y2, y2), 0)
+    assert_equal(zero_one_loss(y2, [(), ()]), 1)
+    assert_equal(zero_one_loss(y2, [tuple(), (10, )]), 1)
 
 
 def test_multilabel_hamming_loss():
     # Dense label indicator matrix format
-    y1 = np.array([[0, 1, 1],
-                   [1, 0, 1]])
-    y2 = np.array([[0, 0, 1],
-                   [1, 0, 1]])
-
-    assert_equal(1 / 6., hamming_loss(y1, y2))
-    assert_equal(0.0, hamming_loss(y1, y1))
-    assert_equal(0.0, hamming_loss(y2, y2))
-    assert_equal(1.0, hamming_loss(y2, np.logical_not(y2)))
-    assert_equal(1.0, hamming_loss(y1, np.logical_not(y1)))
-    assert_equal(4. / 6, hamming_loss(y1, np.zeros(y1.shape)))
-    assert_equal(0.5, hamming_loss(y2, np.zeros(y1.shape)))
+    y1 = np.array([[0, 1, 1], [1, 0, 1]])
+    y2 = np.array([[0, 0, 1], [1, 0, 1]])
+
+    assert_equal(hamming_loss(y1, y2), 1 / 6)
+    assert_equal(hamming_loss(y1, y1), 0)
+    assert_equal(hamming_loss(y2, y2), 0)
+    assert_equal(hamming_loss(y2, np.logical_not(y2)), 1)
+    assert_equal(hamming_loss(y1, np.logical_not(y1)), 1)
+    assert_equal(hamming_loss(y1, np.zeros(y1.shape)), 4 / 6)
+    assert_equal(hamming_loss(y2, np.zeros(y1.shape)), 0.5)
 
     # List of tuple of label
     y1 = [(1, 2,), (0, 2,)]
-    y2 = [(2,), (0, 2,)]
-    assert_equal(1 / 6., hamming_loss(y1, y2))
-    assert_equal(0.0, hamming_loss(y1, y1))
-    assert_equal(0.0, hamming_loss(y2, y2))
-    assert_equal(0.75, hamming_loss(y2, [(), ()]))
-    assert_equal(0.625, hamming_loss(y1, [tuple(), (10, )]))
-    assert_almost_equal(0.1818, hamming_loss(y2, [tuple(), (10, )],
-                                             classes=np.arange(11)), 2)
+    assert_equal(hamming_loss(y1, y2), 1 / 6)
+    assert_equal(hamming_loss(y1, y1), 0)
+    assert_equal(hamming_loss(y2, y2), 0)
+    assert_equal(hamming_loss(y2, [(), ()]), 0.75)
+    assert_equal(hamming_loss(y1, [tuple(), (10, )]), 0.625)
+    assert_almost_equal(hamming_loss(y2, [tuple(), (10, )],
+                                     classes=np.arange(11)), 0.1818, 2)
 
 
 def test_multilabel_accuracy_score_subset_accuracy():
     # Dense label indicator matrix format
-    y1 = np.array([[0, 1, 1],
-                   [1, 0, 1]])
-    y2 = np.array([[0, 0, 1],
-                   [1, 0, 1]])
-
-    assert_equal(0.5, accuracy_score(y1, y2))
-    assert_equal(1.0, accuracy_score(y1, y1))
-    assert_equal(1.0, accuracy_score(y2, y2))
-    assert_equal(0.0, accuracy_score(y2, np.logical_not(y2)))
-    assert_equal(0.0, accuracy_score(y1, np.logical_not(y1)))
-    assert_equal(0.0, accuracy_score(y1, np.zeros(y1.shape)))
-    assert_equal(0.0, accuracy_score(y2, np.zeros(y1.shape)))
+    y1 = np.array([[0, 1, 1], [1, 0, 1]])
+    y2 = np.array([[0, 0, 1], [1, 0, 1]])
+
+    assert_equal(accuracy_score(y1, y2), 0.5)
+    assert_equal(accuracy_score(y1, y1), 1)
+    assert_equal(accuracy_score(y2, y2), 1)
+    assert_equal(accuracy_score(y2, np.logical_not(y2)), 0)
+    assert_equal(accuracy_score(y1, np.logical_not(y1)), 0)
+    assert_equal(accuracy_score(y1, np.zeros(y1.shape)), 0)
+    assert_equal(accuracy_score(y2, np.zeros(y1.shape)), 0)
 
     # List of tuple of label
     y1 = [(1, 2,), (0, 2,)]
     y2 = [(2,), (0, 2,)]
 
-    assert_equal(0.5, accuracy_score(y1, y2))
-    assert_equal(1.0, accuracy_score(y1, y1))
-    assert_equal(1.0, accuracy_score(y2, y2))
-    assert_equal(0.0, accuracy_score(y2, [(), ()]))
+    assert_equal(accuracy_score(y1, y2), 0.5)
+    assert_equal(accuracy_score(y1, y1), 1)
+    assert_equal(accuracy_score(y2, y2), 1)
+    assert_equal(accuracy_score(y2, [(), ()]), 0)
+    assert_equal(accuracy_score(y1, y2, normalize=False), 1)
+    assert_equal(accuracy_score(y1, y1, normalize=False), 2)
+    assert_equal(accuracy_score(y2, y2, normalize=False), 2)
+    assert_equal(accuracy_score(y2, [(), ()], normalize=False), 0)
 
 
 def test_multilabel_jaccard_similarity_score():
     # Dense label indicator matrix format
-    y1 = np.array([[0.0, 1.0, 1.0],
-                   [1.0, 0.0, 1.0]])
-    y2 = np.array([[0.0, 0.0, 1.0],
-                   [1.0, 0.0, 1.0]])
+    y1 = np.array([[0, 1, 1], [1, 0, 1]])
+    y2 = np.array([[0, 0, 1], [1, 0, 1]])
 
     # size(y1 \inter y2) = [1, 2]
     # size(y1 \union y2) = [2, 2]
 
-    assert_equal(0.75, jaccard_similarity_score(y1, y2))
-    assert_equal(1.0, jaccard_similarity_score(y1, y1))
-
-    assert_equal(1.0, jaccard_similarity_score(y2, y2))
-    assert_equal(0.0, jaccard_similarity_score(y2, np.logical_not(y2)))
-    assert_equal(0.0, jaccard_similarity_score(y1, np.logical_not(y1)))
-    assert_equal(0.0, jaccard_similarity_score(y1, np.zeros(y1.shape)))
-    assert_equal(0.0, jaccard_similarity_score(y2, np.zeros(y1.shape)))
+    assert_equal(jaccard_similarity_score(y1, y2), 0.75)
+    assert_equal(jaccard_similarity_score(y1, y1), 1)
+    assert_equal(jaccard_similarity_score(y2, y2), 1)
+    assert_equal(jaccard_similarity_score(y2, np.logical_not(y2)), 0)
+    assert_equal(jaccard_similarity_score(y1, np.logical_not(y1)), 0)
+    assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0)
+    assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0)
 
     # With a given pos_label
-    assert_equal(0.75, jaccard_similarity_score(y1, y2, pos_label=0))
-    assert_equal(0.5, jaccard_similarity_score(y2, np.zeros(y1.shape),
-                                               pos_label=0))
-    assert_equal(1, jaccard_similarity_score(y1, y2, pos_label=10))
+    assert_equal(jaccard_similarity_score(y1, y2, pos_label=0), 0.75)
+    assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape),
+                                          pos_label=0), 0.5)
+    assert_equal(jaccard_similarity_score(y1, y2, pos_label=10), 1)
 
     # List of tuple of label
     y1 = [(1, 2,), (0, 2,)]
     y2 = [(2,), (0, 2,)]
 
-    assert_equal(0.75, jaccard_similarity_score(y1, y2))
-    assert_equal(1.0, jaccard_similarity_score(y1, y1))
-    assert_equal(1.0, jaccard_similarity_score(y2, y2))
-    assert_equal(0.0, jaccard_similarity_score(y2, [(), ()]))
+    assert_equal(jaccard_similarity_score(y1, y2), 0.75)
+    assert_equal(jaccard_similarity_score(y1, y1), 1)
+    assert_equal(jaccard_similarity_score(y2, y2), 1)
+    assert_equal(jaccard_similarity_score(y2, [(), ()]), 0)
 
     # |y3 inter y4 | = [0, 1, 1]
     # |y3 union y4 | = [2, 1, 3]
     y3 = [(0,), (1,), (3,)]
     y4 = [(4,), (4,), (5, 6)]
-    assert_almost_equal(0, jaccard_similarity_score(y3, y4))
+    assert_almost_equal(jaccard_similarity_score(y3, y4), 0)
 
     # |y5 inter y6 | = [0, 1, 1]
     # |y5 union y6 | = [2, 1, 3]
     y5 = [(0,), (1,), (2, 3)]
     y6 = [(1,), (1,), (2, 0)]
-    assert_almost_equal((1 + 1. / 3) / 3, jaccard_similarity_score(y5, y6))
+    assert_almost_equal(jaccard_similarity_score(y5, y6), (1 + 1 / 3) / 3)
 
 
 def test_normalize_option_binary_classification():
@@ -1241,6 +1444,252 @@ def test_normalize_option_multilabel_classification():
                             msg="We failed to test correctly the normalize option")
 
         assert_almost_equal(metrics(y_true_binary_indicator,
                                     y_pred_binary_indicator, normalize=False)
-                            / n_samples,
-                            measure,
+                            / n_samples, measure,
                             err_msg="Failed with %s" % name)
+
+
+def test_precision_recall_f1_score_multilabel_1():
+    """ Test precision_recall_f1_score on a crafted multilabel example
+    """
+    # First crafted example
+    y_true_ll = [(0,), (1,), (2, 3)]
+    y_pred_ll = [(1,), (1,), (2, 0)]
+    lb = LabelBinarizer()
+    lb.fit([range(4)])
+    y_true_bi = lb.transform(y_true_ll)
+    y_pred_bi = lb.transform(y_pred_ll)
+
+    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average=None)
+        #tp = [0, 1, 1, 0]
+        #fn = [1, 0, 0, 1]
+        #fp = [1, 1, 0, 0]
+
+        # Check per class
+        assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2)
+        assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2)
+        assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2)
+        assert_array_almost_equal(s, [1, 1, 1, 1], 2)
+
+        # Check macro
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="macro")
+        assert_almost_equal(p, 1.5 / 4)
+        assert_almost_equal(r, 0.5)
+        assert_almost_equal(f, 2.5 / 1.5 * 0.25)
+        assert_equal(s, None)
+
+        # Check micro
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="micro")
+        assert_almost_equal(p, 0.5)
+        assert_almost_equal(r, 0.5)
+        assert_almost_equal(f, 0.5)
+        assert_equal(s, None)
+
+        # Check weighted
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="weighted")
+        assert_almost_equal(p, 1.5 / 4)
+        assert_almost_equal(r, 0.5)
+        assert_almost_equal(f, 2.5 / 1.5 * 0.25)
+        assert_equal(s, None)
+
+        # Check samples
+        # |h(x_i) inter y_i | = [0, 1, 1]
+        # |y_i| = [1, 1, 2]
+        # |h(x_i)| = [1, 1, 2]
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="samples")
+        assert_almost_equal(p, 0.5)
+        assert_almost_equal(r, 0.5)
+        assert_almost_equal(f, 0.5)
+        assert_equal(s, None)
+
+
+def test_precision_recall_f1_score_multilabel_2():
+    """ Test precision_recall_f1_score on a crafted multilabel example 2
+    """
+    # Second crafted example
+    y_true_ll = [(1,), (2,), (2, 3)]
+    y_pred_ll = [(4,), (4,), (2, 1)]
+    lb = LabelBinarizer()
+    lb.fit([range(1, 5)])
+    y_true_bi = lb.transform(y_true_ll)
+    y_pred_bi = lb.transform(y_pred_ll)
+
+    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
+        # tp = [ 0. 1. 0. 0.]
+        # fp = [ 1. 0. 0. 2.]
+        # fn = [ 1. 1. 1. 0.]
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average=None)
+        assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2)
+        assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2)
+        assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2)
+        assert_array_almost_equal(s, [1, 2, 1, 0], 2)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="micro")
+        assert_almost_equal(p, 0.25)
+        assert_almost_equal(r, 0.25)
+        assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5)
+        assert_equal(s, None)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="macro")
+        assert_almost_equal(p, 0.25)
+        assert_almost_equal(r, 0.125)
+        assert_almost_equal(f, 2 / 12)
+        assert_equal(s, None)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="weighted")
+        assert_almost_equal(p, 2 / 4)
+        assert_almost_equal(r, 1 / 4)
+        assert_almost_equal(f, 2 / 3 * 2 / 4)
+        assert_equal(s, None)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="samples")
+        # Check samples
+        # |h(x_i) inter y_i | = [0, 0, 1]
+        # |y_i| = [1, 1, 2]
+        # |h(x_i)| = [1, 1, 2]
+        assert_almost_equal(p, 1 / 6)
+        assert_almost_equal(r, 1 / 6)
+        assert_almost_equal(f, 2 / 4 * 1 / 3)
+        assert_equal(s, None)
+
+
+def test_precision_recall_f1_score_with_an_empty_prediction():
+    y_true_ll = [(1,), (0,), (2, 1,)]
+    y_pred_ll = [tuple(), (3,), (2, 1)]
+
+    lb = LabelBinarizer()
+    lb.fit([range(4)])
+    y_true_bi = lb.transform(y_true_ll)
+    y_pred_bi = lb.transform(y_pred_ll)
+
+    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
+        # true_pos = [ 0. 1. 1. 0.]
+        # false_pos = [ 0. 0. 0. 1.]
+        # false_neg = [ 1. 1. 0. 0.]
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average=None)
+        assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2)
+        assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2)
+        assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2)
+        assert_array_almost_equal(s, [1, 2, 1, 0], 2)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="macro")
+        assert_almost_equal(p, 0.5)
+        assert_almost_equal(r, 1.5 / 4)
+        assert_almost_equal(f, 2.5 / (4 * 1.5))
+        assert_equal(s, None)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="micro")
+        assert_almost_equal(p, 2 / 3)
+        assert_almost_equal(r, 0.5)
+        assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5))
+        assert_equal(s, None)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="weighted")
+        assert_almost_equal(p, 3 / 4)
+        assert_almost_equal(r, 0.5)
+        assert_almost_equal(f, (2 / 1.5 + 1) / 4)
+        assert_equal(s, None)
+
+        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                     average="samples")
+        # |h(x_i) inter y_i | = [0, 0, 2]
+        # |y_i| = [1, 1, 2]
+        # |h(x_i)| = [0, 1, 2]
+        assert_almost_equal(p, 1 / 3)
+        assert_almost_equal(r, 2 / 3)
+        assert_almost_equal(f, 1 / 3)
+        assert_equal(s, None)
+
+
+def test_precision_recall_f1_no_labels():
+    y_true = np.zeros((20, 3))
+    y_pred = np.zeros_like(y_true)
+
+    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                 average=None)
+    #tp = [0, 0, 0]
+    #fn = [0, 0, 0]
+    #fp = [0, 0, 0]
+    #support = [0, 0, 0]
+
+    # Check per class
+    assert_array_almost_equal(p, [0, 0, 0], 2)
+    assert_array_almost_equal(r, [0, 0, 0], 2)
+    assert_array_almost_equal(f, [0, 0, 0], 2)
+    assert_array_almost_equal(s, [0, 0, 0], 2)
+
+    # Check macro
+    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                 average="macro")
+    assert_almost_equal(p, 0)
+    assert_almost_equal(r, 0)
+    assert_almost_equal(f, 0)
+    assert_equal(s, None)
+
+    # Check micro
+    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                 average="micro")
+    assert_almost_equal(p, 0)
+    assert_almost_equal(r, 0)
+    assert_almost_equal(f, 0)
+    assert_equal(s, None)
+
+    # Check weighted
+    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                 average="weighted")
+    assert_almost_equal(p, 0)
+    assert_almost_equal(r, 0)
+    assert_almost_equal(f, 0)
+    assert_equal(s, None)
+
+    # Check samples
+    # |h(x_i) inter y_i | = [0, 0, 0]
+    # |y_i| = [0, 0, 0]
+    # |h(x_i)| = [1, 1, 2]
+    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
+                                                 average="samples")
+    assert_almost_equal(p, 1)
+    assert_almost_equal(r, 1)
+    assert_almost_equal(f, 1)
+    assert_equal(s, None)
+
+
+def test_multilabel_invariance_with_pos_labels():
+    n_classes = 4
+    n_samples = 50
+    _, y1 = make_multilabel_classification(n_features=1, n_classes=n_classes,
+                                           random_state=0, n_samples=n_samples)
+    _, y2 = make_multilabel_classification(n_features=1, n_classes=n_classes,
+                                           random_state=1, n_samples=n_samples)
+
+    lb = LabelBinarizer().fit([range(n_classes)])
+    y1_binary_indicator = lb.transform(y1)
+    y2_binary_indicator = lb.transform(y2)
+
+    for name, metric in MULTILABELS_METRICS_WITH_POS_LABELS.items():
+        measure = metric(y1, y2)
+
+        for pos_label in [1, 3]:
+            assert_almost_equal(metric(y1_binary_indicator * pos_label,
+                                       y2_binary_indicator * pos_label,
+                                       pos_label=pos_label),
+                                measure,
+                                err_msg="%s is not representation invariant "
+                                        "with pos_label=%s"
+                                        % (name, pos_label))
diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py
index af61e00d47832..bc7e760ce84c5 100644
--- a/sklearn/tests/test_multiclass.py
+++ b/sklearn/tests/test_multiclass.py
@@ -13,6 +13,9 @@
 from sklearn.multiclass import OneVsOneClassifier
 from sklearn.multiclass import OutputCodeClassifier
 
+from sklearn.metrics import precision_score
+from sklearn.metrics import recall_score
+
 from sklearn.svm import LinearSVC
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, Ridge,
@@ -31,30 +34,6 @@
 n_classes = 3
 
 
-# FIXME: - should use sets
-#        - should move to metrics module
-def multilabel_precision(Y_true, Y_pred):
-    n_predictions = 0
-    n_correct = 0
-    for i in range(len(Y_true)):
-        n_predictions += len(Y_pred[i])
-        for label in Y_pred[i]:
-            if label in Y_true[i]:
-                n_correct += 1
-    return float(n_correct) / n_predictions
-
-
-def multilabel_recall(Y_true, Y_pred):
-    n_labels = 0
-    n_correct = 0
-    for i in range(len(Y_true)):
-        n_labels += len(Y_true[i])
-        for label in Y_pred[i]:
-            if label in Y_true[i]:
-                n_correct += 1
-    return float(n_correct) / n_labels
-
-
 def test_ovr_exceptions():
     ovr = OneVsRestClassifier(LinearSVC(random_state=0))
     assert_raises(ValueError, ovr.predict, [])
@@ -141,9 +120,11 @@ def test_ovr_multilabel_dataset():
         clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
         Y_pred = clf.predict(X_test)
         assert_true(clf.multilabel_)
-        assert_almost_equal(multilabel_precision(Y_test, Y_pred), prec,
+        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
+                            prec,
                             decimal=2)
-        assert_almost_equal(multilabel_recall(Y_test, Y_pred), recall,
+        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
+                            recall,
                             decimal=2)