Merge pull request #2717 from Manoj-Kumar-S/test_log_loss · jwchennlp/scikit-learn@8f2d8b9

Commit 8f2d8b9 (2 parents: 834b375 + c9714b5)

Merge pull request scikit-learn#2717 from Manoj-Kumar-S/test_log_loss

Testing log_loss and hinge_loss under THRESHOLDED_METRICS

File tree: 2 files changed (+61, -9 lines)

sklearn/metrics/metrics.py

Lines changed: 29 additions & 3 deletions
@@ -257,6 +257,7 @@ def hinge_loss(y_true, pred_decision, pos_label=None, neg_label=None):
                   "release 0.15.", DeprecationWarning)
 
     # TODO: multi-class hinge-loss
+    y_true, pred_decision = check_arrays(y_true, pred_decision)
 
     # the rest of the code assumes that positive and negative labels
     # are encoded as +1 and -1 respectively
@@ -265,7 +266,14 @@ def hinge_loss(y_true, pred_decision, pos_label=None, neg_label=None):
     else:
         y_true = LabelBinarizer(neg_label=-1).fit_transform(y_true)[:, 0]
 
-    margin = y_true * np.asarray(pred_decision)
+    if pred_decision.ndim == 2 and pred_decision.shape[1] != 1:
+        raise ValueError("Multi-class hinge loss not supported")
+    pred_decision = np.ravel(pred_decision)
+
+    try:
+        margin = y_true * pred_decision
+    except TypeError:
+        raise ValueError("pred_decision should be an array of floats.")
     losses = 1 - margin
     # The hinge doesn't penalize good enough predictions.
     losses[losses <= 0] = 0
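
For orientation, the arithmetic these new guards protect is the plain binary hinge. A standalone numpy sketch mirroring the margin/losses lines above (made-up numbers, not a call into scikit-learn):

    import numpy as np

    # Labels already encoded as +1 / -1, decision values as floats;
    # a 2-d multi-column pred_decision would now be rejected upstream.
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-2.2, 1.3, 0.4, 0.7])

    margin = y_true * pred_decision     # positive margin = correct side
    losses = 1 - margin
    losses[losses <= 0] = 0             # good-enough predictions cost nothing
    print(np.mean(losses))              # 0.575
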
@@ -1015,10 +1023,28 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True):
     if T.shape[1] == 1:
         T = np.append(1 - T, T, axis=1)
 
-    # Clip and renormalize
+    # Clipping
     Y = np.clip(y_pred, eps, 1 - eps)
-    Y /= Y.sum(axis=1)[:, np.newaxis]
 
+    # This happens in cases when elements in y_pred have type "str".
+    if not isinstance(Y, np.ndarray):
+        raise ValueError("y_pred should be an array of floats.")
+
+    # If y_pred is of single dimension, assume y_true to be binary
+    # and then check.
+    if Y.ndim == 1:
+        Y = Y[:, np.newaxis]
+    if Y.shape[1] == 1:
+        Y = np.append(1 - Y, Y, axis=1)
+
+    # Check if dimensions are consistent.
+    T, Y = check_arrays(T, Y)
+    if T.shape[1] != Y.shape[1]:
+        raise ValueError("y_true and y_pred have different number of classes "
+                         "%d, %d" % (T.shape[1], Y.shape[1]))
+
+    # Renormalize
+    Y /= Y.sum(axis=1)[:, np.newaxis]
     loss = -(T * np.log(Y)).sum()
     return loss / T.shape[0] if normalize else loss
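
The subtle branch above is the 1-d case: a single probability column is treated as P(class 1) and expanded to two columns before the loss is taken. A plain-numpy sketch of that path (made-up probabilities, not sklearn itself):

    import numpy as np

    p = np.array([0.1, 0.9, 0.8])             # P(class 1) for three samples
    Y = p[:, np.newaxis]                      # shape (3,) -> (3, 1)
    Y = np.append(1 - Y, Y, axis=1)           # [[0.9, 0.1], [0.1, 0.9], [0.2, 0.8]]

    T = np.array([[1, 0], [0, 1], [0, 1]])    # binarized y_true = [0, 1, 1]
    Y /= Y.sum(axis=1)[:, np.newaxis]         # no-op here: rows already sum to 1
    print(-(T * np.log(Y)).sum() / T.shape[0])  # ~0.144622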

sklearn/metrics/tests/test_metrics.py

Lines changed: 32 additions & 6 deletions
@@ -147,6 +147,8 @@
 }
 
 THRESHOLDED_METRICS = {
+    "log_loss": log_loss,
+    "hinge_loss": hinge_loss,
     "roc_auc_score": roc_auc_score,
     "weighted_roc_auc": partial(roc_auc_score, average="weighted"),
     "samples_roc_auc": partial(roc_auc_score, average="samples"),
@@ -198,7 +200,7 @@
 
 # Metrics with a "pos_label" argument
 METRICS_WITH_POS_LABEL = [
-    "roc_curve",
+    "roc_curve", "hinge_loss",
 
     "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",
 
@@ -238,7 +240,7 @@
 # Threshold-based metrics with "multilabel-indicator" format support
 THRESHOLDED_MULTILABEL_METRICS = [
     "roc_auc_score", "weighted_roc_auc", "samples_roc_auc",
-    "micro_roc_auc", "macro_roc_auc",
+    "micro_roc_auc", "macro_roc_auc", "log_loss",
 
     "average_precision_score", "weighted_average_precision_score",
     "samples_average_precision_score", "micro_average_precision_score",
@@ -303,7 +305,7 @@
     "micro_recall_score",
 
     "macro_f0.5_score", "macro_f2_score", "macro_precision_score",
-    "macro_recall_score",
+    "macro_recall_score", "log_loss", "hinge_loss"
 ]
 
 ###############################################################################
@@ -1496,9 +1498,22 @@ def test_invariance_string_vs_numbers_labels():
                                 err_msg="{0} failed string vs number "
                                         "invariance test".format(name))
 
-    # TODO Currently not supported
-    for name, metrics in THRESHOLDED_METRICS.items():
-        assert_raises(ValueError, metrics, y1_str, y2_str)
+    for name, metric in THRESHOLDED_METRICS.items():
+        if name in ("log_loss", "hinge_loss"):
+            measure_with_number = metric(y1, y2)
+            measure_with_str = metric(y1_str, y2)
+            assert_array_equal(measure_with_number, measure_with_str,
+                               err_msg="{0} failed string vs number invariance "
+                                       "test".format(name))
+
+            measure_with_strobj = metric(y1_str.astype('O'), y2)
+            assert_array_equal(measure_with_number, measure_with_strobj,
+                               err_msg="{0} failed string object vs number "
+                                       "invariance test".format(name))
+        else:
+            # TODO: these metrics don't support string labels yet
+            assert_raises(ValueError, metric, y1_str, y2)
+            assert_raises(ValueError, metric, y1_str.astype('O'), y2)
 
 
 @ignore_warnings
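
Outside the test suite, the new invariance is easy to reproduce; the scores below are made up, and this assumes the post-merge log_loss behavior (string labels binarize the same way as their numeric counterparts):

    import numpy as np
    from sklearn.metrics import log_loss

    y_num = np.array([0, 1, 1, 0])
    y_str = np.array(["ham", "spam", "spam", "ham"])
    scores = np.array([0.1, 0.8, 0.9, 0.3])

    # String, object-dtype, and integer labels now give the same loss.
    assert np.isclose(log_loss(y_num, scores), log_loss(y_str, scores))
    assert np.isclose(log_loss(y_num, scores), log_loss(y_str.astype('O'), scores))
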
@@ -2370,6 +2385,17 @@ def test_log_loss():
     loss = log_loss(y_true, y_pred, normalize=True, eps=.1)
     assert_almost_equal(loss, log_loss(y_true, np.clip(y_pred, .1, .9)))
 
+    # Raise an error if the numbers of classes in y_true and y_pred differ.
+    y_true = [1, 0, 2]
+    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1]]
+    assert_raises(ValueError, log_loss, y_true, y_pred)
+
+    # Case where y_true is an array of string objects.
+    y_true = ["ham", "spam", "spam", "ham"]
+    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]
+    loss = log_loss(y_true, y_pred)
+    assert_almost_equal(loss, 1.0383217, decimal=6)
+
 
 @ignore_warnings
 def _check_averaging(metric, y_true, y_pred, y_true_binarize, y_pred_binarize,
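
The expected constant 1.0383217 in the new test can be reproduced by hand from the renormalization step in metrics.py. A plain-numpy check; the [1, 0] row for "ham" assumes LabelBinarizer's sorted class order ("ham" < "spam"):

    import numpy as np

    T = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])   # "ham"/"spam" binarized
    Y = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])

    Y = np.clip(Y, 1e-15, 1 - 1e-15)
    Y /= Y.sum(axis=1)[:, np.newaxis]   # row sums were 0.9, 1.1, 0.5, 0.9
    loss = -(T * np.log(Y)).sum() / T.shape[0]
    print(round(loss, 6))               # 1.038322, matching the test to 6 decimals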
