amueller
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
Lines changed: 3 additions & 0 deletions
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
Lines changed: 5 additions & 0 deletions
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
Lines changed: 14 additions & 1 deletion
@@ -1067,6 +1067,9 @@ def _cross_val_score(estimator, X, y, scorer, train, test, verbose,
         score = estimator.score(X_test, y_test)
     else:
         score = scorer(estimator, X_test, y_test)
+        if not isinstance(score, numbers.Number):
+            raise ValueError("scoring must return a number, got %s (%s)"
+                             " instead." % (str(score), type(score)))
     if verbose > 1:
         print("score: %f" % score)
     return score
 
@@ -10,6 +10,7 @@
 from itertools import product
 import time
 import warnings
+import numbers
 
 import numpy as np
 
@@ -123,6 +124,10 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, scorer,
         else:
             this_score = clf.score(X_test)
 
+    if not isinstance(this_score, numbers.Number):
+        raise ValueError("scoring must return a number, got %s (%s)"
+                         " instead." % (str(this_score), type(this_score)))
+
     if verbose > 2:
         msg += ", score=%f" % this_score
     if verbose > 1:
 
@@ -9,7 +9,8 @@
 from sklearn.linear_model import Ridge, LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.datasets import make_blobs, load_diabetes
-from sklearn.cross_validation import train_test_split
+from sklearn.cross_validation import train_test_split, cross_val_score
+from sklearn.grid_search import GridSearchCV
 
 
 def test_classification_scores():
@@ -74,3 +75,15 @@ def test_unsupervised_scores():
     score1 = scorers['ari'](km, X_test, y_test)
     score2 = adjusted_rand_score(y_test, km.predict(X_test))
     assert_almost_equal(score1, score2)
+
+
+def test_raises_on_score_list():
+    # Non-scalar scorer output (f1 with average=None) must raise ValueError.
+    X, y = make_blobs(random_state=0)
+    f1_scorer_no_average = AsScorer(f1_score, average=None)
+    clf = DecisionTreeClassifier()
+    assert_raises(ValueError, cross_val_score, clf, X, y,
+                  scoring=f1_scorer_no_average)
+    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
+                               param_grid={'max_depth': [1, 2]})
+    assert_raises(ValueError, grid_search.fit, X, y)