-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[MRG] Learning curves #2701
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG] Learning curves #2701
Changes from all commits
0ff07a0
be6e185
96fc078
2f6373d
811838a
0972b2e
30fd1c0
a0b1190
ef23d62
5fccd17
b3cb14e
d3e52f6
08a0b49
cd41b09
9fc5f3b
6a516b4
39dcb68
c42cae6
9dd0601
cec31e2
a64ef4e
36be74d
b9c838d
ff1aef4
3283298
754a104
9ede03c
93f1acb
9748127
822bd7b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
""" | ||
======================== | ||
Plotting Learning Curves | ||
======================== | ||
|
||
A learning curve shows the validation and training score of a learning | ||
algorithm for varying numbers of training samples. It is a tool to | ||
find out how much we benefit from adding more training data. If both | ||
the validation score and the training score converge to a value that is | ||
too low, we will not benefit much from more training data and we will | ||
probably have to use a learning algorithm or a parametrization of the | ||
current learning algorithm that can learn more complex concepts (i.e. | ||
has a lower bias). | ||
|
||
In this example, on the left side the learning curve of a naive Bayes | ||
classifier is shown for the digits dataset. Note that the training score | ||
and the cross-validation score are both not very good at the end. However, | ||
the shape of the curve can be found in more complex datasets very often: | ||
the training score is very high at the beginning and decreases and the | ||
cross-validation score is very low at the beginning and increases. On the | ||
right side we see the learning curve of an SVM with RBF kernel. We can | ||
see clearly that the training score is still around the maximum and the | ||
validation score could be increased with more training samples. | ||
""" | ||
print(__doc__) | ||
|
||
import matplotlib.pyplot as plt | ||
from sklearn.naive_bayes import GaussianNB | ||
from sklearn.svm import SVC | ||
from sklearn.datasets import load_digits | ||
from sklearn.learning_curve import learning_curve | ||
|
||
|
||
digits = load_digits() | ||
X, y = digits.data, digits.target | ||
|
||
plt.figure() | ||
plt.title("Learning Curve (Naive Bayes)") | ||
plt.xlabel("Training examples") | ||
plt.ylabel("Score") | ||
train_sizes, train_scores, test_scores = learning_curve( | ||
GaussianNB(), X, y, cv=10, n_jobs=1) | ||
plt.plot(train_sizes, train_scores, label="Training score") | ||
plt.plot(train_sizes, test_scores, label="Cross-validation score") | ||
plt.legend(loc="best") | ||
|
||
plt.figure() | ||
plt.title("Learning Curve (SVM, RBF kernel, $\gamma=0.001$)") | ||
plt.xlabel("Training examples") | ||
plt.ylabel("Score") | ||
train_sizes, train_scores, test_scores = learning_curve( | ||
SVC(gamma=0.001), X, y, cv=10, n_jobs=1) | ||
plt.plot(train_sizes, train_scores, label="Training score") | ||
plt.plot(train_sizes, test_scores, label="Cross-validation score") | ||
plt.legend(loc="best") | ||
|
||
plt.show() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -236,59 +236,79 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, | |
print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')) | ||
|
||
# update parameters of the classifier after a copy of its base structure | ||
clf = clone(base_estimator) | ||
clf.set_params(**parameters) | ||
estimator = clone(base_estimator) | ||
estimator.set_params(**parameters) | ||
|
||
if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel): | ||
X_train, y_train = _split(estimator, X, y, train) | ||
X_test, y_test = _split(estimator, X, y, test, train) | ||
_fit(estimator.fit, X_train, y_train, **fit_params) | ||
this_score = _score(estimator, X_test, y_test, scorer) | ||
|
||
if verbose > 2: | ||
msg += ", score=%f" % this_score | ||
if verbose > 1: | ||
end_msg = "%s -%s" % (msg, | ||
logger.short_format_time(time.time() - | ||
start_time)) | ||
print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) | ||
|
||
return this_score, parameters, _num_samples(X_test) | ||
|
||
|
||
def _split(estimator, X, y, indices, train_indices=None):
    """Create a subset of a dataset.

    Parameters
    ----------
    estimator : estimator object
        Only inspected for a callable ``kernel`` attribute (custom kernels
        are rejected) and for the ``_pairwise`` flag (precomputed
        kernel/affinity matrices).
    X : array-like, sparse matrix, precomputed kernel matrix, or list
        Data to take the subset from.
    y : array-like or None
        Targets; ``None`` in the unsupervised case.
    indices : array-like of int
        Indices of the samples that form the subset.
    train_indices : array-like of int or None
        Only used when ``X`` is a precomputed square kernel matrix: the
        column indices (training samples) to slice against.  If ``None``,
        ``indices`` is used for both axes.

    Returns
    -------
    X_subset, y_subset
        The selected samples; ``y_subset`` is ``None`` when ``y`` is ``None``.
    """
    if hasattr(estimator, 'kernel') and callable(estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        # X is list-like; a pairwise matrix must be an array/sparse matrix.
        if getattr(estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_subset = [X[idx] for idx in indices]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            if train_indices is None:
                X_subset = X[np.ix_(indices, indices)]
            else:
                X_subset = X[np.ix_(indices, train_indices)]
        else:
            X_subset = X[safe_mask(X, indices)]

    if y is not None:
        y_subset = y[safe_mask(y, indices)]
    else:
        y_subset = None

    return X_subset, y_subset
|
||
|
||
if scorer is not None: | ||
this_score = scorer(clf, X_test, y_test) | ||
def _fit(fit_function, X_train, y_train, **fit_params):
    """Fit an estimator on a given training set.

    Calls ``fit_function(X_train, **fit_params)`` when ``y_train`` is
    ``None`` (unsupervised case), otherwise
    ``fit_function(X_train, y_train, **fit_params)``.
    """
    if y_train is None:
        fit_function(X_train, **fit_params)
    else:
        fit_function(X_train, y_train, **fit_params)
|
||
|
||
def _score(estimator, X_test, y_test, scorer):
    """Compute the score of an estimator on a given test set.

    Uses ``scorer`` when one is given, otherwise falls back to the
    estimator's own ``score`` method.  ``y_test`` may be ``None``
    (unsupervised case), in which case it is simply not passed along.

    Raises
    ------
    ValueError
        If the computed score is not a number.
    """
    if y_test is None:
        if scorer is None:
            this_score = estimator.score(X_test)
        else:
            this_score = scorer(estimator, X_test)
    else:
        if scorer is None:
            this_score = estimator.score(X_test, y_test)
        else:
            this_score = scorer(estimator, X_test, y_test)

    if not isinstance(this_score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s)"
                         " instead." % (str(this_score), type(this_score)))

    return this_score
|
||
|
||
def _check_param_grid(param_grid): | ||
|
@@ -331,6 +351,24 @@ def __repr__(self): | |
self.parameters) | ||
|
||
|
||
def _check_scorable(estimator, scoring=None, loss_func=None, score_func=None):
    """Check that estimator can be fitted and score can be computed.

    Raises
    ------
    TypeError
        If the estimator lacks a ``fit`` method together with either
        ``predict`` or ``score``, or if no scoring callable
        (``scoring``/``loss_func``/``score_func``) is given and the
        estimator has no ``score`` method of its own.
    """
    if (not hasattr(estimator, 'fit') or
            not (hasattr(estimator, 'predict')
                 or hasattr(estimator, 'score'))):
        # NOTE: the original message read "should a be an"; fixed grammar.
        raise TypeError("estimator should be an estimator implementing"
                        " 'fit' and 'predict' or 'score' methods,"
                        " %s (type %s) was passed" %
                        (estimator, type(estimator)))
    if scoring is None and loss_func is None and score_func is None:
        if not hasattr(estimator, 'score'):
            raise TypeError(
                "If no scoring is specified, the estimator passed "
                "should have a 'score' method. The estimator %s "
                "does not." % estimator)
|
||
|
||
class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, | ||
MetaEstimatorMixin)): | ||
"""Base class for hyper parameter search with cross-validation.""" | ||
|
@@ -351,7 +389,8 @@ def __init__(self, estimator, scoring=None, loss_func=None, | |
self.cv = cv | ||
self.verbose = verbose | ||
self.pre_dispatch = pre_dispatch | ||
self._check_estimator() | ||
_check_scorable(self.estimator, scoring=self.scoring, | ||
loss_func=self.loss_func, score_func=self.score_func) | ||
|
||
def score(self, X, y=None): | ||
"""Returns the score on the given test data and labels, if the search | ||
|
@@ -396,24 +435,7 @@ def decision_function(self): | |
@property | ||
def transform(self): | ||
return self.best_estimator_.transform | ||
|
||
def _check_estimator(self): | ||
"""Check that estimator can be fitted and score can be computed.""" | ||
if (not hasattr(self.estimator, 'fit') or | ||
not (hasattr(self.estimator, 'predict') | ||
or hasattr(self.estimator, 'score'))): | ||
raise TypeError("estimator should a be an estimator implementing" | ||
" 'fit' and 'predict' or 'score' methods," | ||
" %s (type %s) was passed" % | ||
(self.estimator, type(self.estimator))) | ||
if (self.scoring is None and self.loss_func is None and self.score_func | ||
is None): | ||
if not hasattr(self.estimator, 'score'): | ||
raise TypeError( | ||
"If no scoring is specified, the estimator passed " | ||
"should have a 'score' method. The estimator %s " | ||
"does not." % self.estimator) | ||
|
||
|
||
def _fit(self, X, y, parameter_iterable): | ||
"""Actual fitting, performing the search over parameters.""" | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not entirely convinced that it is good to use this function here instead of writing the `if` and defining the function only in `learning_curve`, but I guess it is a matter of style and I don't have a strong opinion.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I prefer this version. The other version would require handling the unsupervised case twice.