From 0ff07a0ed9c8888c46a19f95a6d77143d7def62d Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Tue, 31 Dec 2013 11:32:08 +0100 Subject: [PATCH 01/30] Add prototype for learning curve --- sklearn/learning_curve.py | 66 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 sklearn/learning_curve.py diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py new file mode 100644 index 0000000000000..93d724fefdf4b --- /dev/null +++ b/sklearn/learning_curve.py @@ -0,0 +1,66 @@ +import numpy as np +from sklearn import clone # TODO relative import +from sklearn.cross_validation import KFold # TODO relative import +from sklearn.externals.joblib import Parallel, delayed # TODO relative import +#from .metrics import _deprecate_loss_and_score_funcs # TODO relative import + +def learning_curve(estimator, X, y, n_samples_range, + loss_func=None, scoring='accuracy', n_cv_folds=10, n_jobs=1, + verbose=False, random_state=None): + n_max_samples = np.max(n_samples_range) + n_samples = X.shape[0] + n_required_samples = int(n_max_samples * (1 + 1.0 / n_cv_folds)) + if n_samples < n_required_samples: + # TODO test case + raise ValueError( + "For %d-fold cross-validation with %d training examples, " + "%d samples are required (got %d)." + % (n_cv_folds, n_max_samples, n_required_samples, n_samples)) + + # TODO check if scoring is possible (see BaseGridSearch) + + #scorer = _deprecate_loss_and_score_funcs(loss_func=loss_func, scoring=scoring) # TODO enable with relative import + from sklearn.metrics import accuracy_score + scorer = accuracy_score + + scores = [] + for n_train_samples in n_samples_range: + # TODO maybe take random indices instead of the first slice_length? + slice_length = int(n_train_samples * (1 + 1.0 / n_cv_folds)) + cv = KFold(n=slice_length, n_folds=n_cv_folds, + random_state=random_state) + + out = Parallel( + n_jobs=n_jobs, verbose=verbose)( # TODO set pre_dispatch parameter? 
+ delayed(fit_estimator)(estimator, X, y, train, test, scorer, verbose) + for train, test in cv) + scores.append(np.mean(out, axis=0)) + scores = np.array(scores) + print scores + return scores[:, 0], scores[:, 1] + +def fit_estimator(base_estimator, X, y, train, test, scorer, verbose): + estimator = clone(base_estimator) + estimator.fit(X[train], y[train]) + y_test_pred = estimator.predict(X[test]) + y_train_pred = estimator.predict(X[train]) + train_score = scorer(y[train], y_train_pred) + test_score = scorer(y[test], y_test_pred) + return train_score, test_score + +if __name__ == "__main__": + #from sklearn.linear_model import RidgeClassifier + #estimator = RidgeClassifier(alpha=100) + from sklearn.svm import SVC + estimator = SVC(gamma=0.001) + + from sklearn.datasets import load_digits + digits = load_digits() + X, y = digits.data, digits.target + + n_samples_range = np.arange(10, 1611, 100) + train_scores, test_scores = learning_curve(estimator, X, y, n_samples_range, n_jobs=2, verbose=False) + import pylab + pylab.plot(n_samples_range, train_scores) + pylab.plot(n_samples_range, test_scores) + pylab.show() From be6e185fe115a7c188d9bf55c578003ecc1bfa27 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 02:16:24 +0100 Subject: [PATCH 02/30] Less complicated conditions --- sklearn/grid_search.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 12cc5ce9af25b..f4d2888f1829c 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -261,21 +261,21 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, X_train = X[safe_mask(X, train)] X_test = X[safe_mask(X, test)] - if y is not None: + if y is None: + clf.fit(X_train, **fit_params) + if scorer is None: + this_score = clf.score(X_test) + else: + this_score = scorer(clf, X_test) + else: y_test = y[safe_mask(y, test)] y_train = y[safe_mask(y, train)] clf.fit(X_train, y_train, **fit_params) - if scorer is not None: - this_score = scorer(clf, X_test, y_test) - else: + if scorer is None: this_score = clf.score(X_test, y_test) - else: - clf.fit(X_train, **fit_params) - if scorer is not None: - this_score = scorer(clf, X_test) else: - this_score = clf.score(X_test) + this_score = scorer(clf, X_test, y_test) if not isinstance(this_score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s)" From 96fc07854d973ab475ef702e50aa39164fd98c25 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 10:25:57 +0100 Subject: [PATCH 03/30] Clean up learning curve, separate example --- examples/plot_learning_curve.py | 21 +++++++ sklearn/learning_curve.py | 98 +++++++++++++++++++-------------- 2 files changed, 77 insertions(+), 42 deletions(-) create mode 100644 examples/plot_learning_curve.py diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py new file mode 100644 index 0000000000000..53da810b04cad --- /dev/null +++ b/examples/plot_learning_curve.py @@ -0,0 +1,21 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.naive_bayes import GaussianNB +from sklearn.datasets import load_digits +from sklearn.learning_curve import learning_curve # TODO should be: from sklearn import learning_curve + +if __name__ == "__main__": + estimator = GaussianNB() + digits = load_digits() + X, y = digits.data, digits.target + + n_samples_range, train_scores, test_scores = learning_curve( + estimator, X, y, step_size=50, n_jobs=4, verbose=False) + + 
plt.title("Learning Curves (Naive Bayes on Digits Dataset)") + plt.xlabel("Training examples") + plt.ylabel("Score (Accuracy)") + plt.plot(n_samples_range, train_scores, label="Training score") + plt.plot(n_samples_range, test_scores, label="Cross-validation score") + plt.legend(loc="best") + plt.show() diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 93d724fefdf4b..752284d8eb3f8 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -1,66 +1,80 @@ import numpy as np -from sklearn import clone # TODO relative import -from sklearn.cross_validation import KFold # TODO relative import -from sklearn.externals.joblib import Parallel, delayed # TODO relative import -#from .metrics import _deprecate_loss_and_score_funcs # TODO relative import +from .base import clone +from .cross_validation import KFold +from .externals.joblib import Parallel, delayed +from .metrics.scorer import _deprecate_loss_and_score_funcs + +def learning_curve(estimator, X, y, n_samples_range=None, step_size=1, + n_cv_folds=10, loss_func=None, scoring=None, + n_jobs=1, verbose=False, random_state=None): + # TODO tests, doc + # TODO allow y to be None for unsupervised learning + # TODO test different n_cv_folds / dataset sizes / etc. (there could be bugs) + # TODO there is a huge overlap with grid search -> refactoring + # TODO exploit incremental learning?! (might be a bit complicated with CV) -def learning_curve(estimator, X, y, n_samples_range, - loss_func=None, scoring='accuracy', n_cv_folds=10, n_jobs=1, - verbose=False, random_state=None): - n_max_samples = np.max(n_samples_range) n_samples = X.shape[0] - n_required_samples = int(n_max_samples * (1 + 1.0 / n_cv_folds)) + max_fold_size = n_samples / n_cv_folds + + if n_samples_range is None: + if step_size is None or step_size < 1: + raise ValueError("Define either a range of training set sizes or " + "a proper step size.") + n_samples_range = np.arange(n_cv_folds-1, n_samples-max_fold_size+1, + step_size) + + n_max_samples = np.max(n_samples_range) + n_required_samples = n_max_samples + max_fold_size if n_samples < n_required_samples: - # TODO test case raise ValueError( "For %d-fold cross-validation with %d training examples, " "%d samples are required (got %d)." % (n_cv_folds, n_max_samples, n_required_samples, n_samples)) - # TODO check if scoring is possible (see BaseGridSearch) - - #scorer = _deprecate_loss_and_score_funcs(loss_func=loss_func, scoring=scoring) # TODO enable with relative import - from sklearn.metrics import accuracy_score - scorer = accuracy_score + # TODO copied from BaseGridSearch -> move to utils? .base? where? + if (not hasattr(estimator, 'fit') or + not (hasattr(estimator, 'predict') + or hasattr(estimator, 'score'))): + raise TypeError("estimator should a be an estimator implementing" + " 'fit' and 'predict' or 'score' methods," + " %s (type %s) was passed" % + (estimator, type(estimator))) + if scoring is None and loss_func is None: + if not hasattr(estimator, 'score'): + raise TypeError( + "If no scoring is specified, the estimator passed " + "should have a 'score' method. The estimator %s " + "does not." % estimator) + scorer = _deprecate_loss_and_score_funcs(loss_func=loss_func, + scoring=scoring) scores = [] for n_train_samples in n_samples_range: # TODO maybe take random indices instead of the first slice_length? 
-        slice_length = int(n_train_samples * (1 + 1.0 / n_cv_folds))
+        fold_size = (n_train_samples+1) / n_cv_folds
+        slice_length = n_train_samples + fold_size
         cv = KFold(n=slice_length, n_folds=n_cv_folds,
                    random_state=random_state)

         out = Parallel(
-            n_jobs=n_jobs, verbose=verbose)( # TODO set pre_dispatch parameter?
-            delayed(fit_estimator)(estimator, X, y, train, test, scorer, verbose)
+            # TODO set pre_dispatch parameter? what is it good for?
+            n_jobs=n_jobs, verbose=verbose)(
+            delayed(_fit_estimator)(estimator, X, y, train, test, scorer,
+                                    verbose)
             for train, test in cv)
         scores.append(np.mean(out, axis=0))
     scores = np.array(scores)
-    print scores
-    return scores[:, 0], scores[:, 1]

-def fit_estimator(base_estimator, X, y, train, test, scorer, verbose):
+    return n_samples_range, scores[:, 0], scores[:, 1]
+
+def _fit_estimator(base_estimator, X, y, train, test, scorer, verbose):
+    # TODO similar to fit_grid_point from grid search, refactor
     estimator = clone(base_estimator)
     estimator.fit(X[train], y[train])
-    y_test_pred = estimator.predict(X[test])
-    y_train_pred = estimator.predict(X[train])
-    train_score = scorer(y[train], y_train_pred)
-    test_score = scorer(y[test], y_test_pred)
+    if scorer is None:
+        train_score = estimator.score(X[train], y[train])
+        test_score = estimator.score(X[test], y[test])
+    else:
+        train_score = scorer(estimator, X[train], y[train])
+        test_score = scorer(estimator, X[test], y[test])
     return train_score, test_score
-
-if __name__ == "__main__":
-    #from sklearn.linear_model import RidgeClassifier
-    #estimator = RidgeClassifier(alpha=100)
-    from sklearn.svm import SVC
-    estimator = SVC(gamma=0.001)
-
-    from sklearn.datasets import load_digits
-    digits = load_digits()
-    X, y = digits.data, digits.target
-
-    n_samples_range = np.arange(10, 1611, 100)
-    train_scores, test_scores = learning_curve(estimator, X, y, n_samples_range, n_jobs=2, verbose=False)
-    import pylab
-    pylab.plot(n_samples_range, train_scores)
-    pylab.plot(n_samples_range, test_scores)
-    pylab.show()
From 2f6373d1a960c6a0ba2a02319a7c9f9f914395e4 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Wed, 1 Jan 2014 15:29:43 +0100
Subject: [PATCH 04/30] Address some of jnothman's comments

* use check_cv(...)
to allow different cross validation strategies * allow fractional sample ranges * fix training and test set (with subslices of training set) --- examples/plot_learning_curve.py | 5 ++- sklearn/learning_curve.py | 76 ++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 53da810b04cad..117f4c76f05ce 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -2,6 +2,7 @@ import matplotlib.pyplot as plt from sklearn.naive_bayes import GaussianNB from sklearn.datasets import load_digits +from sklearn.cross_validation import KFold from sklearn.learning_curve import learning_curve # TODO should be: from sklearn import learning_curve if __name__ == "__main__": @@ -10,11 +11,11 @@ X, y = digits.data, digits.target n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, step_size=50, n_jobs=4, verbose=False) + estimator, X, y, cv=KFold(n=X.shape[0], n_folds=10), n_jobs=4, verbose=False) plt.title("Learning Curves (Naive Bayes on Digits Dataset)") plt.xlabel("Training examples") - plt.ylabel("Score (Accuracy)") + plt.ylabel("Score") plt.plot(n_samples_range, train_scores, label="Training score") plt.plot(n_samples_range, test_scores, label="Cross-validation score") plt.legend(loc="best") diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 752284d8eb3f8..8934e21b7e316 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -1,35 +1,49 @@ import numpy as np -from .base import clone -from .cross_validation import KFold +from .base import is_classifier, clone +from .cross_validation import _check_cv +from .utils import check_arrays from .externals.joblib import Parallel, delayed from .metrics.scorer import _deprecate_loss_and_score_funcs -def learning_curve(estimator, X, y, n_samples_range=None, step_size=1, - n_cv_folds=10, loss_func=None, scoring=None, +def learning_curve(estimator, X, y, + n_samples_range=np.arange(0.1, 1.1, 0.1), cv=None, scoring=None, n_jobs=1, verbose=False, random_state=None): + """ TODO document me + Parameters + ---------- + n_samples_range : array-like with dtype float or int, + If the dtype is float, it is regarded as a fraction of n_samples, i.e. it has to be within ]0, 1]. + """ # TODO tests, doc # TODO allow y to be None for unsupervised learning - # TODO test different n_cv_folds / dataset sizes / etc. (there could be bugs) - # TODO there is a huge overlap with grid search -> refactoring - # TODO exploit incremental learning?! (might be a bit complicated with CV) + # TODO there is an overlap with grid search -> refactoring + # TODO exploit incremental learning - n_samples = X.shape[0] - max_fold_size = n_samples / n_cv_folds + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) + # Make a list since we will be iterating multiple times over the folds + cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) - if n_samples_range is None: - if step_size is None or step_size < 1: - raise ValueError("Define either a range of training set sizes or " - "a proper step size.") - n_samples_range = np.arange(n_cv_folds-1, n_samples-max_fold_size+1, - step_size) - - n_max_samples = np.max(n_samples_range) - n_required_samples = n_max_samples + max_fold_size - if n_samples < n_required_samples: - raise ValueError( - "For %d-fold cross-validation with %d training examples, " - "%d samples are required (got %d)." 
- % (n_cv_folds, n_max_samples, n_required_samples, n_samples)) + # Determine range of number of training samples + n_max_training_samples = cv[0][0].shape[0] + n_samples_range = np.asarray(n_samples_range) + n_min_required_samples = np.min(n_samples_range) + n_max_required_samples = np.max(n_samples_range) + if np.issubdtype(n_samples_range.dtype, float): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError("n_samples_range must be within ]0, 1], " + "but is within [%f, %f]." + % (n_min_required_samples, + n_max_required_samples)) + n_samples_range = (n_samples_range * + n_max_training_samples).astype(np.int) + else: + if (n_min_required_samples <= 0 or + n_max_required_samples > n_max_samples): + raise ValueError("n_samples_range must be within ]0, %d], " + "but is within [%d, %d]." + % (n_max_samples, + n_min_required_samples, + n_max_required_samples)) # TODO copied from BaseGridSearch -> move to utils? .base? where? if (not hasattr(estimator, 'fit') or @@ -39,28 +53,22 @@ def learning_curve(estimator, X, y, n_samples_range=None, step_size=1, " 'fit' and 'predict' or 'score' methods," " %s (type %s) was passed" % (estimator, type(estimator))) - if scoring is None and loss_func is None: + if scoring is None: if not hasattr(estimator, 'score'): raise TypeError( "If no scoring is specified, the estimator passed " "should have a 'score' method. The estimator %s " "does not." % estimator) - scorer = _deprecate_loss_and_score_funcs(loss_func=loss_func, - scoring=scoring) + scorer = _deprecate_loss_and_score_funcs(scoring=scoring) scores = [] for n_train_samples in n_samples_range: - # TODO maybe take random indices instead of the first slice_length? - fold_size = (n_train_samples+1) / n_cv_folds - slice_length = n_train_samples + fold_size - cv = KFold(n=slice_length, n_folds=n_cv_folds, - random_state=random_state) - out = Parallel( # TODO set pre_dispatch parameter? what is it good for? n_jobs=n_jobs, verbose=verbose)( - delayed(_fit_estimator)(estimator, X, y, train, test, scorer, - verbose) + delayed(_fit_estimator)( + estimator, X, y, train[:n_train_samples], test, scorer, + verbose) for train, test in cv) scores.append(np.mean(out, axis=0)) scores = np.array(scores) From 811838a69c287a4e48b938476178f614fd6a1472 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 15:53:55 +0100 Subject: [PATCH 05/30] Fit different training set sizes in parallel --- sklearn/learning_curve.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 8934e21b7e316..61953721937bd 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -6,7 +6,7 @@ from .metrics.scorer import _deprecate_loss_and_score_funcs def learning_curve(estimator, X, y, - n_samples_range=np.arange(0.1, 1.1, 0.1), cv=None, scoring=None, + n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, n_jobs=1, verbose=False, random_state=None): """ TODO document me Parameters @@ -28,7 +28,7 @@ def learning_curve(estimator, X, y, n_samples_range = np.asarray(n_samples_range) n_min_required_samples = np.min(n_samples_range) n_max_required_samples = np.max(n_samples_range) - if np.issubdtype(n_samples_range.dtype, float): + if np.issubdtype(n_samples_range.dtype, np.float): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: raise ValueError("n_samples_range must be within ]0, 1], " "but is within [%f, %f]." 
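# Editor's note -- an illustrative sketch, not part of the original patch:
# with the new np.linspace default, fractional ticks are later scaled
# against the available training set size, e.g. assuming 1000 training
# samples:
#
#     import numpy as np
#     fracs = np.linspace(0.1, 1.0, 10)   # new default n_samples_range
#     (fracs * 1000).astype(np.int)
#     # -> array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000])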
@@ -61,19 +61,22 @@ def learning_curve(estimator, X, y, "does not." % estimator) scorer = _deprecate_loss_and_score_funcs(scoring=scoring) - scores = [] - for n_train_samples in n_samples_range: - out = Parallel( - # TODO set pre_dispatch parameter? what is it good for? - n_jobs=n_jobs, verbose=verbose)( - delayed(_fit_estimator)( - estimator, X, y, train[:n_train_samples], test, scorer, - verbose) - for train, test in cv) - scores.append(np.mean(out, axis=0)) - scores = np.array(scores) + out = Parallel( + # TODO use pre_dispatch parameter? what is it good for? + n_jobs=n_jobs, verbose=verbose)( + delayed(_fit_estimator)( + estimator, X, y, train[:n_train_samples], test, scorer, + verbose) + for train, test in cv for n_train_samples in n_samples_range) - return n_samples_range, scores[:, 0], scores[:, 1] + out = np.asarray(out) + train_scores = np.zeros(n_samples_range.shape, dtype=np.float) + test_scores = np.zeros(n_samples_range.shape, dtype=np.float) + for i, n_train_samples in enumerate(n_samples_range): + res_indices = np.where(out[:, 0] == n_train_samples) + train_scores[i], test_scores[i] = out[res_indices[0], 1:].mean(axis=0) + + return n_samples_range, train_scores, test_scores def _fit_estimator(base_estimator, X, y, train, test, scorer, verbose): # TODO similar to fit_grid_point from grid search, refactor @@ -85,4 +88,4 @@ def _fit_estimator(base_estimator, X, y, train, test, scorer, verbose): else: train_score = scorer(estimator, X[train], y[train]) test_score = scorer(estimator, X[test], y[test]) - return train_score, test_score + return train.shape[0], train_score, test_score From 0972b2ed95c7b601e25a1b8505f81bd3b3ae945b Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 16:09:09 +0100 Subject: [PATCH 06/30] FIX: Use correct variable name --- sklearn/learning_curve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 61953721937bd..2941d90c7f111 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -38,10 +38,10 @@ def learning_curve(estimator, X, y, n_max_training_samples).astype(np.int) else: if (n_min_required_samples <= 0 or - n_max_required_samples > n_max_samples): + n_max_required_samples > n_max_training_samples): raise ValueError("n_samples_range must be within ]0, %d], " "but is within [%d, %d]." 
- % (n_max_samples, + % (n_max_training_samples, n_min_required_samples, n_max_required_samples)) From 30fd1c0d6c9175c64c9a43fe7e46f17429c83320 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 16:16:46 +0100 Subject: [PATCH 07/30] Simplify example --- examples/plot_learning_curve.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 117f4c76f05ce..1e6e7a34a5797 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -2,7 +2,6 @@ import matplotlib.pyplot as plt from sklearn.naive_bayes import GaussianNB from sklearn.datasets import load_digits -from sklearn.cross_validation import KFold from sklearn.learning_curve import learning_curve # TODO should be: from sklearn import learning_curve if __name__ == "__main__": @@ -11,7 +10,7 @@ X, y = digits.data, digits.target n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, cv=KFold(n=X.shape[0], n_folds=10), n_jobs=4, verbose=False) + estimator, X, y, cv=10, n_jobs=4, verbose=False) plt.title("Learning Curves (Naive Bayes on Digits Dataset)") plt.xlabel("Training examples") From a0b11902f1f6d51ae8c2e6c5b5731f676323beb9 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 16:41:39 +0100 Subject: [PATCH 08/30] Add interface documentation --- sklearn/learning_curve.py | 50 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 2941d90c7f111..a53c7a245ebd8 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -7,17 +7,61 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, - n_jobs=1, verbose=False, random_state=None): + n_jobs=1, verbose=0): """ TODO document me + Parameters ---------- - n_samples_range : array-like with dtype float or int, - If the dtype is float, it is regarded as a fraction of n_samples, i.e. it has to be within ]0, 1]. + estimator : object type that implements the "fit" and "predict" methods + An object of that type is instantiated for each validation. + + X : array-like, shape = [n_samples, n_features] + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + + n_samples_range : array-like, shape = [n_ticks,], dtype float or int + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of n_samples, i.e. it has to be within ]0, 1]. + (default: np.linspace(0.1, 1.0, 10)) + + cv : integer, cross-validation generator or None, optional, default: None + If an integer is passed, it is the number of folds (default 3). + Specific cross-validation objects can be passed, see + sklearn.cross_validation module for the list of possible objects + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + n_jobs : integer, optional + Number of jobs to run in parallel (default 1). + + verbose : integer, optional + Controls the verbosity: the higher, the more messages. 
+ + Returns + ------- + n_samples_range : array, shape = [n_ticks,], dtype int + Numbers of training examples that has been used to generate the + learning curve. + + train_scores : array, shape = [n_ticks,] + Scores on training sets. + + test_scores : array, shape = [n_ticks,] + Scores on test set. """ # TODO tests, doc # TODO allow y to be None for unsupervised learning # TODO there is an overlap with grid search -> refactoring # TODO exploit incremental learning + # TODO use verbose argument X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) # Make a list since we will be iterating multiple times over the folds From ef23d627ee811e73f9a9e29ec5db5f7e0ca482da Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 21:52:06 +0100 Subject: [PATCH 09/30] Introduce function _check_scorable() --- sklearn/grid_search.py | 35 ++++++++++++++++++++--------------- sklearn/learning_curve.py | 18 +++--------------- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index f4d2888f1829c..b002059b6cae2 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -331,6 +331,24 @@ def __repr__(self): self.parameters) +def _check_scorable(estimator, scoring=None, loss_func=None, score_func=None): + """Check that estimator can be fitted and score can be computed.""" + if (not hasattr(estimator, 'fit') or + not (hasattr(estimator, 'predict') + or hasattr(estimator, 'score'))): + raise TypeError("estimator should a be an estimator implementing" + " 'fit' and 'predict' or 'score' methods," + " %s (type %s) was passed" % + (estimator, type(estimator))) + if (scoring is None and loss_func is None and score_func + is None): + if not hasattr(estimator, 'score'): + raise TypeError( + "If no scoring is specified, the estimator passed " + "should have a 'score' method. The estimator %s " + "does not." % estimator) + + class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, MetaEstimatorMixin)): """Base class for hyper parameter search with cross-validation.""" @@ -398,21 +416,8 @@ def transform(self): return self.best_estimator_.transform def _check_estimator(self): - """Check that estimator can be fitted and score can be computed.""" - if (not hasattr(self.estimator, 'fit') or - not (hasattr(self.estimator, 'predict') - or hasattr(self.estimator, 'score'))): - raise TypeError("estimator should a be an estimator implementing" - " 'fit' and 'predict' or 'score' methods," - " %s (type %s) was passed" % - (self.estimator, type(self.estimator))) - if (self.scoring is None and self.loss_func is None and self.score_func - is None): - if not hasattr(self.estimator, 'score'): - raise TypeError( - "If no scoring is specified, the estimator passed " - "should have a 'score' method. The estimator %s " - "does not." 
% self.estimator) + _check_scorable(self.estimator, scoring=self.scoring, + loss_func=self.loss_func, score_func=self.score_func) def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index a53c7a245ebd8..7a0fff07259b0 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -4,6 +4,7 @@ from .utils import check_arrays from .externals.joblib import Parallel, delayed from .metrics.scorer import _deprecate_loss_and_score_funcs +from .grid_search import _check_scorable def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, @@ -89,21 +90,8 @@ def learning_curve(estimator, X, y, n_min_required_samples, n_max_required_samples)) - # TODO copied from BaseGridSearch -> move to utils? .base? where? - if (not hasattr(estimator, 'fit') or - not (hasattr(estimator, 'predict') - or hasattr(estimator, 'score'))): - raise TypeError("estimator should a be an estimator implementing" - " 'fit' and 'predict' or 'score' methods," - " %s (type %s) was passed" % - (estimator, type(estimator))) - if scoring is None: - if not hasattr(estimator, 'score'): - raise TypeError( - "If no scoring is specified, the estimator passed " - "should have a 'score' method. The estimator %s " - "does not." % estimator) - scorer = _deprecate_loss_and_score_funcs(scoring=scoring) + _check_scorable(estimator, scoring=scoring) + scorer = _deprecate_loss_and_score_funcs(scoring=scoring) out = Parallel( # TODO use pre_dispatch parameter? what is it good for? From 5fccd17cd162f64093b772123fceeb0548bf6ee5 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 22:51:49 +0100 Subject: [PATCH 10/30] Introduce function _split_and_score --- sklearn/grid_search.py | 79 +++++++++++++++++++++++++++------------ sklearn/learning_curve.py | 23 +++++------- 2 files changed, 64 insertions(+), 38 deletions(-) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index b002059b6cae2..e5be710f35ec1 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -235,9 +235,26 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, for k, v in parameters.items())) print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')) + this_score, n_test_samples = _split_and_score( + base_estimator, X, y, parameters, train, test, scorer, + return_train_score=False, **fit_params) + + if verbose > 2: + msg += ", score=%f" % this_score + if verbose > 1: + end_msg = "%s -%s" % (msg, + logger.short_format_time(time.time() - + start_time)) + print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) + + return this_score, parameters, n_test_samples + + +def _split_and_score(base_estimator, X, y, parameters, train, test, scorer, + return_train_score=False, **fit_params): # update parameters of the classifier after a copy of its base structure - clf = clone(base_estimator) - clf.set_params(**parameters) + estimator = clone(base_estimator) + estimator.set_params(**parameters) if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel): # cannot compute the kernel values with custom function @@ -261,34 +278,48 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, X_train = X[safe_mask(X, train)] X_test = X[safe_mask(X, test)] - if y is None: - clf.fit(X_train, **fit_params) - if scorer is None: - this_score = clf.score(X_test) - else: - this_score = scorer(clf, X_test) - else: + if y is not None: y_test = 
y[safe_mask(y, test)] y_train = y[safe_mask(y, train)] - clf.fit(X_train, y_train, **fit_params) + else: + y_test = None + y_train = None - if scorer is None: - this_score = clf.score(X_test, y_test) - else: - this_score = scorer(clf, X_test, y_test) + _fit(estimator, X_train, y_train, **fit_params) + test_score = _score(estimator, X_test, y_test, scorer) - if not isinstance(this_score, numbers.Number): + if not isinstance(test_score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s)" - " instead." % (str(this_score), type(this_score))) + " instead." % (str(test_score), type(test_score))) - if verbose > 2: - msg += ", score=%f" % this_score - if verbose > 1: - end_msg = "%s -%s" % (msg, - logger.short_format_time(time.time() - - start_time)) - print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - return this_score, parameters, _num_samples(X_test) + if return_train_score: + train_score = _score(estimator, X_train, y_train, scorer) + return (test_score, _num_samples(X_test), train_score, + _num_samples(X_train)) + else: + return test_score, _num_samples(X_test) + + +def _fit(estimator, X_train, y_train, **fit_params): + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + + +def _score(estimator, X_test, y_test, scorer): + if y_test is None: + if scorer is None: + this_score = estimator.score(X_test) + else: + this_score = scorer(estimator, X_test) + else: + if scorer is None: + this_score = estimator.score(X_test, y_test) + else: + this_score = scorer(estimator, X_test, y_test) + + return this_score def _check_param_grid(param_grid): diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 7a0fff07259b0..3dcc49dc0ed23 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -4,7 +4,7 @@ from .utils import check_arrays from .externals.joblib import Parallel, delayed from .metrics.scorer import _deprecate_loss_and_score_funcs -from .grid_search import _check_scorable +from .grid_search import _check_scorable, _split_and_score def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, @@ -97,8 +97,8 @@ def learning_curve(estimator, X, y, # TODO use pre_dispatch parameter? what is it good for? 
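# Editor's note (answers the TODO above; not part of the original patch):
# joblib's pre_dispatch limits how many tasks are dispatched ahead of the
# workers (e.g. pre_dispatch='2*n_jobs'); bounding it caps memory use
# because each queued task holds a reference to its arguments.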
n_jobs=n_jobs, verbose=verbose)( delayed(_fit_estimator)( - estimator, X, y, train[:n_train_samples], test, scorer, - verbose) + estimator, X, y, train, test, n_train_samples, + scorer, verbose) for train, test in cv for n_train_samples in n_samples_range) out = np.asarray(out) @@ -110,14 +110,9 @@ def learning_curve(estimator, X, y, return n_samples_range, train_scores, test_scores -def _fit_estimator(base_estimator, X, y, train, test, scorer, verbose): - # TODO similar to fit_grid_point from grid search, refactor - estimator = clone(base_estimator) - estimator.fit(X[train], y[train]) - if scorer is None: - train_score = estimator.score(X[train], y[train]) - test_score = estimator.score(X[test], y[test]) - else: - train_score = scorer(estimator, X[train], y[train]) - test_score = scorer(estimator, X[test], y[test]) - return train.shape[0], train_score, test_score +def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, + scorer, verbose): + test_score, _, train_score, _ = _split_and_score( + base_estimator, X, y, parameters={}, train=train[:n_train_samples], + test=test, scorer=scorer, return_train_score=True) + return n_train_samples, train_score, test_score From b3cb14e7f1fadbbb40ea36f5c084c2adf6f0ecb9 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 23:13:43 +0100 Subject: [PATCH 11/30] Clean up * remove duplicate numbers of training samples * get rid of loop * change ]0, 1] to (0, 1] --- sklearn/learning_curve.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 3dcc49dc0ed23..7a7fb7dadb314 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -27,7 +27,7 @@ def learning_curve(estimator, X, y, n_samples_range : array-like, shape = [n_ticks,], dtype float or int Numbers of training examples that will be used to generate the learning curve. If the dtype is float, it is regarded as a - fraction of n_samples, i.e. it has to be within ]0, 1]. + fraction of n_samples, i.e. it has to be within (0, 1]. (default: np.linspace(0.1, 1.0, 10)) cv : integer, cross-validation generator or None, optional, default: None @@ -48,9 +48,10 @@ def learning_curve(estimator, X, y, Returns ------- - n_samples_range : array, shape = [n_ticks,], dtype int + n_samples_range : array, shape = [n_unique_ticks,], dtype int Numbers of training examples that has been used to generate the - learning curve. + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. train_scores : array, shape = [n_ticks,] Scores on training sets. @@ -59,8 +60,6 @@ def learning_curve(estimator, X, y, Scores on test set. """ # TODO tests, doc - # TODO allow y to be None for unsupervised learning - # TODO there is an overlap with grid search -> refactoring # TODO exploit incremental learning # TODO use verbose argument @@ -79,8 +78,8 @@ def learning_curve(estimator, X, y, "but is within [%f, %f]." 
% (n_min_required_samples, n_max_required_samples)) - n_samples_range = (n_samples_range * - n_max_training_samples).astype(np.int) + n_samples_range = np.unique((n_samples_range * + n_max_training_samples).astype(np.int)) else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): @@ -99,20 +98,19 @@ def learning_curve(estimator, X, y, delayed(_fit_estimator)( estimator, X, y, train, test, n_train_samples, scorer, verbose) - for train, test in cv for n_train_samples in n_samples_range) + for n_train_samples in n_samples_range for train, test in cv) - out = np.asarray(out) - train_scores = np.zeros(n_samples_range.shape, dtype=np.float) - test_scores = np.zeros(n_samples_range.shape, dtype=np.float) - for i, n_train_samples in enumerate(n_samples_range): - res_indices = np.where(out[:, 0] == n_train_samples) - train_scores[i], test_scores[i] = out[res_indices[0], 1:].mean(axis=0) + out = np.array(out) + n_unique_ticks = n_samples_range.shape[0] + n_cv_folds = out.shape[0]/n_unique_ticks + out = out.reshape(n_unique_ticks, n_cv_folds, 2) + avg_over_cv = out.mean(axis=1).reshape(n_unique_ticks, 2) - return n_samples_range, train_scores, test_scores + return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1] def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, scorer, verbose): test_score, _, train_score, _ = _split_and_score( base_estimator, X, y, parameters={}, train=train[:n_train_samples], test=test, scorer=scorer, return_train_score=True) - return n_train_samples, train_score, test_score + return train_score, test_score From d3e52f635bd00d56192780f5d4bdb42c3745cb5a Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Wed, 1 Jan 2014 23:44:44 +0100 Subject: [PATCH 12/30] Add parameter for incremental learning (without impl.) --- sklearn/learning_curve.py | 53 +++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 7a7fb7dadb314..3af56dfd165f7 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -6,8 +6,8 @@ from .metrics.scorer import _deprecate_loss_and_score_funcs from .grid_search import _check_scorable, _split_and_score -def learning_curve(estimator, X, y, - n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, +def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), + cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=1, verbose=0): """ TODO document me @@ -40,6 +40,10 @@ def learning_curve(estimator, X, y, a scorer callable object / function with signature ``scorer(estimator, X, y)``. + exploit_incremental_learning : boolean, optional, default: False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. + n_jobs : integer, optional Number of jobs to run in parallel (default 1). @@ -60,13 +64,16 @@ def learning_curve(estimator, X, y, Scores on test set. 
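    Examples
    --------
    (Editor's addition: a usage sketch based on
    examples/plot_learning_curve.py, not part of the original patch.)

    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.datasets import load_digits
    >>> digits = load_digits()
    >>> X, y = digits.data, digits.target
    >>> sizes, train_scores, test_scores = learning_curve(
    ...     GaussianNB(), X, y, cv=10)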
""" # TODO tests, doc - # TODO exploit incremental learning # TODO use verbose argument X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) # Make a list since we will be iterating multiple times over the folds cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) + if exploit_incremental_learning and not hasattr(estimator, 'partial_fit'): + raise ValueError('An estimator must support the partial_fit interface ' + 'to exploit incremental learning') + # Determine range of number of training samples n_max_training_samples = cv[0][0].shape[0] n_samples_range = np.asarray(n_samples_range) @@ -80,6 +87,10 @@ def learning_curve(estimator, X, y, n_max_required_samples)) n_samples_range = np.unique((n_samples_range * n_max_training_samples).astype(np.int)) + # TODO we could + # - print a warning + # - *, inverse = np.unique(*, return_inverse=True); return np.take(., inverse) + # if there are duplicate elements else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): @@ -92,25 +103,29 @@ def learning_curve(estimator, X, y, _check_scorable(estimator, scoring=scoring) scorer = _deprecate_loss_and_score_funcs(scoring=scoring) - out = Parallel( - # TODO use pre_dispatch parameter? what is it good for? - n_jobs=n_jobs, verbose=verbose)( - delayed(_fit_estimator)( - estimator, X, y, train, test, n_train_samples, - scorer, verbose) - for n_train_samples in n_samples_range for train, test in cv) - - out = np.array(out) - n_unique_ticks = n_samples_range.shape[0] - n_cv_folds = out.shape[0]/n_unique_ticks - out = out.reshape(n_unique_ticks, n_cv_folds, 2) - avg_over_cv = out.mean(axis=1).reshape(n_unique_ticks, 2) - - return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1] + if exploit_incremental_learning: + # TODO exploit incremental learning + pass + else: + out = Parallel( + # TODO use pre_dispatch parameter? what is it good for? 
+ n_jobs=n_jobs, verbose=verbose)( + delayed(_fit_estimator)( + estimator, X, y, train, test, n_train_samples, + scorer, verbose) + for n_train_samples in n_samples_range for train, test in cv) + + out = np.array(out) + n_unique_ticks = n_samples_range.shape[0] + n_cv_folds = out.shape[0]/n_unique_ticks + out = out.reshape(n_unique_ticks, n_cv_folds, 2) + avg_over_cv = out.mean(axis=1).reshape(n_unique_ticks, 2) + + return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1] def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, scorer, verbose): test_score, _, train_score, _ = _split_and_score( - base_estimator, X, y, parameters={}, train=train[:n_train_samples], + base_estimator, X, y, parameters={}, train=train[:n_train_samples], # TODO slice does not work for booleans, slice after indexing test=test, scorer=scorer, return_train_score=True) return train_score, test_score From 08a0b492ba584474b22db43a7704df7563d756b5 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Thu, 2 Jan 2014 00:08:40 +0100 Subject: [PATCH 13/30] Fix for boolean indices from cv generator --- sklearn/learning_curve.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 3af56dfd165f7..6dcbac8c7be0d 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -81,7 +81,7 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), n_max_required_samples = np.max(n_samples_range) if np.issubdtype(n_samples_range.dtype, np.float): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("n_samples_range must be within ]0, 1], " + raise ValueError("n_samples_range must be within (0, 1], " "but is within [%f, %f]." % (n_min_required_samples, n_max_required_samples)) @@ -94,7 +94,7 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): - raise ValueError("n_samples_range must be within ]0, %d], " + raise ValueError("n_samples_range must be within (0, %d], " "but is within [%d, %d]." 
% (n_max_training_samples, n_min_required_samples, @@ -125,7 +125,10 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, scorer, verbose): + # HACK as long as boolean indices are allowed in cv generators + if train.dtype == np.bool: + train = np.nonzero(train) test_score, _, train_score, _ = _split_and_score( - base_estimator, X, y, parameters={}, train=train[:n_train_samples], # TODO slice does not work for booleans, slice after indexing + base_estimator, X, y, parameters={}, train=train[:n_train_samples], test=test, scorer=scorer, return_train_score=True) return train_score, test_score From cd41b0923e18fb4dd5ce652452180a9ef665dc00 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Thu, 2 Jan 2014 15:31:08 +0100 Subject: [PATCH 14/30] Add tests --- examples/plot_learning_curve.py | 2 +- sklearn/grid_search.py | 3 +- sklearn/learning_curve.py | 79 +++++++++++++++----------- sklearn/tests/test_learning_curve.py | 84 ++++++++++++++++++++++++++++ 4 files changed, 134 insertions(+), 34 deletions(-) create mode 100644 sklearn/tests/test_learning_curve.py diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 1e6e7a34a5797..12602e8ec613b 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt from sklearn.naive_bayes import GaussianNB from sklearn.datasets import load_digits -from sklearn.learning_curve import learning_curve # TODO should be: from sklearn import learning_curve +from sklearn.learning_curve import learning_curve if __name__ == "__main__": estimator = GaussianNB() diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index e5be710f35ec1..631f77a9a8c09 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -254,7 +254,8 @@ def _split_and_score(base_estimator, X, y, parameters, train, test, scorer, return_train_score=False, **fit_params): # update parameters of the classifier after a copy of its base structure estimator = clone(base_estimator) - estimator.set_params(**parameters) + if len(parameters) > 0: + estimator.set_params(**parameters) if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel): # cannot compute the kernel values with custom function diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 6dcbac8c7be0d..bd6f5c5405eb4 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -1,4 +1,5 @@ import numpy as np +import warnings from .base import is_classifier, clone from .cross_validation import _check_cv from .utils import check_arrays @@ -9,7 +10,10 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=1, verbose=0): - """ TODO document me + """Learning curve + + Determines cross-validated training and test scores for different training + set sizes. Parameters ---------- @@ -63,49 +67,26 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), test_scores : array, shape = [n_ticks,] Scores on test set. 
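    Examples
    --------
    (Editor's addition, not part of the original patch: a sketch of how
    fractional ticks are translated by _translate_n_samples_range below;
    mirrors test_remove_multiple_sample_sizes with 2 available training
    samples.)

    >>> import numpy as np
    >>> ticks = (np.linspace(0.33, 1.0, 3) * 2).astype(np.int)
    >>> np.unique(np.clip(ticks, 1, 2))  # duplicates merged -> RuntimeWarning
    array([1, 2])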
""" - # TODO tests, doc # TODO use verbose argument - X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) - # Make a list since we will be iterating multiple times over the folds - cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) - if exploit_incremental_learning and not hasattr(estimator, 'partial_fit'): raise ValueError('An estimator must support the partial_fit interface ' 'to exploit incremental learning') - # Determine range of number of training samples + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) + # Make a list since we will be iterating multiple times over the folds + cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) + n_max_training_samples = cv[0][0].shape[0] - n_samples_range = np.asarray(n_samples_range) - n_min_required_samples = np.min(n_samples_range) - n_max_required_samples = np.max(n_samples_range) - if np.issubdtype(n_samples_range.dtype, np.float): - if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("n_samples_range must be within (0, 1], " - "but is within [%f, %f]." - % (n_min_required_samples, - n_max_required_samples)) - n_samples_range = np.unique((n_samples_range * - n_max_training_samples).astype(np.int)) - # TODO we could - # - print a warning - # - *, inverse = np.unique(*, return_inverse=True); return np.take(., inverse) - # if there are duplicate elements - else: - if (n_min_required_samples <= 0 or - n_max_required_samples > n_max_training_samples): - raise ValueError("n_samples_range must be within (0, %d], " - "but is within [%d, %d]." - % (n_max_training_samples, - n_min_required_samples, - n_max_required_samples)) + n_samples_range, n_unique_ticks = _translate_n_samples_range( + n_samples_range, n_max_training_samples) _check_scorable(estimator, scoring=scoring) scorer = _deprecate_loss_and_score_funcs(scoring=scoring) if exploit_incremental_learning: + raise NotImplemented("Incremental learning is not supported yet") # TODO exploit incremental learning - pass else: out = Parallel( # TODO use pre_dispatch parameter? what is it good for? @@ -116,13 +97,47 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), for n_train_samples in n_samples_range for train, test in cv) out = np.array(out) - n_unique_ticks = n_samples_range.shape[0] n_cv_folds = out.shape[0]/n_unique_ticks out = out.reshape(n_unique_ticks, n_cv_folds, 2) avg_over_cv = out.mean(axis=1).reshape(n_unique_ticks, 2) return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1] + +def _translate_n_samples_range(n_samples_range, n_max_training_samples): + """Determine range of number of training samples""" + n_samples_range = np.asarray(n_samples_range) + n_ticks = n_samples_range.shape[0] + n_min_required_samples = np.min(n_samples_range) + n_max_required_samples = np.max(n_samples_range) + if np.issubdtype(n_samples_range.dtype, np.float): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError("n_samples_range must be within (0, 1], " + "but is within [%f, %f]." + % (n_min_required_samples, + n_max_required_samples)) + n_samples_range = (n_samples_range * n_max_training_samples + ).astype(np.int) + n_samples_range = np.clip(n_samples_range, 1, n_max_training_samples) + else: + if (n_min_required_samples <= 0 or + n_max_required_samples > n_max_training_samples): + raise ValueError("n_samples_range must be within (0, %d], " + "but is within [%d, %d]." 
+                             % (n_max_training_samples,
+                                n_min_required_samples,
+                                n_max_required_samples))
+
+    n_samples_range = np.unique(n_samples_range)
+    n_unique_ticks = n_samples_range.shape[0]
+    if n_ticks > n_unique_ticks:
+        warnings.warn("Number of ticks will be less than the size of "
+                      "'n_samples_range' (%d instead of %d)."
+                      % (n_unique_ticks, n_ticks), RuntimeWarning)
+
+    return n_samples_range, n_unique_ticks
+
+
 def _fit_estimator(base_estimator, X, y, train, test, n_train_samples,
                    scorer, verbose):
     # HACK as long as boolean indices are allowed in cv generators
diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py
new file mode 100644
index 0000000000000..607bbefd6c07e
--- /dev/null
+++ b/sklearn/tests/test_learning_curve.py
@@ -0,0 +1,84 @@
+import numpy as np
+from sklearn.learning_curve import learning_curve
+from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_warns
+from sklearn.utils.testing import assert_array_equal
+from sklearn.utils.testing import assert_array_almost_equal
+from sklearn.datasets import make_classification
+from sklearn.svm import SVC
+
+class MockImprovingClassifier(object):
+    """Dummy classifier to test the learning curve"""
+    def __init__(self, n_max_train_samples):
+        self.n_max_train_samples = n_max_train_samples
+        self.n_train_samples = 0
+
+    def fit(self, X_subset, y_subset):
+        self.X_subset = X_subset
+        self.y_subset = y_subset
+        self.n_train_samples = X_subset.shape[0]
+        return self
+
+    def predict(self, X):
+        raise NotImplemented
+
+    def score(self, X=None, Y=None):
+        # training score becomes worse (2 -> 1), test error better (0 -> 1)
+        if X is self.X_subset:
+            return 2. - float(self.n_train_samples) / self.n_max_train_samples
+        else:
+            return float(self.n_train_samples) / self.n_max_train_samples
+
+    def get_params(self, deep=False):
+        return {"n_max_train_samples" : self.n_max_train_samples}
+
+    def set_params(self, **params):
+        self.n_max_train_samples = params["n_max_train_samples"]
+        return self
+
+
+def test_learning_curve():
+    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
+                               n_redundant=0, n_classes=2,
+                               n_clusters_per_class=1, random_state=0)
+    estimator = MockImprovingClassifier(20)
+    n_samples_range, train_scores, test_scores = learning_curve(estimator,
+                                                                X, y, cv=3)
+    assert_array_equal(n_samples_range, np.linspace(2, 20, 10))
+    assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10))
+    assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10))
+
+
+def test_incremental_learning_not_possible():
+    X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
+                               n_redundant=0, n_classes=2,
+                               n_clusters_per_class=1, random_state=0)
+    # The mockup does not have partial_fit()
+    estimator = MockImprovingClassifier(1)
+    assert_raises(ValueError, learning_curve, estimator, X, y,
+                  exploit_incremental_learning=True)
+
+
+def test_n_sample_range_out_of_bounds():
+    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
+                               n_redundant=0, n_classes=2,
+                               n_clusters_per_class=1, random_state=0)
+    estimator = MockImprovingClassifier(20)
+    assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
+                  n_samples_range=[0.0, 1.0])
+    assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
+                  n_samples_range=[0.1, 1.1])
+    assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
+                  n_samples_range=[0, 20])
+    assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
+                  n_samples_range=[1, 21])
+
+def 
test_remove_multiple_sample_sizes(): + X, y = make_classification(n_samples=3, n_features=1, n_informative=1, + n_redundant=0, n_classes=2, + n_clusters_per_class=1, random_state=0) + estimator = MockImprovingClassifier(2) + n_samples_range, _, _ = assert_warns(RuntimeWarning, + learning_curve, estimator, X, y, cv=3, + n_samples_range=np.linspace(0.33, 1.0, 3)) + assert_array_equal(n_samples_range, [1, 2]) From 9fc5f3b2c8b1f3914e7bdf270205ee019cd7b70d Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Thu, 2 Jan 2014 16:58:44 +0100 Subject: [PATCH 15/30] Exploit incremental learning --- examples/plot_learning_curve.py | 9 +++-- sklearn/grid_search.py | 46 ++++++++++++++++------ sklearn/learning_curve.py | 58 ++++++++++++++++++++-------- sklearn/tests/test_learning_curve.py | 2 +- 4 files changed, 82 insertions(+), 33 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 12602e8ec613b..568ba4970a0c5 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -1,18 +1,19 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn.naive_bayes import GaussianNB +from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.datasets import load_digits from sklearn.learning_curve import learning_curve if __name__ == "__main__": - estimator = GaussianNB() + estimator = PassiveAggressiveClassifier(n_iter=1) digits = load_digits() X, y = digits.data, digits.target n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, cv=10, n_jobs=4, verbose=False) + estimator, X, y, cv=10, exploit_incremental_learning=False, + n_jobs=4, verbose=False) - plt.title("Learning Curves (Naive Bayes on Digits Dataset)") + plt.title("Learning Curves (Passive-Aggressive Classifier on Digits)") plt.xlabel("Training examples") plt.ylabel("Score") plt.plot(n_samples_range, train_scores, label="Training score") diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 631f77a9a8c09..f82373c2fcfe0 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -235,8 +235,12 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, for k, v in parameters.items())) print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')) + # update parameters of the classifier after a copy of its base structure + estimator = clone(base_estimator) + estimator.set_params(**parameters) + this_score, n_test_samples = _split_and_score( - base_estimator, X, y, parameters, train, test, scorer, + estimator, X, y, train, test, scorer, return_train_score=False, **fit_params) if verbose > 2: @@ -250,43 +254,53 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, return this_score, parameters, n_test_samples -def _split_and_score(base_estimator, X, y, parameters, train, test, scorer, - return_train_score=False, **fit_params): - # update parameters of the classifier after a copy of its base structure - estimator = clone(base_estimator) - if len(parameters) > 0: - estimator.set_params(**parameters) - - if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel): +def _split_and_score(estimator, X, y, train, test, scorer, + return_train_score=False, partial_train=None, + **fit_params): + if hasattr(estimator, 'kernel') and callable(estimator.kernel): # cannot compute the kernel values with custom function raise ValueError("Cannot use a custom kernel function. 
" "Precompute the kernel matrix instead.") if not hasattr(X, "shape"): - if getattr(base_estimator, "_pairwise", False): + if getattr(estimator, "_pairwise", False): raise ValueError("Precomputed kernels or affinity matrices have " "to be passed as arrays or sparse matrices.") X_train = [X[idx] for idx in train] X_test = [X[idx] for idx in test] + if partial_train is not None: + X_partial_train = [X[idx] for idx in partial_train] else: - if getattr(base_estimator, "_pairwise", False): + if getattr(estimator, "_pairwise", False): # X is a precomputed square kernel matrix if X.shape[0] != X.shape[1]: raise ValueError("X should be a square kernel matrix") X_train = X[np.ix_(train, train)] X_test = X[np.ix_(test, train)] + if partial_train is not None: + X_partial_train = X[np.ix_(partial_train, partial_train)] else: X_train = X[safe_mask(X, train)] X_test = X[safe_mask(X, test)] + if partial_train is not None: + X_partial_train = X[safe_mask(X, partial_train)] if y is not None: y_test = y[safe_mask(y, test)] y_train = y[safe_mask(y, train)] + if partial_train is not None: + y_partial_train = y[safe_mask(y, partial_train)] else: y_test = None y_train = None + if partial_train is not None: + y_partial_train = None - _fit(estimator, X_train, y_train, **fit_params) + if partial_train is None: + _fit(estimator, X_train, y_train, **fit_params) + else: + _fit_incremental(estimator, X_partial_train, y_partial_train, + **fit_params) test_score = _score(estimator, X_test, y_test, scorer) if not isinstance(test_score, numbers.Number): @@ -308,6 +322,14 @@ def _fit(estimator, X_train, y_train, **fit_params): estimator.fit(X_train, y_train, **fit_params) +def _fit_incremental(estimator, X_partial_train, y_partial_train, + **fit_params): + if y_partial_train is None: + estimator.partial_fit(X_partial_train, **fit_params) + else: + estimator.partial_fit(X_partial_train, y_partial_train, **fit_params) + + def _score(estimator, X_test, y_test, scorer): if y_test is None: if scorer is None: diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index bd6f5c5405eb4..bc870fca383dd 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -84,24 +84,29 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), _check_scorable(estimator, scoring=scoring) scorer = _deprecate_loss_and_score_funcs(scoring=scoring) + parallel = Parallel(n_jobs=n_jobs, verbose=verbose) if exploit_incremental_learning: - raise NotImplemented("Incremental learning is not supported yet") - # TODO exploit incremental learning + if is_classifier(estimator): + classes = np.unique(y) + else: + classes = None + out = parallel(delayed(_incremental_fit_estimator)( + estimator, X, y, classes, train, test, + n_samples_range, scorer, verbose) + for train, test in cv) else: - out = Parallel( - # TODO use pre_dispatch parameter? what is it good for? 
- n_jobs=n_jobs, verbose=verbose)( - delayed(_fit_estimator)( - estimator, X, y, train, test, n_train_samples, - scorer, verbose) - for n_train_samples in n_samples_range for train, test in cv) - + out = parallel(delayed(_fit_estimator)( + estimator, X, y, train, test, n_train_samples, + scorer, verbose) + for train, test in cv + for n_train_samples in n_samples_range) out = np.array(out) n_cv_folds = out.shape[0]/n_unique_ticks - out = out.reshape(n_unique_ticks, n_cv_folds, 2) - avg_over_cv = out.mean(axis=1).reshape(n_unique_ticks, 2) + out = out.reshape(n_cv_folds, n_unique_ticks, 2) + + avg_over_cv = np.asarray(out).mean(axis=0).reshape(n_unique_ticks, 2) - return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1] + return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1] def _translate_n_samples_range(n_samples_range, n_max_training_samples): @@ -138,12 +143,33 @@ def _translate_n_samples_range(n_samples_range, n_max_training_samples): return n_samples_range, n_unique_ticks -def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, - scorer, verbose): +def _fit_estimator(base_estimator, X, y, train, test, + n_train_samples, scorer, verbose): # HACK as long as boolean indices are allowed in cv generators if train.dtype == np.bool: train = np.nonzero(train) + + estimator = clone(base_estimator) test_score, _, train_score, _ = _split_and_score( - base_estimator, X, y, parameters={}, train=train[:n_train_samples], + estimator, X, y, train=train[:n_train_samples], test=test, scorer=scorer, return_train_score=True) return train_score, test_score + +def _incremental_fit_estimator(base_estimator, X, y, classes, train, test, + n_samples_range, scorer, verbose): + # HACK as long as boolean indices are allowed in cv generators + if train.dtype == np.bool: + train = np.nonzero(train) + + estimator = clone(base_estimator) + train_scores, test_scores = [], [] + for n_train_samples, partial_train in zip(n_samples_range, np.split(train, + n_samples_range[:-1])): + test_score, _, train_score, _ = _split_and_score( + estimator, X, y, train=train[:n_train_samples], + partial_train=partial_train, test=test, scorer=scorer, + return_train_score=True, classes=classes) + train_scores.append(train_score) + test_scores.append(test_score) + #return np.zeros((2, n_samples_range.shape[0])) + return np.array((train_scores, test_scores)).T diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 607bbefd6c07e..73463d7968eab 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -20,7 +20,7 @@ def fit(self, X_subset, y_subset): return self def predict(self, X): - raise NotImplemented + raise NotImplementedError def score(self, X=None, Y=None): # training score becomes worse (2 -> 1), test error better (0 -> 1) From 6a516b423f72de31efd11de8baded3d54628bd2b Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Thu, 2 Jan 2014 17:22:55 +0100 Subject: [PATCH 16/30] Test incremental learning --- sklearn/tests/test_learning_curve.py | 34 ++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 73463d7968eab..ebbd4c88afbe1 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -12,10 +12,10 @@ class MockImprovingClassifier(object): def __init__(self, n_max_train_samples): self.n_max_train_samples = n_max_train_samples self.n_train_samples = 0 + self.X_subset = None def 
fit(self, X_subset, y_subset): self.X_subset = X_subset - self.y_subset = y_subset self.n_train_samples = X_subset.shape[0] return self @@ -24,11 +24,14 @@ def predict(self, X): def score(self, X=None, Y=None): # training score becomes worse (2 -> 1), test error better (0 -> 1) - if X is self.X_subset: + if self._is_training_data(X): return 2. - float(self.n_train_samples) / self.n_max_train_samples else: return float(self.n_train_samples) / self.n_max_train_samples + def _is_training_data(self, X): + return X is self.X_subset + def get_params(self, deep=False): return {"n_max_train_samples" : self.n_max_train_samples} @@ -37,6 +40,21 @@ def set_params(self, **params): return self +class MockIncrementalImprovingClassifier(MockImprovingClassifier): + """Dummy classifier that provides partial_fit""" + def __init__(self, n_max_train_samples): + super(MockIncrementalImprovingClassifier, self).__init__( + n_max_train_samples) + self.x = None + + def _is_training_data(self, X): + return self.x in X + + def partial_fit(self, X, y, **params): + self.n_train_samples += X.shape[0] + self.x = X[0] + + def test_learning_curve(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, @@ -59,6 +77,18 @@ def test_incremental_learning_not_possible(): exploit_incremental_learning=True) +def test_incremental_learning(): + X, y = make_classification(n_samples=30, n_features=1, n_informative=1, + n_redundant=0, n_classes=2, + n_clusters_per_class=1, random_state=0) + estimator = MockIncrementalImprovingClassifier(20) + n_samples_range, train_scores, test_scores = learning_curve( + estimator, X, y, cv=3, exploit_incremental_learning=True) + assert_array_equal(n_samples_range, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) + + def test_n_sample_range_out_of_bounds(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, From 39dcb689d446475fda4f5f0723270035ef35b15a Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Thu, 2 Jan 2014 18:04:12 +0100 Subject: [PATCH 17/30] Improve test coverage --- sklearn/learning_curve.py | 17 ++++++++--------- sklearn/tests/test_learning_curve.py | 20 ++++++++++++++++++-- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index bc870fca383dd..db2e117adcdcf 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -77,7 +77,14 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), # Make a list since we will be iterating multiple times over the folds cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) - n_max_training_samples = cv[0][0].shape[0] + # HACK as long as boolean indices are allowed in cv generators + if cv[0][0].dtype == bool: + new_cv = [] + for i in range(len(cv)): + new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0])) + cv = new_cv + + n_max_training_samples = len(cv[0][0]) n_samples_range, n_unique_ticks = _translate_n_samples_range( n_samples_range, n_max_training_samples) @@ -145,10 +152,6 @@ def _translate_n_samples_range(n_samples_range, n_max_training_samples): def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, scorer, verbose): - # HACK as long as boolean indices are allowed in cv generators - if train.dtype == np.bool: - train = np.nonzero(train) - estimator = clone(base_estimator) test_score, _, 
train_score, _ = _split_and_score( estimator, X, y, train=train[:n_train_samples], @@ -157,10 +160,6 @@ def _fit_estimator(base_estimator, X, y, train, test, def _incremental_fit_estimator(base_estimator, X, y, classes, train, test, n_samples_range, scorer, verbose): - # HACK as long as boolean indices are allowed in cv generators - if train.dtype == np.bool: - train = np.nonzero(train) - estimator = clone(base_estimator) train_scores, test_scores = [], [] for n_train_samples, partial_train in zip(n_samples_range, np.split(train, diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index ebbd4c88afbe1..9cc20fe381317 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -5,7 +5,8 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.datasets import make_classification -from sklearn.svm import SVC +from sklearn.cross_validation import KFold + class MockImprovingClassifier(object): """Dummy classifier to test the learning curve""" @@ -103,7 +104,8 @@ def test_n_sample_range_out_of_bounds(): assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, n_samples_range=[1, 21]) -def test_remove_multiple_sample_sizes(): + +def test_remove_duplicate_sample_sizes(): X, y = make_classification(n_samples=3, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) @@ -112,3 +114,17 @@ def test_remove_multiple_sample_sizes(): learning_curve, estimator, X, y, cv=3, n_samples_range=np.linspace(0.33, 1.0, 3)) assert_array_equal(n_samples_range, [1, 2]) + + +def test_learning_curve_with_boolean_indices(): + X, y = make_classification(n_samples=30, n_features=1, n_informative=1, + n_redundant=0, n_classes=2, + n_clusters_per_class=1, random_state=0) + estimator = MockImprovingClassifier(20) + cv = KFold(n=30, n_folds=3, indices=False) + n_samples_range, train_scores, test_scores = learning_curve(estimator, + X, y, cv=cv) + assert_array_equal(n_samples_range, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) + From c42cae69cbda467c6510ebea6d3c71ae78c79f72 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Fri, 3 Jan 2014 12:11:29 +0100 Subject: [PATCH 18/30] FIX batch learning and incremental learning have equal results --- examples/plot_learning_curve.py | 15 +++++++++++---- sklearn/learning_curve.py | 4 ++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 568ba4970a0c5..9a928c97d8d11 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -9,14 +9,21 @@ digits = load_digits() X, y = digits.data, digits.target - n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, cv=10, exploit_incremental_learning=False, - n_jobs=4, verbose=False) - plt.title("Learning Curves (Passive-Aggressive Classifier on Digits)") plt.xlabel("Training examples") plt.ylabel("Score") + + n_samples_range, train_scores, test_scores = learning_curve( + estimator, X, y, cv=10, exploit_incremental_learning=False, + n_jobs=1, verbose=False) plt.plot(n_samples_range, train_scores, label="Training score") plt.plot(n_samples_range, test_scores, label="Cross-validation score") + + n_samples_range, train_scores, test_scores = learning_curve( + estimator, X, y, cv=10, 
exploit_incremental_learning=True, + n_jobs=1, verbose=False) + plt.plot(n_samples_range, train_scores, label="Training score") + plt.plot(n_samples_range, test_scores, label="Cross-validation score") + plt.legend(loc="best") plt.show() diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index db2e117adcdcf..0ecd5d0b21b7e 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -158,17 +158,17 @@ def _fit_estimator(base_estimator, X, y, train, test, test=test, scorer=scorer, return_train_score=True) return train_score, test_score + def _incremental_fit_estimator(base_estimator, X, y, classes, train, test, n_samples_range, scorer, verbose): estimator = clone(base_estimator) train_scores, test_scores = [], [] for n_train_samples, partial_train in zip(n_samples_range, np.split(train, - n_samples_range[:-1])): + n_samples_range)[:-1]): test_score, _, train_score, _ = _split_and_score( estimator, X, y, train=train[:n_train_samples], partial_train=partial_train, test=test, scorer=scorer, return_train_score=True, classes=classes) train_scores.append(train_score) test_scores.append(test_score) - #return np.zeros((2, n_samples_range.shape[0])) return np.array((train_scores, test_scores)).T From 9dd060160acb5cbb56657d47e18d96c93e279631 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Fri, 3 Jan 2014 12:24:02 +0100 Subject: [PATCH 19/30] Test that the results of batch/online learning are equal --- examples/plot_learning_curve.py | 6 ------ sklearn/learning_curve.py | 3 +++ sklearn/tests/test_learning_curve.py | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 9a928c97d8d11..8faaaac4ef95e 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -13,12 +13,6 @@ plt.xlabel("Training examples") plt.ylabel("Score") - n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, cv=10, exploit_incremental_learning=False, - n_jobs=1, verbose=False) - plt.plot(n_samples_range, train_scores, label="Training score") - plt.plot(n_samples_range, test_scores, label="Cross-validation score") - n_samples_range, train_scores, test_scores = learning_curve( estimator, X, y, cv=10, exploit_incremental_learning=True, n_jobs=1, verbose=False) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 0ecd5d0b21b7e..1f858688dea88 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -87,6 +87,9 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), n_max_training_samples = len(cv[0][0]) n_samples_range, n_unique_ticks = _translate_n_samples_range( n_samples_range, n_max_training_samples) + # Because the lengths of folds can be significantly different, it is + # not guaranteed that we use all of the available training data when we + # use the first 'n_max_training_samples' samples. 
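# A minimal sketch (made-up numbers, not part of any patch) of why the
# np.split fix in PATCH 18 above matters. A CV training fold can be longer
# than the largest tick, because n_max_training_samples is taken from the
# first fold only:
import numpy as np

train = np.arange(12)                         # indices of one training fold
n_samples_range = np.array([2, 4, 6, 8, 10])  # absolute tick sizes

# np.split cuts before each listed position; the surplus indices 10 and 11
# land in a trailing chunk that [:-1] discards.
batches = np.split(train, n_samples_range)[:-1]
# -> [0 1], [2 3], [4 5], [6 7], [8 9]

# Feeding these batches to partial_fit one at a time trains on exactly the
# first n_train_samples indices at every tick, matching the batch variant
# that refits on train[:n_train_samples] from scratch. With the old
# np.split(train, n_samples_range[:-1]), the last batch would have been
# [8 9 10 11], silently training on 12 samples while reporting 10.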
_check_scorable(estimator, scoring=scoring)
     scorer = _deprecate_loss_and_score_funcs(scoring=scoring)
 
diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py
index 9cc20fe381317..218648ed77de3 100644
--- a/sklearn/tests/test_learning_curve.py
+++ b/sklearn/tests/test_learning_curve.py
@@ -6,6 +6,7 @@
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.datasets import make_classification
 from sklearn.cross_validation import KFold
+from sklearn.linear_model import PassiveAggressiveClassifier
 
 
 class MockImprovingClassifier(object):
@@ -90,6 +91,21 @@ def test_incremental_learning():
     assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10))
 
 
+def test_batch_and_incremental_learning_are_equal():
+    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
+                               n_redundant=0, n_classes=2,
+                               n_clusters_per_class=1, random_state=0)
+    estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)
+    n_samples_range_inc, train_scores_inc, test_scores_inc = learning_curve(
+        estimator, X, y, cv=3, exploit_incremental_learning=True)
+    n_samples_range_batch, train_scores_batch, test_scores_batch = \
+        learning_curve(estimator, X, y, cv=3,
+                       exploit_incremental_learning=False)
+    assert_array_equal(n_samples_range_inc, n_samples_range_batch)
+    assert_array_almost_equal(train_scores_inc, train_scores_batch)
+    assert_array_almost_equal(test_scores_inc, test_scores_batch)
+
+
 def test_n_sample_range_out_of_bounds():
     X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                                n_redundant=0, n_classes=2,

From cec31e2436a351ccbf8274184710421c3e3a79cb Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Sat, 4 Jan 2014 18:09:32 +0100
Subject: [PATCH 20/30] FIX comparison of batch and incremental learning

The minimum number of training examples has to be big enough to contain
samples from each class.
---
 sklearn/learning_curve.py            |  5 ++++-
 sklearn/tests/test_learning_curve.py | 14 ++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
index 1f858688dea88..41f70ace9b846 100644
--- a/sklearn/learning_curve.py
+++ b/sklearn/learning_curve.py
@@ -31,7 +31,10 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10),
     n_samples_range : array-like, shape = [n_ticks,], dtype float or int
         Numbers of training examples that will be used to generate the
         learning curve. If the dtype is float, it is regarded as a
-        fraction of n_samples, i.e. it has to be within (0, 1].
+        fraction of the maximum size of the training set (that is determined
+        by the selected validation method), i.e. it has to be within (0, 1].
+        Note that for classification the number of samples usually has to
+        be big enough to contain at least one sample from each class.
(default: np.linspace(0.1, 1.0, 10)) cv : integer, cross-validation generator or None, optional, default: None diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 218648ed77de3..58416061d039c 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -95,12 +95,18 @@ def test_batch_and_incremental_learning_are_equal(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) + n_samples_range = np.linspace(0.2, 1.0, 5) estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False) - n_samples_range_inc, train_scores_inc, test_scores_inc = learning_curve( - estimator, X, y, cv=3, exploit_incremental_learning=True) + + n_samples_range_inc, train_scores_inc, test_scores_inc = \ + learning_curve( + estimator, X, y, n_samples_range=n_samples_range, + cv=3, exploit_incremental_learning=True) n_samples_range_batch, train_scores_batch, test_scores_batch = \ - learning_curve(estimator, X, y, cv=3, - exploit_incremental_learning=False) + learning_curve( + estimator, X, y, cv=3, n_samples_range=n_samples_range, + exploit_incremental_learning=False) + assert_array_equal(n_samples_range_inc, n_samples_range_batch) assert_array_almost_equal(train_scores_inc, train_scores_batch) assert_array_almost_equal(test_scores_inc, test_scores_batch) From a64ef4e4bf5e773f068759f44f837cbfe6914ae8 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sat, 4 Jan 2014 18:22:46 +0100 Subject: [PATCH 21/30] Add pre_dispatch parameter --- sklearn/learning_curve.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 41f70ace9b846..2bb915cd4c156 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -9,7 +9,7 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, exploit_incremental_learning=False, - n_jobs=1, verbose=0): + n_jobs=1, pre_dispatch=None, verbose=0): """Learning curve Determines cross-validated training and test scores for different training @@ -54,6 +54,11 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), n_jobs : integer, optional Number of jobs to run in parallel (default 1). + pre_dispatch : integer or string, optional + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The string can + be an expression like '2*n_jobs'. + verbose : integer, optional Controls the verbosity: the higher, the more messages. @@ -70,7 +75,6 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), test_scores : array, shape = [n_ticks,] Scores on test set. 
""" - # TODO use verbose argument if exploit_incremental_learning and not hasattr(estimator, 'partial_fit'): raise ValueError('An estimator must support the partial_fit interface ' @@ -97,7 +101,8 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), _check_scorable(estimator, scoring=scoring) scorer = _deprecate_loss_and_score_funcs(scoring=scoring) - parallel = Parallel(n_jobs=n_jobs, verbose=verbose) + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, + verbose=verbose) if exploit_incremental_learning: if is_classifier(estimator): classes = np.unique(y) From 36be74dff24c761cae74644ca2780eef74057631 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sat, 4 Jan 2014 19:08:34 +0100 Subject: [PATCH 22/30] Use parameter 'verbose' --- examples/plot_learning_curve.py | 2 +- sklearn/learning_curve.py | 10 +++++---- sklearn/tests/test_learning_curve.py | 32 ++++++++++++++++++++++------ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 8faaaac4ef95e..b977a687598ae 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -15,7 +15,7 @@ n_samples_range, train_scores, test_scores = learning_curve( estimator, X, y, cv=10, exploit_incremental_learning=True, - n_jobs=1, verbose=False) + n_jobs=1, verbose=1) plt.plot(n_samples_range, train_scores, label="Training score") plt.plot(n_samples_range, test_scores, label="Cross-validation score") diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 2bb915cd4c156..be2e27839b873 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -9,7 +9,7 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, exploit_incremental_learning=False, - n_jobs=1, pre_dispatch=None, verbose=0): + n_jobs=1, pre_dispatch="all", verbose=0): """Learning curve Determines cross-validated training and test scores for different training @@ -76,9 +76,9 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), Scores on test set. """ - if exploit_incremental_learning and not hasattr(estimator, 'partial_fit'): - raise ValueError('An estimator must support the partial_fit interface ' - 'to exploit incremental learning') + if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): + raise ValueError("An estimator must support the partial_fit interface " + "to exploit incremental learning") X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) # Make a list since we will be iterating multiple times over the folds @@ -97,6 +97,8 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), # Because the lengths of folds can be significantly different, it is # not guaranteed that we use all of the available training data when we # use the first 'n_max_training_samples' samples. 
+ if verbose > 0: + print("[learning_curve] Training set sizes: " + str(n_samples_range)) _check_scorable(estimator, scoring=scoring) scorer = _deprecate_loss_and_score_funcs(scoring=scoring) diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 58416061d039c..a9768e1204bda 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -1,3 +1,5 @@ +import sys +from sklearn.externals.six.moves import cStringIO as StringIO import numpy as np from sklearn.learning_curve import learning_curve from sklearn.utils.testing import assert_raises @@ -69,7 +71,26 @@ def test_learning_curve(): assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) -def test_incremental_learning_not_possible(): +def test_learning_curve_verbose(): + X, y = make_classification(n_samples=30, n_features=1, n_informative=1, + n_redundant=0, n_classes=2, + n_clusters_per_class=1, random_state=0) + estimator = MockImprovingClassifier(20) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + n_samples_range, train_scores, test_scores = \ + learning_curve(estimator, X, y, cv=3, verbose=1) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + assert("[learning_curve]" in out) + + +def test_learning_curve_incremental_learning_not_possible(): X, y = make_classification(n_samples=2, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) @@ -79,7 +100,7 @@ def test_incremental_learning_not_possible(): exploit_incremental_learning=True) -def test_incremental_learning(): +def test_learning_curve_incremental_learning(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) @@ -91,7 +112,7 @@ def test_incremental_learning(): assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) -def test_batch_and_incremental_learning_are_equal(): +def test_learning_curve_batch_and_incremental_learning_are_equal(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) @@ -112,7 +133,7 @@ def test_batch_and_incremental_learning_are_equal(): assert_array_almost_equal(test_scores_inc, test_scores_batch) -def test_n_sample_range_out_of_bounds(): +def test_learning_curve_n_sample_range_out_of_bounds(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) @@ -127,7 +148,7 @@ def test_n_sample_range_out_of_bounds(): n_samples_range=[1, 21]) -def test_remove_duplicate_sample_sizes(): +def test_learning_curve_remove_duplicate_sample_sizes(): X, y = make_classification(n_samples=3, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) @@ -149,4 +170,3 @@ def test_learning_curve_with_boolean_indices(): assert_array_equal(n_samples_range, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) - From b9c838d0b1dae7204c60104c41971c16d5d8cce6 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sat, 4 Jan 2014 19:57:11 +0100 Subject: [PATCH 23/30] DOC Add and update documentation * document example * add author and license --- examples/plot_learning_curve.py | 50 +++++++++++++++++++--------- sklearn/learning_curve.py | 16 +++++++-- sklearn/tests/test_learning_curve.py | 4 +++ 3 files 
changed, 52 insertions(+), 18 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index b977a687598ae..7f2d8d87030fe 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -1,23 +1,43 @@ +""" +======================== +Plotting Learning Curves +======================== + +A learning curve shows the validation and training score of a learning +algorithm for varying numbers of training samples. A learning curve +shows how much we benefit from adding more training data. + +In this example, the learning curve of a PassiveAggressiveClassifier +is shown for the digits dataset. Note that the training score and the +cross-validation score are both not very good. However, the of the curve +can be found in more complex datasets very often: the training score +is very high at the beginning and decreases and the cross-validation +score is very low at the beginning and increases. +""" + +# Author: Alexander Fabisch +# +# License: BSD 3 clause + import numpy as np import matplotlib.pyplot as plt -from sklearn.linear_model import PassiveAggressiveClassifier +from sklearn.naive_bayes import GaussianNB from sklearn.datasets import load_digits from sklearn.learning_curve import learning_curve -if __name__ == "__main__": - estimator = PassiveAggressiveClassifier(n_iter=1) - digits = load_digits() - X, y = digits.data, digits.target - plt.title("Learning Curves (Passive-Aggressive Classifier on Digits)") - plt.xlabel("Training examples") - plt.ylabel("Score") +estimator = GaussianNB() +digits = load_digits() +X, y = digits.data, digits.target + +plt.title("Learning Curves (Passive-Aggressive Classifier on Digits)") +plt.xlabel("Training examples") +plt.ylabel("Score") - n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, cv=10, exploit_incremental_learning=True, - n_jobs=1, verbose=1) - plt.plot(n_samples_range, train_scores, label="Training score") - plt.plot(n_samples_range, test_scores, label="Cross-validation score") +n_samples_range, train_scores, test_scores = learning_curve( + estimator, X, y, cv=10, n_jobs=1, verbose=1) +plt.plot(n_samples_range, train_scores, label="Training score") +plt.plot(n_samples_range, test_scores, label="Cross-validation score") - plt.legend(loc="best") - plt.show() +plt.legend(loc="best") +plt.show() diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index be2e27839b873..269dd4522e8b2 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -1,3 +1,7 @@ +# Author: Alexander Fabisch +# +# License: BSD 3 clause + import numpy as np import warnings from .base import is_classifier, clone @@ -15,16 +19,22 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), Determines cross-validated training and test scores for different training set sizes. + A cross-validation generator splits the whole dataset k times in training + and test data. Subsets of the training set with varying sizes will be used + to train the estimator and a score for each training subset size and the + test set will be computed. Afterwards, the scores will be averaged over + all k runs for each training subset size. + Parameters ---------- estimator : object type that implements the "fit" and "predict" methods - An object of that type is instantiated for each validation. + An object of that type which is cloned for each validation. 
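# The behaviour the new docstring describes, as a minimal usage sketch
# (mirroring the example script; at this point in the series the returned
# scores are already averaged over the cv folds):
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from sklearn.learning_curve import learning_curve

digits = load_digits()
X, y = digits.data, digits.target

# ticks default to np.linspace(0.1, 1.0, 10), i.e. fractions of a fold
n_samples_range, train_scores, test_scores = learning_curve(
    GaussianNB(), X, y, cv=10)
# train_scores[i] and test_scores[i] are the means over the 10 folds for
# a training subset of n_samples_range[i] examples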
-    X : array-like, shape = [n_samples, n_features]
+    X : array-like, shape (n_samples, n_features)
         Training vector, where n_samples is the number of samples and
         n_features is the number of features.
 
-    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+    y : array-like, shape (n_samples) or (n_samples, n_output), optional
         Target relative to X for classification or regression;
         None for unsupervised learning.
 
diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py
index a9768e1204bda..9f372e6828fc7 100644
--- a/sklearn/tests/test_learning_curve.py
+++ b/sklearn/tests/test_learning_curve.py
@@ -1,3 +1,7 @@
+# Author: Alexander Fabisch
+#
+# License: BSD 3 clause
+
 import sys
 from sklearn.externals.six.moves import cStringIO as StringIO
 import numpy as np

From ff1aef408db028312c9d12000e7d4b941e1ea1dd Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Sat, 4 Jan 2014 23:57:36 +0100
Subject: [PATCH 24/30] Compare naive bayes with SVM in example

---
 examples/plot_learning_curve.py | 47 +++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py
index 7f2d8d87030fe..b45fc0f6cd6c1 100644
--- a/examples/plot_learning_curve.py
+++ b/examples/plot_learning_curve.py
@@ -4,40 +4,53 @@
 ========================
 
 A learning curve shows the validation and training score of a learning
-algorithm for varying numbers of training samples. A learning curve
-shows how much we benefit from adding more training data.
-
-In this example, the learning curve of a PassiveAggressiveClassifier
-is shown for the digits dataset. Note that the training score and the
-cross-validation score are both not very good. However, the of the curve
-can be found in more complex datasets very often: the training score
-is very high at the beginning and decreases and the cross-validation
-score is very low at the beginning and increases.
+algorithm for varying numbers of training samples. It is a tool to
+find out how much we benefit from adding more training data. If both
+the validation score and the training score converge too a value that is
+too low, we will not benefit much from more training data and we will
+probably have to use a learning algorithm or a parametrization of the
+current learning algorithm with a lower bias.
+
+In this example, on the left side the learning curve of a naive Bayes
+classifier is shown for the digits dataset. Note that the training score
+and the cross-validation score are both not very good at the end. However,
+the shape of the curve can be found in more complex datasets very often:
+the training score is very high at the beginning and decreases and the
+cross-validation score is very low at the beginning and increases. On the
+right side we see the learning curve of an SVM with RBF kernel. We can
+see clearly that the training score is still around the maximum and the
+validation score could be increased with more training samples.
""" -# Author: Alexander Fabisch -# -# License: BSD 3 clause - import numpy as np import matplotlib.pyplot as plt from sklearn.naive_bayes import GaussianNB +from sklearn.svm import SVC from sklearn.datasets import load_digits from sklearn.learning_curve import learning_curve -estimator = GaussianNB() digits = load_digits() X, y = digits.data, digits.target -plt.title("Learning Curves (Passive-Aggressive Classifier on Digits)") +plt.figure() +plt.title("Learning Curve (Naive Bayes)") plt.xlabel("Training examples") plt.ylabel("Score") - n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, cv=10, n_jobs=1, verbose=1) + GaussianNB(), X, y, cv=10, n_jobs=1) plt.plot(n_samples_range, train_scores, label="Training score") plt.plot(n_samples_range, test_scores, label="Cross-validation score") +plt.legend(loc="best") +plt.figure() +plt.title("Learning Curve (SVM, RBF kernel, $\gamma=0.001$)") +plt.xlabel("Training examples") +plt.ylabel("Score") +n_samples_range, train_scores, test_scores = learning_curve( + SVC(gamma=0.001), X, y, cv=10, n_jobs=1) +plt.plot(n_samples_range, train_scores, label="Training score") +plt.plot(n_samples_range, test_scores, label="Cross-validation score") plt.legend(loc="best") + plt.show() From 32832984306397c1473e9429c8745f9146a1e41d Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sun, 5 Jan 2014 19:05:29 +0100 Subject: [PATCH 25/30] Improve documentation * document some private functions * fix documentation of learning curve and example --- examples/plot_learning_curve.py | 3 ++- sklearn/grid_search.py | 19 +++++++++++++++++++ sklearn/learning_curve.py | 9 +++++---- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index b45fc0f6cd6c1..2744667f8128c 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -6,7 +6,7 @@ A learning curve shows the validation and training score of a learning algorithm for varying numbers of training samples. It is a tool to find out how much we benefit from adding more training data. If both -the validation score and the training score converge too a value that is +the validation score and the training score converge to a value that is too low, we will not benefit much from more training data and we will probably have to use a learning algorithm or a parametrization of the current learning algorithm with a lower bias. @@ -21,6 +21,7 @@ see clearly that the training score is still around the maximum and the validation score could be increased with more training samples. """ +print(__doc__) import numpy as np import matplotlib.pyplot as plt diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index f82373c2fcfe0..89e973e040a6f 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -257,6 +257,22 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, def _split_and_score(estimator, X, y, train, test, scorer, return_train_score=False, partial_train=None, **fit_params): + """Split the dataset in training and test set and compute scores. + + The dataset consists of of either a precomputed kernel matrix or an input + matrix 'X' and optional targets 'y' (y can be None for unsupervised + learners). It will be split according to the indices given by 'train' and + 'test'. Usually an estimator will be trained with the whole training set. 
+    However, it is possible to reuse a previously trained model to train it
+    incrementally by passing the indices of the next training subset in
+    'partial_train'. After splitting the dataset and fitting the estimator, the
+    scores will be computed on the test set according to the scoring function
+    of the estimator or with a given 'scorer'. It is possible to return the
+    score on the training set optionally by setting 'return_train_score'. The
+    function will return the score on the test set and the number of samples in
+    the test set and optionally the score on the training set and the number of
+    samples in the training set.
+    """
     if hasattr(estimator, 'kernel') and callable(estimator.kernel):
         # cannot compute the kernel values with custom function
         raise ValueError("Cannot use a custom kernel function. "
@@ -316,6 +332,7 @@ def _split_and_score(estimator, X, y, train, test, scorer,
 
 
 def _fit(estimator, X_train, y_train, **fit_params):
+    """Fit an estimator on a given training set."""
     if y_train is None:
         estimator.fit(X_train, **fit_params)
     else:
@@ -324,6 +341,7 @@ def _fit(estimator, X_train, y_train, **fit_params):
 
 def _fit_incremental(estimator, X_partial_train, y_partial_train,
                      **fit_params):
+    """Fit an estimator incrementally with a given training subset."""
     if y_partial_train is None:
         estimator.partial_fit(X_partial_train, **fit_params)
     else:
@@ -331,6 +349,7 @@ def _fit_incremental(estimator, X_partial_train, y_partial_train,
 
 
 def _score(estimator, X_test, y_test, scorer):
+    """Compute the score of an estimator on a given test set."""
     if y_test is None:
         if scorer is None:
             this_score = estimator.score(X_test)
diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
index 269dd4522e8b2..7b349f54f9cb7 100644
--- a/sklearn/learning_curve.py
+++ b/sklearn/learning_curve.py
@@ -11,6 +11,7 @@
 from .metrics.scorer import _deprecate_loss_and_score_funcs
 from .grid_search import _check_scorable, _split_and_score
 
+
 def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10),
                    cv=None, scoring=None, exploit_incremental_learning=False,
                    n_jobs=1, pre_dispatch="all", verbose=0):
@@ -47,8 +48,8 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10),
         be big enough to contain at least one sample from each class.
         (default: np.linspace(0.1, 1.0, 10))
 
-    cv : integer, cross-validation generator or None, optional, default: None
-        If an integer is passed, it is the number of folds (default 3).
+    cv : integer, cross-validation generator, optional
+        If an integer is passed, it is the number of folds (defaults to 3).
         Specific cross-validation objects can be passed, see
         sklearn.cross_validation module for the list of possible objects
 
@@ -151,8 +152,8 @@ def _translate_n_samples_range(n_samples_range, n_max_training_samples):
                              "but is within [%f, %f]."
% (n_min_required_samples, n_max_required_samples)) - n_samples_range = (n_samples_range * n_max_training_samples - ).astype(np.int) + n_samples_range = (n_samples_range + * n_max_training_samples).astype(np.int) n_samples_range = np.clip(n_samples_range, 1, n_max_training_samples) else: if (n_min_required_samples <= 0 or From 754a10491503deb462d63ba2d4f6aa6d62e65486 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Sun, 5 Jan 2014 19:17:25 +0100 Subject: [PATCH 26/30] pep8 --- examples/plot_learning_curve.py | 1 - sklearn/learning_curve.py | 30 +++++++++++++--------------- sklearn/tests/test_learning_curve.py | 26 ++++++++++++------------ 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 2744667f8128c..2d3527a66fa62 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -23,7 +23,6 @@ """ print(__doc__) -import numpy as np import matplotlib.pyplot as plt from sklearn.naive_bayes import GaussianNB from sklearn.svm import SVC diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 7b349f54f9cb7..a5343c1e4f148 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -104,7 +104,7 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), n_max_training_samples = len(cv[0][0]) n_samples_range, n_unique_ticks = _translate_n_samples_range( - n_samples_range, n_max_training_samples) + n_samples_range, n_max_training_samples) # Because the lengths of folds can be significantly different, it is # not guaranteed that we use all of the available training data when we # use the first 'n_max_training_samples' samples. @@ -122,15 +122,12 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), else: classes = None out = parallel(delayed(_incremental_fit_estimator)( - estimator, X, y, classes, train, test, - n_samples_range, scorer, verbose) - for train, test in cv) + estimator, X, y, classes, train, test, n_samples_range, scorer, + verbose) for train, test in cv) else: out = parallel(delayed(_fit_estimator)( - estimator, X, y, train, test, n_train_samples, - scorer, verbose) - for train, test in cv - for n_train_samples in n_samples_range) + estimator, X, y, train, test, n_train_samples, scorer, verbose) + for train, test in cv for n_train_samples in n_samples_range) out = np.array(out) n_cv_folds = out.shape[0]/n_unique_ticks out = out.reshape(n_cv_folds, n_unique_ticks, 2) @@ -157,7 +154,7 @@ def _translate_n_samples_range(n_samples_range, n_max_training_samples): n_samples_range = np.clip(n_samples_range, 1, n_max_training_samples) else: if (n_min_required_samples <= 0 or - n_max_required_samples > n_max_training_samples): + n_max_required_samples > n_max_training_samples): raise ValueError("n_samples_range must be within (0, %d], " "but is within [%d, %d]." 
% (n_max_training_samples, @@ -178,8 +175,8 @@ def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, scorer, verbose): estimator = clone(base_estimator) test_score, _, train_score, _ = _split_and_score( - estimator, X, y, train=train[:n_train_samples], - test=test, scorer=scorer, return_train_score=True) + estimator, X, y, train=train[:n_train_samples], test=test, + scorer=scorer, return_train_score=True) return train_score, test_score @@ -187,12 +184,13 @@ def _incremental_fit_estimator(base_estimator, X, y, classes, train, test, n_samples_range, scorer, verbose): estimator = clone(base_estimator) train_scores, test_scores = [], [] - for n_train_samples, partial_train in zip(n_samples_range, np.split(train, - n_samples_range)[:-1]): + for n_train_samples, partial_train in zip(n_samples_range, + np.split(train, + n_samples_range)[:-1]): test_score, _, train_score, _ = _split_and_score( - estimator, X, y, train=train[:n_train_samples], - partial_train=partial_train, test=test, scorer=scorer, - return_train_score=True, classes=classes) + estimator, X, y, train=train[:n_train_samples], + partial_train=partial_train, test=test, scorer=scorer, + return_train_score=True, classes=classes) train_scores.append(train_score) test_scores.append(test_score) return np.array((train_scores, test_scores)).T diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 9f372e6828fc7..7b0f23efd3900 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -41,7 +41,7 @@ def _is_training_data(self, X): return X is self.X_subset def get_params(self, deep=False): - return {"n_max_train_samples" : self.n_max_train_samples} + return {"n_max_train_samples": self.n_max_train_samples} def set_params(self, **params): self.n_max_train_samples = params["n_max_train_samples"] @@ -52,7 +52,7 @@ class MockIncrementalImprovingClassifier(MockImprovingClassifier): """Dummy classifier that provides partial_fit""" def __init__(self, n_max_train_samples): super(MockIncrementalImprovingClassifier, self).__init__( - n_max_train_samples) + n_max_train_samples) self.x = None def _is_training_data(self, X): @@ -85,7 +85,7 @@ def test_learning_curve_verbose(): sys.stdout = StringIO() try: n_samples_range, train_scores, test_scores = \ - learning_curve(estimator, X, y, cv=3, verbose=1) + learning_curve(estimator, X, y, cv=3, verbose=1) finally: out = sys.stdout.getvalue() sys.stdout.close() @@ -110,7 +110,7 @@ def test_learning_curve_incremental_learning(): n_clusters_per_class=1, random_state=0) estimator = MockIncrementalImprovingClassifier(20) n_samples_range, train_scores, test_scores = learning_curve( - estimator, X, y, cv=3, exploit_incremental_learning=True) + estimator, X, y, cv=3, exploit_incremental_learning=True) assert_array_equal(n_samples_range, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) @@ -124,13 +124,13 @@ def test_learning_curve_batch_and_incremental_learning_are_equal(): estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False) n_samples_range_inc, train_scores_inc, test_scores_inc = \ - learning_curve( - estimator, X, y, n_samples_range=n_samples_range, - cv=3, exploit_incremental_learning=True) + learning_curve( + estimator, X, y, n_samples_range=n_samples_range, + cv=3, exploit_incremental_learning=True) n_samples_range_batch, train_scores_batch, test_scores_batch = \ - learning_curve( - estimator, X, y, 
cv=3, n_samples_range=n_samples_range, - exploit_incremental_learning=False) + learning_curve( + estimator, X, y, cv=3, n_samples_range=n_samples_range, + exploit_incremental_learning=False) assert_array_equal(n_samples_range_inc, n_samples_range_batch) assert_array_almost_equal(train_scores_inc, train_scores_batch) @@ -157,9 +157,9 @@ def test_learning_curve_remove_duplicate_sample_sizes(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingClassifier(2) - n_samples_range, _, _ = assert_warns(RuntimeWarning, - learning_curve, estimator, X, y, cv=3, - n_samples_range=np.linspace(0.33, 1.0, 3)) + n_samples_range, _, _ = assert_warns( + RuntimeWarning, learning_curve, estimator, X, y, cv=3, + n_samples_range=np.linspace(0.33, 1.0, 3)) assert_array_equal(n_samples_range, [1, 2]) From 9ede03c4201ffb4d8a4ee89c7f2bfbfaab6d8999 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Mon, 6 Jan 2014 23:08:49 +0100 Subject: [PATCH 27/30] Refactoring * remove complicated function from grid_search * factor out get_scorer() * rename n_samples_range to samples_range --- examples/plot_learning_curve.py | 15 ++-- sklearn/grid_search.py | 104 +++++++-------------------- sklearn/learning_curve.py | 91 ++++++++++++----------- sklearn/metrics/scorer.py | 8 ++- sklearn/tests/test_learning_curve.py | 44 ++++++------ 5 files changed, 112 insertions(+), 150 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 2d3527a66fa62..21f43dafa75aa 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -9,7 +9,8 @@ the validation score and the training score converge to a value that is too low, we will not benefit much from more training data and we will probably have to use a learning algorithm or a parametrization of the -current learning algorithm with a lower bias. +current learning algorithm that can learn more complex concepts (i.e. +has a lower bias). In this example, on the left side the learning curve of a naive Bayes classifier is shown for the digits dataset. 
Note that the training score @@ -37,20 +38,20 @@ plt.title("Learning Curve (Naive Bayes)") plt.xlabel("Training examples") plt.ylabel("Score") -n_samples_range, train_scores, test_scores = learning_curve( +samples_range, train_scores, test_scores = learning_curve( GaussianNB(), X, y, cv=10, n_jobs=1) -plt.plot(n_samples_range, train_scores, label="Training score") -plt.plot(n_samples_range, test_scores, label="Cross-validation score") +plt.plot(samples_range, train_scores, label="Training score") +plt.plot(samples_range, test_scores, label="Cross-validation score") plt.legend(loc="best") plt.figure() plt.title("Learning Curve (SVM, RBF kernel, $\gamma=0.001$)") plt.xlabel("Training examples") plt.ylabel("Score") -n_samples_range, train_scores, test_scores = learning_curve( +samples_range, train_scores, test_scores = learning_curve( SVC(gamma=0.001), X, y, cv=10, n_jobs=1) -plt.plot(n_samples_range, train_scores, label="Training score") -plt.plot(n_samples_range, test_scores, label="Cross-validation score") +plt.plot(samples_range, train_scores, label="Training score") +plt.plot(samples_range, test_scores, label="Cross-validation score") plt.legend(loc="best") plt.show() diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 89e973e040a6f..108d320139c2e 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -239,9 +239,10 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, estimator = clone(base_estimator) estimator.set_params(**parameters) - this_score, n_test_samples = _split_and_score( - estimator, X, y, train, test, scorer, - return_train_score=False, **fit_params) + X_train, y_train = _split(estimator, X, y, train) + X_test, y_test = _split(estimator, X, y, test, train) + _fit(estimator.fit, X_train, y_train, **fit_params) + this_score = _score(estimator, X_test, y_test, scorer) if verbose > 2: msg += ", score=%f" % this_score @@ -251,28 +252,11 @@ def fit_grid_point(X, y, base_estimator, parameters, train, test, scorer, start_time)) print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - return this_score, parameters, n_test_samples - - -def _split_and_score(estimator, X, y, train, test, scorer, - return_train_score=False, partial_train=None, - **fit_params): - """Split the dataset in training and test set and compute scores. - - The dataset consists of of either a precomputed kernel matrix or an input - matrix 'X' and optional targets 'y' (y can be None for unsupervised - learners). It will be split according to the indices given by 'train' and - 'test'. Usually an estimator will be trained with the whole training set. - However, it is possible to reuse a previously trained model to train it - incrementally by passing the indices of the next training subset in - 'partial_train'. After splitting the dataset and fitting the estimator, the - scores will be computed on the test set according to the scoring function - of the estimator or with a given 'scorer'. It is possible to return the - score on the training set optionally by setting 'return_train_score'. The - function will return the score on the test set and the number of samples in - the test set and optionally the score on the training set and the number of - samples in the training set. 
- """ + return this_score, parameters, _num_samples(X_test) + + +def _split(estimator, X, y, indices, train_indices=None): + """Create subset of dataset.""" if hasattr(estimator, 'kernel') and callable(estimator.kernel): # cannot compute the kernel values with custom function raise ValueError("Cannot use a custom kernel function. " @@ -282,70 +266,33 @@ def _split_and_score(estimator, X, y, train, test, scorer, if getattr(estimator, "_pairwise", False): raise ValueError("Precomputed kernels or affinity matrices have " "to be passed as arrays or sparse matrices.") - X_train = [X[idx] for idx in train] - X_test = [X[idx] for idx in test] - if partial_train is not None: - X_partial_train = [X[idx] for idx in partial_train] + X_subset = [X[idx] for idx in indices] else: if getattr(estimator, "_pairwise", False): # X is a precomputed square kernel matrix if X.shape[0] != X.shape[1]: raise ValueError("X should be a square kernel matrix") - X_train = X[np.ix_(train, train)] - X_test = X[np.ix_(test, train)] - if partial_train is not None: - X_partial_train = X[np.ix_(partial_train, partial_train)] + if train_indices is None: + X_subset = X[np.ix_(indices, indices)] + else: + X_subset = X[np.ix_(indices, train_indices)] else: - X_train = X[safe_mask(X, train)] - X_test = X[safe_mask(X, test)] - if partial_train is not None: - X_partial_train = X[safe_mask(X, partial_train)] + X_subset = X[safe_mask(X, indices)] if y is not None: - y_test = y[safe_mask(y, test)] - y_train = y[safe_mask(y, train)] - if partial_train is not None: - y_partial_train = y[safe_mask(y, partial_train)] + y_subset = y[safe_mask(y, indices)] else: - y_test = None - y_train = None - if partial_train is not None: - y_partial_train = None + y_subset = None - if partial_train is None: - _fit(estimator, X_train, y_train, **fit_params) - else: - _fit_incremental(estimator, X_partial_train, y_partial_train, - **fit_params) - test_score = _score(estimator, X_test, y_test, scorer) - - if not isinstance(test_score, numbers.Number): - raise ValueError("scoring must return a number, got %s (%s)" - " instead." 
% (str(test_score), type(test_score)))
-
-    if return_train_score:
-        train_score = _score(estimator, X_train, y_train, scorer)
-        return (test_score, _num_samples(X_test), train_score,
-                _num_samples(X_train))
-    else:
-        return test_score, _num_samples(X_test)
+    return X_subset, y_subset
 
 
-def _fit(estimator, X_train, y_train, **fit_params):
+def _fit(fit_function, X_train, y_train, **fit_params):
     """Fit an estimator on a given training set."""
     if y_train is None:
-        estimator.fit(X_train, **fit_params)
+        fit_function(X_train, **fit_params)
     else:
-        estimator.fit(X_train, y_train, **fit_params)
-
-
-def _fit_incremental(estimator, X_partial_train, y_partial_train,
-                     **fit_params):
-    """Fit an estimator incrementally with a given training subset."""
-    if y_partial_train is None:
-        estimator.partial_fit(X_partial_train, **fit_params)
-    else:
-        estimator.partial_fit(X_partial_train, y_partial_train, **fit_params)
+        fit_function(X_train, y_train, **fit_params)
 
 
 def _score(estimator, X_test, y_test, scorer):
@@ -442,7 +389,8 @@ def __init__(self, estimator, scoring=None, loss_func=None,
         self.cv = cv
         self.verbose = verbose
         self.pre_dispatch = pre_dispatch
-        self._check_estimator()
+        _check_scorable(self.estimator, scoring=self.scoring,
+                        loss_func=self.loss_func, score_func=self.score_func)
 
     def score(self, X, y=None):
         """Returns the score on the given test data and labels, if the search
@@ -487,11 +435,7 @@ def decision_function(self):
     @property
     def transform(self):
         return self.best_estimator_.transform
-
-    def _check_estimator(self):
-        _check_scorable(self.estimator, scoring=self.scoring,
-                        loss_func=self.loss_func, score_func=self.score_func)
-
+    
     def _fit(self, X, y, parameter_iterable):
         """Actual fitting, performing the search over parameters."""
 
diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
index a5343c1e4f148..20029e2633e41 100644
--- a/sklearn/learning_curve.py
+++ b/sklearn/learning_curve.py
@@ -8,11 +8,11 @@
 from .cross_validation import _check_cv
 from .utils import check_arrays
 from .externals.joblib import Parallel, delayed
-from .metrics.scorer import _deprecate_loss_and_score_funcs
-from .grid_search import _check_scorable, _split_and_score
+from .metrics.scorer import get_scorer
+from .grid_search import _check_scorable, _split, _fit, _score
 
 
-def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10),
+def learning_curve(estimator, X, y, samples_range=np.linspace(0.1, 1.0, 10),
                    cv=None, scoring=None, exploit_incremental_learning=False,
                    n_jobs=1, pre_dispatch="all", verbose=0):
     """Learning curve
@@ -39,7 +39,7 @@ def learning_curve(estimator, X, y, samples_range=np.linspace(0.1, 1.0, 10),
         Target relative to X for classification or regression;
         None for unsupervised learning.
 
-    n_samples_range : array-like, shape = [n_ticks,], dtype float or int
+    samples_range : array-like, shape = [n_ticks,], dtype float or int
         Numbers of training examples that will be used to generate the
         learning curve. If the dtype is float, it is regarded as a
         fraction of the maximum size of the training set (that is determined
@@ -75,7 +75,7 @@ def learning_curve(estimator, X, y, samples_range=np.linspace(0.1, 1.0, 10),
 
     Returns
     -------
-    n_samples_range : array, shape = [n_unique_ticks,], dtype int
+    samples_range_abs : array, shape = [n_unique_ticks,], dtype int
         Numbers of training examples that have been used to generate
         the learning curve. Note that the number of ticks might be less
         than n_ticks because duplicate entries will be removed.
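# How the flat Parallel output turns into the return values documented
# above, as an illustrative sketch with toy shapes (not part of the patch):
import numpy as np

n_cv_folds, n_unique_ticks = 3, 4
# the batch branch yields one (train_score, test_score) pair per
# (fold, tick), folds in the outer loop, ticks in the inner one
out = np.zeros((n_cv_folds * n_unique_ticks, 2))
out = out.reshape(n_cv_folds, n_unique_ticks, 2)
avg_over_cv = out.mean(axis=0)        # shape (n_unique_ticks, 2)
train_scores = avg_over_cv[:, 0]      # one mean training score per tick
test_scores = avg_over_cv[:, 1]       # one mean test score per tick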
@@ -103,16 +103,16 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), cv = new_cv n_max_training_samples = len(cv[0][0]) - n_samples_range, n_unique_ticks = _translate_n_samples_range( - n_samples_range, n_max_training_samples) + samples_range_abs, n_unique_ticks = _translate_samples_range( + samples_range, n_max_training_samples) # Because the lengths of folds can be significantly different, it is # not guaranteed that we use all of the available training data when we # use the first 'n_max_training_samples' samples. if verbose > 0: - print("[learning_curve] Training set sizes: " + str(n_samples_range)) + print("[learning_curve] Training set sizes: " + str(samples_range_abs)) _check_scorable(estimator, scoring=scoring) - scorer = _deprecate_loss_and_score_funcs(scoring=scoring) + scorer = get_scorer(scoring) parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) @@ -122,75 +122,86 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10), else: classes = None out = parallel(delayed(_incremental_fit_estimator)( - estimator, X, y, classes, train, test, n_samples_range, scorer, + estimator, X, y, classes, train, test, samples_range_abs, scorer, verbose) for train, test in cv) else: out = parallel(delayed(_fit_estimator)( estimator, X, y, train, test, n_train_samples, scorer, verbose) - for train, test in cv for n_train_samples in n_samples_range) + for train, test in cv for n_train_samples in samples_range_abs) out = np.array(out) n_cv_folds = out.shape[0]/n_unique_ticks out = out.reshape(n_cv_folds, n_unique_ticks, 2) avg_over_cv = np.asarray(out).mean(axis=0).reshape(n_unique_ticks, 2) - return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1] + return samples_range_abs, avg_over_cv[:, 0], avg_over_cv[:, 1] -def _translate_n_samples_range(n_samples_range, n_max_training_samples): - """Determine range of number of training samples""" - n_samples_range = np.asarray(n_samples_range) - n_ticks = n_samples_range.shape[0] - n_min_required_samples = np.min(n_samples_range) - n_max_required_samples = np.max(n_samples_range) - if np.issubdtype(n_samples_range.dtype, np.float): +def _translate_samples_range(samples_range, n_max_training_samples): + """Determine range of number of training samples. + + If the dtype of samples_range is float, the numbers will be interpreted as + fractions of n_max_training_samples. Otherwise they will be interpreted + as absolute values with n_max_training_samples as maximum value. + """ + samples_range_abs = np.asarray(samples_range) + n_ticks = samples_range_abs.shape[0] + n_min_required_samples = np.min(samples_range_abs) + n_max_required_samples = np.max(samples_range_abs) + if np.issubdtype(samples_range_abs.dtype, np.float): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("n_samples_range must be within (0, 1], " + raise ValueError("samples_range must be within (0, 1], " "but is within [%f, %f]." 
% (n_min_required_samples, n_max_required_samples)) - n_samples_range = (n_samples_range - * n_max_training_samples).astype(np.int) - n_samples_range = np.clip(n_samples_range, 1, n_max_training_samples) + samples_range_abs = (samples_range_abs + * n_max_training_samples).astype(np.int) + samples_range_abs = np.clip(samples_range_abs, 1, + n_max_training_samples) else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): - raise ValueError("n_samples_range must be within (0, %d], " + raise ValueError("samples_range must be within (0, %d], " "but is within [%d, %d]." % (n_max_training_samples, n_min_required_samples, n_max_required_samples)) - n_samples_range = np.unique(n_samples_range) - n_unique_ticks = n_samples_range.shape[0] + samples_range_abs = np.unique(samples_range_abs) + n_unique_ticks = samples_range_abs.shape[0] if n_ticks > n_unique_ticks: warnings.warn("Number of ticks will be less than than the size of " - "'n_samples_range' (%d instead of %d)." + "'samples_range' (%d instead of %d)." % (n_unique_ticks, n_ticks), RuntimeWarning) - return n_samples_range, n_unique_ticks + return samples_range_abs, n_unique_ticks def _fit_estimator(base_estimator, X, y, train, test, n_train_samples, scorer, verbose): + train_subset = train[:n_train_samples] estimator = clone(base_estimator) - test_score, _, train_score, _ = _split_and_score( - estimator, X, y, train=train[:n_train_samples], test=test, - scorer=scorer, return_train_score=True) + X_train, y_train = _split(estimator, X, y, train_subset) + X_test, y_test = _split(estimator, X, y, test, train_subset) + _fit(estimator.fit, X_train, y_train) + train_score = _score(estimator, X_train, y_train, scorer) + test_score = _score(estimator, X_test, y_test, scorer) return train_score, test_score def _incremental_fit_estimator(base_estimator, X, y, classes, train, test, - n_samples_range, scorer, verbose): + samples_range, scorer, verbose): estimator = clone(base_estimator) train_scores, test_scores = [], [] - for n_train_samples, partial_train in zip(n_samples_range, + for n_train_samples, partial_train in zip(samples_range, np.split(train, - n_samples_range)[:-1]): - test_score, _, train_score, _ = _split_and_score( - estimator, X, y, train=train[:n_train_samples], - partial_train=partial_train, test=test, scorer=scorer, - return_train_score=True, classes=classes) - train_scores.append(train_score) - test_scores.append(test_score) + samples_range)[:-1]): + X_train, y_train = _split(estimator, X, y, train[:n_train_samples]) + X_partial_train, y_partial_train = _split(estimator, X, y, + partial_train) + X_test, y_test = _split(estimator, X, y, test, train[:n_train_samples]) + _fit(estimator.partial_fit, X_partial_train, y_partial_train, + classes=classes) + train_scores.append(_score(estimator, X_train, y_train, scorer)) + test_scores.append(_score(estimator, X_test, y_test, scorer)) return np.array((train_scores, test_scores)).T diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 289c32ec3cfb5..2a28495890ba2 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -180,7 +180,13 @@ def _deprecate_loss_and_score_funcs( if loss_func is None or score_overrides_loss: scorer = make_scorer(score_func) - elif isinstance(scoring, six.string_types): + else: + scorer = get_scorer(scoring) + return scorer + + +def get_scorer(scoring): + if isinstance(scoring, six.string_types): try: scorer = SCORERS[scoring] except KeyError: diff --git a/sklearn/tests/test_learning_curve.py 
b/sklearn/tests/test_learning_curve.py index 7b0f23efd3900..0c10286896881 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -68,9 +68,9 @@ def test_learning_curve(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingClassifier(20) - n_samples_range, train_scores, test_scores = learning_curve(estimator, - X, y, cv=3) - assert_array_equal(n_samples_range, np.linspace(2, 20, 10)) + samples_range, train_scores, test_scores = learning_curve(estimator, X, y, + cv=3) + assert_array_equal(samples_range, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) @@ -84,7 +84,7 @@ def test_learning_curve_verbose(): old_stdout = sys.stdout sys.stdout = StringIO() try: - n_samples_range, train_scores, test_scores = \ + samples_range, train_scores, test_scores = \ learning_curve(estimator, X, y, cv=3, verbose=1) finally: out = sys.stdout.getvalue() @@ -109,9 +109,9 @@ def test_learning_curve_incremental_learning(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockIncrementalImprovingClassifier(20) - n_samples_range, train_scores, test_scores = learning_curve( + samples_range, train_scores, test_scores = learning_curve( estimator, X, y, cv=3, exploit_incremental_learning=True) - assert_array_equal(n_samples_range, np.linspace(2, 20, 10)) + assert_array_equal(samples_range, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) @@ -120,19 +120,19 @@ def test_learning_curve_batch_and_incremental_learning_are_equal(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) - n_samples_range = np.linspace(0.2, 1.0, 5) + samples_range = np.linspace(0.2, 1.0, 5) estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False) - n_samples_range_inc, train_scores_inc, test_scores_inc = \ + samples_range_inc, train_scores_inc, test_scores_inc = \ learning_curve( - estimator, X, y, n_samples_range=n_samples_range, + estimator, X, y, samples_range=samples_range, cv=3, exploit_incremental_learning=True) - n_samples_range_batch, train_scores_batch, test_scores_batch = \ + samples_range_batch, train_scores_batch, test_scores_batch = \ learning_curve( - estimator, X, y, cv=3, n_samples_range=n_samples_range, + estimator, X, y, cv=3, samples_range=samples_range, exploit_incremental_learning=False) - assert_array_equal(n_samples_range_inc, n_samples_range_batch) + assert_array_equal(samples_range_inc, samples_range_batch) assert_array_almost_equal(train_scores_inc, train_scores_batch) assert_array_almost_equal(test_scores_inc, test_scores_batch) @@ -143,13 +143,13 @@ def test_learning_curve_n_sample_range_out_of_bounds(): n_clusters_per_class=1, random_state=0) estimator = MockImprovingClassifier(20) assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - n_samples_range=[0.0, 1.0]) + samples_range=[0.0, 1.0]) assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - n_samples_range=[0.1, 1.1]) + samples_range=[0.1, 1.1]) assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - n_samples_range=[0, 20]) + samples_range=[0, 20]) assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - n_samples_range=[1, 21]) + samples_range=[1, 21]) def 
test_learning_curve_remove_duplicate_sample_sizes(): @@ -157,10 +157,10 @@ def test_learning_curve_remove_duplicate_sample_sizes(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingClassifier(2) - n_samples_range, _, _ = assert_warns( + samples_range, _, _ = assert_warns( RuntimeWarning, learning_curve, estimator, X, y, cv=3, - n_samples_range=np.linspace(0.33, 1.0, 3)) - assert_array_equal(n_samples_range, [1, 2]) + samples_range=np.linspace(0.33, 1.0, 3)) + assert_array_equal(samples_range, [1, 2]) def test_learning_curve_with_boolean_indices(): @@ -169,8 +169,8 @@ def test_learning_curve_with_boolean_indices(): n_clusters_per_class=1, random_state=0) estimator = MockImprovingClassifier(20) cv = KFold(n=30, n_folds=3, indices=False) - n_samples_range, train_scores, test_scores = learning_curve(estimator, - X, y, cv=cv) - assert_array_equal(n_samples_range, np.linspace(2, 20, 10)) + samples_range, train_scores, test_scores = learning_curve(estimator, X, y, + cv=cv) + assert_array_equal(samples_range, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) From 93f1acb8e3ea235c5366195ec09e6ce85fbac373 Mon Sep 17 00:00:00 2001 From: Alexander Fabisch Date: Tue, 7 Jan 2014 08:09:36 +0100 Subject: [PATCH 28/30] Rename samples_range to train_sizes --- examples/plot_learning_curve.py | 12 +++--- sklearn/learning_curve.py | 55 ++++++++++++------------ sklearn/tests/test_learning_curve.py | 62 ++++++++++++++-------------- 3 files changed, 64 insertions(+), 65 deletions(-) diff --git a/examples/plot_learning_curve.py b/examples/plot_learning_curve.py index 21f43dafa75aa..5ad555a2af562 100644 --- a/examples/plot_learning_curve.py +++ b/examples/plot_learning_curve.py @@ -38,20 +38,20 @@ plt.title("Learning Curve (Naive Bayes)") plt.xlabel("Training examples") plt.ylabel("Score") -samples_range, train_scores, test_scores = learning_curve( +train_sizes, train_scores, test_scores = learning_curve( GaussianNB(), X, y, cv=10, n_jobs=1) -plt.plot(samples_range, train_scores, label="Training score") -plt.plot(samples_range, test_scores, label="Cross-validation score") +plt.plot(train_sizes, train_scores, label="Training score") +plt.plot(train_sizes, test_scores, label="Cross-validation score") plt.legend(loc="best") plt.figure() plt.title("Learning Curve (SVM, RBF kernel, $\gamma=0.001$)") plt.xlabel("Training examples") plt.ylabel("Score") -samples_range, train_scores, test_scores = learning_curve( +train_sizes, train_scores, test_scores = learning_curve( SVC(gamma=0.001), X, y, cv=10, n_jobs=1) -plt.plot(samples_range, train_scores, label="Training score") -plt.plot(samples_range, test_scores, label="Cross-validation score") +plt.plot(train_sizes, train_scores, label="Training score") +plt.plot(train_sizes, test_scores, label="Cross-validation score") plt.legend(loc="best") plt.show() diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 20029e2633e41..25ca2270bfd44 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -12,7 +12,7 @@ from .grid_search import _check_scorable, _split, _fit, _score -def learning_curve(estimator, X, y, samples_range=np.linspace(0.1, 1.0, 10), +def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=1, pre_dispatch="all", verbose=0): """Learning curve @@ -39,7 +39,7 @@ def learning_curve(estimator, X, y, 
samples_range=np.linspace(0.1, 1.0, 10),
         Target relative to X for classification or regression;
         None for unsupervised learning.
 
-    samples_range : array-like, shape = [n_ticks,], dtype float or int
+    train_sizes : array-like, shape = [n_ticks,], dtype float or int
         Numbers of training examples that will be used to generate the
         learning curve. If the dtype is float, it is regarded as a
         fraction of the maximum size of the training set (that is determined
@@ -75,7 +75,7 @@ def learning_curve(estimator, X, y, samples_range=np.linspace(0.1, 1.0, 10),
 
     Returns
     -------
-    samples_range_abs : array, shape = [n_unique_ticks,], dtype int
+    train_sizes_abs : array, shape = [n_unique_ticks,], dtype int
         Numbers of training examples that have been used to generate the
         learning curve. Note that the number of ticks might be less
         than n_ticks because duplicate entries will be removed.
@@ -103,13 +103,13 @@ def learning_curve(estimator, X, y, samples_range=np.linspace(0.1, 1.0, 10),
         cv = new_cv
     n_max_training_samples = len(cv[0][0])
-    samples_range_abs, n_unique_ticks = _translate_samples_range(
-        samples_range, n_max_training_samples)
+    train_sizes_abs, n_unique_ticks = _translate_train_sizes(
+        train_sizes, n_max_training_samples)
     # Because the lengths of folds can be significantly different, it is
     # not guaranteed that we use all of the available training data when we
     # use the first 'n_max_training_samples' samples.
     if verbose > 0:
-        print("[learning_curve] Training set sizes: " + str(samples_range_abs))
+        print("[learning_curve] Training set sizes: " + str(train_sizes_abs))
 
     _check_scorable(estimator, scoring=scoring)
     scorer = get_scorer(scoring)
@@ -122,59 +122,59 @@ def learning_curve(estimator, X, y, samples_range=np.linspace(0.1, 1.0, 10),
         else:
             classes = None
         out = parallel(delayed(_incremental_fit_estimator)(
-            estimator, X, y, classes, train, test, samples_range_abs, scorer,
+            estimator, X, y, classes, train, test, train_sizes_abs, scorer,
             verbose) for train, test in cv)
     else:
         out = parallel(delayed(_fit_estimator)(
             estimator, X, y, train, test, n_train_samples, scorer, verbose)
-            for train, test in cv for n_train_samples in samples_range_abs)
+            for train, test in cv for n_train_samples in train_sizes_abs)
     out = np.array(out)
     n_cv_folds = out.shape[0]/n_unique_ticks
     out = out.reshape(n_cv_folds, n_unique_ticks, 2)
 
     avg_over_cv = np.asarray(out).mean(axis=0).reshape(n_unique_ticks, 2)
-    return samples_range_abs, avg_over_cv[:, 0], avg_over_cv[:, 1]
+    return train_sizes_abs, avg_over_cv[:, 0], avg_over_cv[:, 1]
 
 
-def _translate_samples_range(samples_range, n_max_training_samples):
+def _translate_train_sizes(train_sizes, n_max_training_samples):
     """Determine range of number of training samples.
 
-    If the dtype of samples_range is float, the numbers will be interpreted as
+    If the dtype of train_sizes is float, the numbers will be interpreted as
     fractions of n_max_training_samples. Otherwise they will be interpreted
     as absolute values with n_max_training_samples as maximum value.
""" - samples_range_abs = np.asarray(samples_range) - n_ticks = samples_range_abs.shape[0] - n_min_required_samples = np.min(samples_range_abs) - n_max_required_samples = np.max(samples_range_abs) - if np.issubdtype(samples_range_abs.dtype, np.float): + train_sizes_abs = np.asarray(train_sizes) + n_ticks = train_sizes_abs.shape[0] + n_min_required_samples = np.min(train_sizes_abs) + n_max_required_samples = np.max(train_sizes_abs) + if np.issubdtype(train_sizes_abs.dtype, np.float): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("samples_range must be within (0, 1], " + raise ValueError("train_sizes must be within (0, 1], " "but is within [%f, %f]." % (n_min_required_samples, n_max_required_samples)) - samples_range_abs = (samples_range_abs + train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype(np.int) - samples_range_abs = np.clip(samples_range_abs, 1, + train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): - raise ValueError("samples_range must be within (0, %d], " + raise ValueError("train_sizes must be within (0, %d], " "but is within [%d, %d]." % (n_max_training_samples, n_min_required_samples, n_max_required_samples)) - samples_range_abs = np.unique(samples_range_abs) - n_unique_ticks = samples_range_abs.shape[0] + train_sizes_abs = np.unique(train_sizes_abs) + n_unique_ticks = train_sizes_abs.shape[0] if n_ticks > n_unique_ticks: warnings.warn("Number of ticks will be less than than the size of " - "'samples_range' (%d instead of %d)." + "'train_sizes' (%d instead of %d)." % (n_unique_ticks, n_ticks), RuntimeWarning) - return samples_range_abs, n_unique_ticks + return train_sizes_abs, n_unique_ticks def _fit_estimator(base_estimator, X, y, train, test, @@ -190,12 +190,11 @@ def _fit_estimator(base_estimator, X, y, train, test, def _incremental_fit_estimator(base_estimator, X, y, classes, train, test, - samples_range, scorer, verbose): + train_sizes, scorer, verbose): estimator = clone(base_estimator) train_scores, test_scores = [], [] - for n_train_samples, partial_train in zip(samples_range, - np.split(train, - samples_range)[:-1]): + for n_train_samples, partial_train in zip(train_sizes, + np.split(train, train_sizes)[:-1]): X_train, y_train = _split(estimator, X, y, train[:n_train_samples]) X_partial_train, y_partial_train = _split(estimator, X, y, partial_train) diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 0c10286896881..66b8f36279b97 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -17,14 +17,14 @@ class MockImprovingClassifier(object): """Dummy classifier to test the learning curve""" - def __init__(self, n_max_train_samples): - self.n_max_train_samples = n_max_train_samples - self.n_train_samples = 0 + def __init__(self, n_max_train_sizes): + self.n_max_train_sizes = n_max_train_sizes + self.train_sizes = 0 self.X_subset = None def fit(self, X_subset, y_subset): self.X_subset = X_subset - self.n_train_samples = X_subset.shape[0] + self.train_sizes = X_subset.shape[0] return self def predict(self, X): @@ -33,33 +33,33 @@ def predict(self, X): def score(self, X=None, Y=None): # training score becomes worse (2 -> 1), test error better (0 -> 1) if self._is_training_data(X): - return 2. - float(self.n_train_samples) / self.n_max_train_samples + return 2. 
- float(self.train_sizes) / self.n_max_train_sizes else: - return float(self.n_train_samples) / self.n_max_train_samples + return float(self.train_sizes) / self.n_max_train_sizes def _is_training_data(self, X): return X is self.X_subset def get_params(self, deep=False): - return {"n_max_train_samples": self.n_max_train_samples} + return {"n_max_train_sizes": self.n_max_train_sizes} def set_params(self, **params): - self.n_max_train_samples = params["n_max_train_samples"] + self.n_max_train_sizes = params["n_max_train_sizes"] return self class MockIncrementalImprovingClassifier(MockImprovingClassifier): """Dummy classifier that provides partial_fit""" - def __init__(self, n_max_train_samples): + def __init__(self, n_max_train_sizes): super(MockIncrementalImprovingClassifier, self).__init__( - n_max_train_samples) + n_max_train_sizes) self.x = None def _is_training_data(self, X): return self.x in X def partial_fit(self, X, y, **params): - self.n_train_samples += X.shape[0] + self.train_sizes += X.shape[0] self.x = X[0] @@ -68,9 +68,9 @@ def test_learning_curve(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingClassifier(20) - samples_range, train_scores, test_scores = learning_curve(estimator, X, y, + train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=3) - assert_array_equal(samples_range, np.linspace(2, 20, 10)) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) @@ -84,7 +84,7 @@ def test_learning_curve_verbose(): old_stdout = sys.stdout sys.stdout = StringIO() try: - samples_range, train_scores, test_scores = \ + train_sizes, train_scores, test_scores = \ learning_curve(estimator, X, y, cv=3, verbose=1) finally: out = sys.stdout.getvalue() @@ -109,9 +109,9 @@ def test_learning_curve_incremental_learning(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockIncrementalImprovingClassifier(20) - samples_range, train_scores, test_scores = learning_curve( + train_sizes, train_scores, test_scores = learning_curve( estimator, X, y, cv=3, exploit_incremental_learning=True) - assert_array_equal(samples_range, np.linspace(2, 20, 10)) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10)) assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10)) @@ -120,19 +120,19 @@ def test_learning_curve_batch_and_incremental_learning_are_equal(): X, y = make_classification(n_samples=30, n_features=1, n_informative=1, n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) - samples_range = np.linspace(0.2, 1.0, 5) + train_sizes = np.linspace(0.2, 1.0, 5) estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False) - samples_range_inc, train_scores_inc, test_scores_inc = \ + train_sizes_inc, train_scores_inc, test_scores_inc = \ learning_curve( - estimator, X, y, samples_range=samples_range, + estimator, X, y, train_sizes=train_sizes, cv=3, exploit_incremental_learning=True) - samples_range_batch, train_scores_batch, test_scores_batch = \ + train_sizes_batch, train_scores_batch, test_scores_batch = \ learning_curve( - estimator, X, y, cv=3, samples_range=samples_range, + estimator, X, y, cv=3, train_sizes=train_sizes, exploit_incremental_learning=False) - assert_array_equal(samples_range_inc, samples_range_batch) + assert_array_equal(train_sizes_inc, train_sizes_batch) 
assert_array_almost_equal(train_scores_inc, train_scores_batch)
     assert_array_almost_equal(test_scores_inc, test_scores_batch)
 
 
@@ -143,13 +143,13 @@ def test_learning_curve_n_sample_range_out_of_bounds():
                                n_clusters_per_class=1, random_state=0)
     estimator = MockImprovingClassifier(20)
     assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
-                  samples_range=[0.0, 1.0])
+                  train_sizes=[0.0, 1.0])
     assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
-                  samples_range=[0.1, 1.1])
+                  train_sizes=[0.1, 1.1])
     assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
-                  samples_range=[0, 20])
+                  train_sizes=[0, 20])
     assert_raises(ValueError, learning_curve, estimator, X, y, cv=3,
-                  samples_range=[1, 21])
+                  train_sizes=[1, 21])
 
 
 def test_learning_curve_remove_duplicate_sample_sizes():
@@ -157,10 +157,10 @@ def test_learning_curve_remove_duplicate_sample_sizes():
                                n_redundant=0, n_classes=2,
                                n_clusters_per_class=1, random_state=0)
     estimator = MockImprovingClassifier(2)
-    samples_range, _, _ = assert_warns(
+    train_sizes, _, _ = assert_warns(
         RuntimeWarning, learning_curve, estimator, X, y, cv=3,
-        samples_range=np.linspace(0.33, 1.0, 3))
-    assert_array_equal(samples_range, [1, 2])
+        train_sizes=np.linspace(0.33, 1.0, 3))
+    assert_array_equal(train_sizes, [1, 2])
 
 
 def test_learning_curve_with_boolean_indices():
@@ -169,8 +169,8 @@ def test_learning_curve_with_boolean_indices():
                                n_clusters_per_class=1, random_state=0)
     estimator = MockImprovingClassifier(20)
     cv = KFold(n=30, n_folds=3, indices=False)
-    samples_range, train_scores, test_scores = learning_curve(estimator, X, y,
+    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y,
                                                               cv=cv)
-    assert_array_equal(samples_range, np.linspace(2, 20, 10))
+    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
     assert_array_almost_equal(train_scores, np.linspace(1.9, 1.0, 10))
     assert_array_almost_equal(test_scores, np.linspace(0.1, 1.0, 10))

From 974812725111ff23479689007d6df9ddba594e30 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Tue, 7 Jan 2014 22:37:58 +0100
Subject: [PATCH 29/30] Add documentation of '_translate_train_sizes'

---
 sklearn/learning_curve.py | 63 ++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 21 deletions(-)

diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
index 25ca2270bfd44..09da1524e8595 100644
--- a/sklearn/learning_curve.py
+++ b/sklearn/learning_curve.py
@@ -39,11 +39,12 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10),
         Target relative to X for classification or regression;
         None for unsupervised learning.
 
-    train_sizes : array-like, shape = [n_ticks,], dtype float or int
-        Numbers of training examples that will be used to generate the
-        learning curve. If the dtype is float, it is regarded as a
+    train_sizes : array-like, shape = (n_ticks,), dtype float or int
+        Relative or absolute numbers of training examples that will be used to
+        generate the learning curve. If the dtype is float, it is regarded as a
         fraction of the maximum size of the training set (that is determined
         by the selected validation method), i.e. it has to be within (0, 1].
+        Otherwise it is interpreted as absolute sizes of the training sets.
         Note that for classification the number of samples usually has to be
         big enough to contain at least one sample from each class.
(default: np.linspace(0.1, 1.0, 10)) @@ -103,11 +104,12 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv = new_cv n_max_training_samples = len(cv[0][0]) - train_sizes_abs, n_unique_ticks = _translate_train_sizes( - train_sizes, n_max_training_samples) # Because the lengths of folds can be significantly different, it is # not guaranteed that we use all of the available training data when we # use the first 'n_max_training_samples' samples. + train_sizes_abs = _translate_train_sizes(train_sizes, + n_max_training_samples) + n_unique_ticks = train_sizes_abs.shape[0] if verbose > 0: print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) @@ -138,11 +140,28 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10), def _translate_train_sizes(train_sizes, n_max_training_samples): - """Determine range of number of training samples. + """Determine absolute sizes of training subsets and validate 'train_sizes'. + + Examples: + _translate_train_sizes([0.5, 1.0], 10) -> ([5, 10], 2) + _translate_train_sizes([5, 10], 10) -> ([5, 10], 2) + + Parameters + ---------- + train_sizes : array-like, shape = (n_ticks,), dtype float or int + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. - If the dtype of train_sizes is float, the numbers will be interpreted as - fractions of n_max_training_samples. Otherwise they will be interpreted - as absolute values with n_max_training_samples as maximum value. + n_max_training_samples : int + Maximum number of training samples (upper bound of 'train_sizes'). + + Returns + ------- + train_sizes_abs : array, shape = [n_unique_ticks,], dtype int + Numbers of training examples that will be used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. """ train_sizes_abs = np.asarray(train_sizes) n_ticks = train_sizes_abs.shape[0] @@ -150,31 +169,33 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): n_max_required_samples = np.max(train_sizes_abs) if np.issubdtype(train_sizes_abs.dtype, np.float): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("train_sizes must be within (0, 1], " - "but is within [%f, %f]." + raise ValueError("train_sizes has been interpreted as fractions of " + "the maximum number of training samples and must " + "be within (0, 1], but is within [%f, %f]." % (n_min_required_samples, n_max_required_samples)) train_sizes_abs = (train_sizes_abs - * n_max_training_samples).astype(np.int) + * n_max_training_samples).astype(np.int) train_sizes_abs = np.clip(train_sizes_abs, 1, - n_max_training_samples) + n_max_training_samples) else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): - raise ValueError("train_sizes must be within (0, %d], " - "but is within [%d, %d]." + raise ValueError("train_sizes has been interpreted as absolute " + "numbers of training samples and must be within " + "(0, %d], but is within [%d, %d]." % (n_max_training_samples, n_min_required_samples, n_max_required_samples)) train_sizes_abs = np.unique(train_sizes_abs) - n_unique_ticks = train_sizes_abs.shape[0] - if n_ticks > n_unique_ticks: - warnings.warn("Number of ticks will be less than than the size of " - "'train_sizes' (%d instead of %d)." 
-                      % (n_unique_ticks, n_ticks), RuntimeWarning)
+    if n_ticks > train_sizes_abs.shape[0]:
+        warnings.warn("Removed duplicate entries from 'train_sizes'. Number of "
+                      "ticks will be less than the size of 'train_sizes' "
+                      "(%d instead of %d)."
+                      % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning)
 
-    return train_sizes_abs, n_unique_ticks
+    return train_sizes_abs
 
 
 def _fit_estimator(base_estimator, X, y, train, test,

From 822bd7b04a0029680a07c6301eab1d78f906bf79 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Wed, 8 Jan 2014 00:31:27 +0100
Subject: [PATCH 30/30] Improve documentation of private functions

---
 sklearn/learning_curve.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
index 09da1524e8595..f6ce0e04922e4 100644
--- a/sklearn/learning_curve.py
+++ b/sklearn/learning_curve.py
@@ -143,8 +143,8 @@ def _translate_train_sizes(train_sizes, n_max_training_samples):
     """Determine absolute sizes of training subsets and validate 'train_sizes'.
 
     Examples:
-        _translate_train_sizes([0.5, 1.0], 10) -> ([5, 10], 2)
-        _translate_train_sizes([5, 10], 10) -> ([5, 10], 2)
+        _translate_train_sizes([0.5, 1.0], 10) -> [5, 10]
+        _translate_train_sizes([5, 10], 10) -> [5, 10]
 
     Parameters
     ----------
@@ -200,6 +200,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples):
 
 def _fit_estimator(base_estimator, X, y, train, test,
                    n_train_samples, scorer, verbose):
+    """Train estimator on a training subset and compute scores."""
     train_subset = train[:n_train_samples]
     estimator = clone(base_estimator)
     X_train, y_train = _split(estimator, X, y, train_subset)
@@ -212,6 +213,7 @@ def _fit_estimator(base_estimator, X, y, train, test,
 
 def _incremental_fit_estimator(base_estimator, X, y, classes, train, test,
                                train_sizes, scorer, verbose):
+    """Train estimator on training subsets incrementally and compute scores."""
     estimator = clone(base_estimator)
     train_scores, test_scores = [], []
     for n_train_samples, partial_train in zip(train_sizes,
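A closing note on the trickiest expression in _incremental_fit_estimator above: train_sizes doubles as the list of offsets at which np.split cuts the training indices, so each loop iteration hands partial_fit only the samples that are new at that tick, and the chunks seen so far always union to train[:n_train_samples]. A minimal sketch of that pairing, with made-up indices and ticks (not data from the patch):

    import numpy as np

    train = np.arange(100, 112)         # made-up indices of one CV training fold
    train_sizes = np.array([4, 8, 12])  # absolute ticks, strictly increasing
    # np.split cuts 'train' at every tick; [:-1] drops the leftover chunk past
    # the last tick (empty here), which the learning curve never trains on.
    for n_train_samples, partial_train in zip(train_sizes,
                                              np.split(train, train_sizes)[:-1]):
        print(n_train_samples, partial_train)
    # prints: 4 [100 101 102 103]
    #         8 [104 105 106 107]
    #         12 [108 109 110 111]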