From 3e139f1328cb962d9a22953310285988441e5468 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 16 Nov 2016 18:29:24 +0530 Subject: [PATCH 01/20] Fix the cross_val_predict function for method='predict_proba' --- sklearn/model_selection/_validation.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 23db2a9cebc77..46593d0963c85 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -15,6 +15,7 @@ import warnings import numbers import time +import operator import numpy as np import scipy.sparse as sp @@ -365,7 +366,9 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, as in '2*n_jobs' method : string, optional, default: 'predict' - Invokes the passed method name of the passed estimator. + Invokes the passed method name of the passed estimator. For + method='predict_proba', the columns correspond to the classes + in sorted order. 
Returns ------- @@ -474,6 +477,20 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) + if method is 'predict_proba' and is callable(getattr(estimator, 'classes_')): + class_func = getattr(estimator, 'classes_') + classes = class_func(X_test) + class_map = dict() + for i in range(len(classes)): + class_map[i] = classes[i] + + #pred = np.empty(predictions.shape) + #for i,(x,y) in enumerate(sorted(class_map.items(), key=operator.itemgetter(1))): + # pred[i] = predictions[x] + #predictions = pred + + sorted_indices=[x for (x,y) in sorted(class_map.items(), key=operator.itemgetter(1))] + predictions=predictions[sorted_indices] return predictions, test From ab85f948b0e5db4241ca5f12f097fad198998c58 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 16 Nov 2016 19:12:53 +0530 Subject: [PATCH 02/20] Fix the syntax error and cleaning up code --- sklearn/model_selection/_validation.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 46593d0963c85..46cbad26c8c70 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -477,18 +477,11 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) - if method is 'predict_proba' and is callable(getattr(estimator, 'classes_')): - class_func = getattr(estimator, 'classes_') - classes = class_func(X_test) + if method is 'predict_proba' and hasattr(estimator, 'classes_'): + classes = getattr(estimator, 'classes_') class_map = dict() for i in range(len(classes)): class_map[i] = classes[i] - - #pred = np.empty(predictions.shape) - #for i,(x,y) in enumerate(sorted(class_map.items(), key=operator.itemgetter(1))): - # pred[i] = 
predictions[x] - #predictions = pred - sorted_indices=[x for (x,y) in sorted(class_map.items(), key=operator.itemgetter(1))] predictions=predictions[sorted_indices] return predictions, test From e835854ab42f4f349a0ca7e58a0337c3617d1ee4 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 16 Nov 2016 23:38:25 +0530 Subject: [PATCH 03/20] STY: style fix (PEP8) --- sklearn/model_selection/_validation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 46cbad26c8c70..246ec026b590f 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -482,8 +482,9 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, class_map = dict() for i in range(len(classes)): class_map[i] = classes[i] - sorted_indices=[x for (x,y) in sorted(class_map.items(), key=operator.itemgetter(1))] - predictions=predictions[sorted_indices] + sorted_indices = [i for i, j in sorted(class_map.items(), + key=operator.itemgetter(1))] + predictions = predictions[sorted_indices] return predictions, test From 295a1e72e27c3be6e8d73faa992673395b769760 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 17 Nov 2016 01:00:14 +0530 Subject: [PATCH 04/20] BUG: remove bug in _fit_and_predict --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 246ec026b590f..c6f3ffa991aed 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -484,7 +484,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, class_map[i] = classes[i] sorted_indices = [i for i, j in sorted(class_map.items(), key=operator.itemgetter(1))] - predictions = predictions[sorted_indices] + predictions = predictions[:, sorted_indices] return predictions, test From 
c16fbe5f30c8e09902c0c7434ef26e03a28e5448 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 5 Dec 2016 12:58:29 +0530 Subject: [PATCH 05/20] FIX: making _fit_and_predict return classes --- sklearn/model_selection/_validation.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index c6f3ffa991aed..6fb458491a3b4 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -15,7 +15,6 @@ import warnings import numbers import time -import operator import numpy as np import scipy.sparse as sp @@ -404,9 +403,10 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, for train, test in cv_iter) # Concatenate the predictions - predictions = [pred_block_i for pred_block_i, _ in prediction_blocks] + predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks] test_indices = np.concatenate([indices_i - for _, indices_i in prediction_blocks]) + for _, indices_i, _ in prediction_blocks]) + classes = [classes_i for _, _, classes_i in prediction_blocks] if not _check_is_permutation(test_indices, _num_samples(X)): raise ValueError('cross_val_predict only works for partitions') @@ -477,15 +477,11 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) + + classes = [] if method is 'predict_proba' and hasattr(estimator, 'classes_'): classes = getattr(estimator, 'classes_') - class_map = dict() - for i in range(len(classes)): - class_map[i] = classes[i] - sorted_indices = [i for i, j in sorted(class_map.items(), - key=operator.itemgetter(1))] - predictions = predictions[:, sorted_indices] - return predictions, test + return predictions, test, classes def _check_is_permutation(indices, n_samples): From e53b8504c9ef3f9d9b289c2ae850719c948b30c5 Mon Sep 17 00:00:00 2001 From: 
Aman Dalmia Date: Tue, 6 Dec 2016 10:27:01 +0530 Subject: [PATCH 06/20] DOC: Added docstring for the returned classes --- sklearn/model_selection/_validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 6fb458491a3b4..d5f995c9b30f6 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -462,6 +462,10 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, test : array-like This is the value of the test parameter + + classes : array-like + Result of calling 'estimator.classes_' for estimators having the + `classes_` attribute """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} From 3f076e2ddce20ac560ca61f39b242b6d625691de Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 7 Dec 2016 09:32:56 +0530 Subject: [PATCH 07/20] FIX: added LabelBinarizer to ensure predict_proba returns prdictions of the current shape --- sklearn/model_selection/_validation.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index d5f995c9b30f6..d6c381333458c 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -20,6 +20,7 @@ import scipy.sparse as sp from ..base import is_classifier, clone +from ..preprocessing import LabelBinarizer from ..utils import indexable, check_random_state, safe_indexing from ..utils.fixes import astype from ..utils.validation import _is_arraylike, _num_samples @@ -403,10 +404,9 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, for train, test in cv_iter) # Concatenate the predictions - predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks] + predictions = [pred_block_i for pred_block_i, _ in prediction_blocks] test_indices = np.concatenate([indices_i 
- for _, indices_i, _ in prediction_blocks]) - classes = [classes_i for _, _, classes_i in prediction_blocks] + for _, indices_i in prediction_blocks]) if not _check_is_permutation(test_indices, _num_samples(X)): raise ValueError('cross_val_predict only works for partitions') @@ -462,16 +462,15 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, test : array-like This is the value of the test parameter - - classes : array-like - Result of calling 'estimator.classes_' for estimators having the - `classes_` attribute """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) + lb = LabelBinarizer() + lb.fit(y) + X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) @@ -482,10 +481,9 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, func = getattr(estimator, method) predictions = func(X_test) - classes = [] - if method is 'predict_proba' and hasattr(estimator, 'classes_'): - classes = getattr(estimator, 'classes_') - return predictions, test, classes + if method is 'predict_proba': + predictions = lb.transform(predictions) + return predictions, test def _check_is_permutation(indices, n_samples): From 69f9207bf68341e5db081109eca3018025996808 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 10 Dec 2016 14:56:24 +0530 Subject: [PATCH 08/20] FIX: fixed _fit_and_predict and added tests --- sklearn/model_selection/_validation.py | 19 ++++++++++++------- .../model_selection/tests/test_validation.py | 14 +++++++++++++- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index d6c381333458c..329f7308988a3 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -20,7 +20,6 @@ import scipy.sparse as sp from ..base 
import is_classifier, clone -from ..preprocessing import LabelBinarizer from ..utils import indexable, check_random_state, safe_indexing from ..utils.fixes import astype from ..utils.validation import _is_arraylike, _num_samples @@ -468,9 +467,6 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) - lb = LabelBinarizer() - lb.fit(y) - X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) @@ -480,9 +476,18 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) - - if method is 'predict_proba': - predictions = lb.transform(predictions) + if method in ['decision_function', 'predict_proba', 'predict_log_proba']: + true_classes = np.unique(y) + train_classes = np.unique(y_train) + predictions_ = np.zeros((X_test.shape[0], true_classes.shape[0])) + if method is 'decision_function' and len(train_classes) == 2: + class_predictions = estimator.predict(X_test) + for i, j in enumerate(class_predictions): + predictions_[i, j] = predictions[i] + else: + for i, j in enumerate(train_classes): + predictions_[:, j] = predictions[:, i] + predictions = predictions_ return predictions, test diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 31c5fc8257528..6b1b81350f772 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -914,6 +914,10 @@ def test_cross_val_predict_with_method(): X, y = shuffle(X, y, random_state=0) classes = len(set(y)) + y[:50] = 0 + y[50:100] = 1 + X[:100], y[:100] = shuffle(X[:100], y[:100], random_state=0) + kfold = KFold(len(iris.target)) methods = ['decision_function', 'predict_proba', 'predict_log_proba'] @@ -929,7 +933,15 @@ def 
test_cross_val_predict_with_method(): # Naive loop (should be same as cross_val_predict): for train, test in kfold.split(X, y): est.fit(X[train], y[train]) - expected_predictions[test] = func(X[test]) + train_classes = np.unique(y[train]) + expected_predictions_ = func(X[test]) + if method is 'decision_function' and len(train_classes) == 2: + expected_test_predictions = est.predict(X[test]) + for i, j in enumerate(expected_test_predictions): + expected_predictions[i, j] = expected_predictions_[i] + else: + for i, j in enumerate(train_classes): + expected_predictions[test, j] = expected_predictions_[:, i] predictions = cross_val_predict(est, X, y, method=method, cv=kfold) From db0c86163eb5b61be64b140ba478ed460192f987 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 12 Dec 2016 12:26:32 +0530 Subject: [PATCH 09/20] TST: added separate test case and removed from tests --- sklearn/model_selection/_validation.py | 11 ++--- .../model_selection/tests/test_validation.py | 41 +++++++++++++++---- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 329f7308988a3..430bd993d16c2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -476,17 +476,12 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) - if method in ['decision_function', 'predict_proba', 'predict_log_proba']: + if method in ['predict_proba', 'predict_log_proba']: true_classes = np.unique(y) train_classes = np.unique(y_train) predictions_ = np.zeros((X_test.shape[0], true_classes.shape[0])) - if method is 'decision_function' and len(train_classes) == 2: - class_predictions = estimator.predict(X_test) - for i, j in enumerate(class_predictions): - predictions_[i, j] = predictions[i] - else: - for i, j in enumerate(train_classes): - 
predictions_[:, j] = predictions[:, i] + for i, j in enumerate(train_classes): + predictions_[:, j] = predictions[:, i] predictions = predictions_ return predictions, test diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 6b1b81350f772..5c9d93e083089 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -914,13 +914,43 @@ def test_cross_val_predict_with_method(): X, y = shuffle(X, y, random_state=0) classes = len(set(y)) + kfold = KFold(len(iris.target)) + + methods = ['decision_function', 'predict_proba', 'predict_log_proba'] + for method in methods: + est = LogisticRegression() + + predictions = cross_val_predict(est, X, y, method=method) + assert_equal(len(predictions), len(y)) + + expected_predictions = np.zeros([len(y), classes]) + func = getattr(est, method) + + # Naive loop (should be same as cross_val_predict): + for train, test in kfold.split(X, y): + est.fit(X[train], y[train]) + expected_predictions[test] = func(X[test]) + + predictions = cross_val_predict(est, X, y, method=method, + cv=kfold) + assert_array_almost_equal(expected_predictions, predictions) + + +def test_cross_val_predict_corner_case(): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + classes = len(set(y)) + + # Modifies the dataset so that in a particular fold, produced by + # kfold.split, the training set is composed of only 2 classes instead of 3 y[:50] = 0 y[50:100] = 1 X[:100], y[:100] = shuffle(X[:100], y[:100], random_state=0) kfold = KFold(len(iris.target)) - methods = ['decision_function', 'predict_proba', 'predict_log_proba'] + methods = ['predict_proba', 'predict_log_proba'] for method in methods: est = LogisticRegression() @@ -935,13 +965,8 @@ def test_cross_val_predict_with_method(): est.fit(X[train], y[train]) train_classes = np.unique(y[train]) expected_predictions_ = func(X[test]) - if method is 
'decision_function' and len(train_classes) == 2: - expected_test_predictions = est.predict(X[test]) - for i, j in enumerate(expected_test_predictions): - expected_predictions[i, j] = expected_predictions_[i] - else: - for i, j in enumerate(train_classes): - expected_predictions[test, j] = expected_predictions_[:, i] + for i, j in enumerate(train_classes): + expected_predictions[test, j] = expected_predictions_[:, i] predictions = cross_val_predict(est, X, y, method=method, cv=kfold) From d0d0925e5509f82698d4d805acf89afa0c030b08 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Fri, 16 Dec 2016 03:56:45 +0530 Subject: [PATCH 10/20] FIX: improved _fit_and_predict for class labels --- sklearn/model_selection/_validation.py | 14 ++++++++++---- sklearn/model_selection/tests/test_validation.py | 10 +++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 430bd993d16c2..a685c58ea72a1 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -28,6 +28,7 @@ from ..metrics.scorer import check_scoring from ..exceptions import FitFailedWarning from ._split import check_cv +from ..preprocessing import LabelEncoder __all__ = ['cross_val_score', 'cross_val_predict', 'permutation_test_score', 'learning_curve', 'validation_curve'] @@ -394,6 +395,10 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, raise AttributeError('{} not implemented in estimator' .format(method)) + if method in ['decision_function', 'predict_proba', 'predict_log_proba']: + le = LabelEncoder() + y = le.fit_transform(y) + # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. 
parallel = Parallel(n_jobs=n_jobs, verbose=verbose, @@ -476,12 +481,13 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, estimator.fit(X_train, y_train, **fit_params) func = getattr(estimator, method) predictions = func(X_test) - if method in ['predict_proba', 'predict_log_proba']: + if method in ['decision_function', 'predict_proba', 'predict_log_proba']: true_classes = np.unique(y) - train_classes = np.unique(y_train) predictions_ = np.zeros((X_test.shape[0], true_classes.shape[0])) - for i, j in enumerate(train_classes): - predictions_[:, j] = predictions[:, i] + if method is 'decision_function' and len(estimator.classes_) == 2: + predictions_[:, estimator.classes_[-1]] = predictions + else: + predictions_[:, estimator.classes_] = predictions predictions = predictions_ return predictions, test diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 5c9d93e083089..2c083485c68ea 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -963,10 +963,14 @@ def test_cross_val_predict_corner_case(): # Naive loop (should be same as cross_val_predict): for train, test in kfold.split(X, y): est.fit(X[train], y[train]) - train_classes = np.unique(y[train]) expected_predictions_ = func(X[test]) - for i, j in enumerate(train_classes): - expected_predictions[test, j] = expected_predictions_[:, i] + # To avoid 2 dimensional indexing + exp_pred_test = np.zeros((len(test), classes)) + if method is 'decision_function' and len(est.classes_) == 2: + exp_pred_test[:, est.classes_[-1]] = expected_predictions_ + else: + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test predictions = cross_val_predict(est, X, y, method=method, cv=kfold) From 96ce58e3c76bbe77a8c2d7e8b5301122845eee86 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Sat, 17 Dec 2016 12:13:04 +0530 Subject: [PATCH 11/20] TST: 
added tests for different label types --- .../model_selection/tests/test_validation.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 2c083485c68ea..8a325658fde41 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -51,6 +51,7 @@ from sklearn.cluster import KMeans from sklearn.preprocessing import Imputer +from sklearn.preprocessing import LabelEncoder from sklearn.pipeline import Pipeline from sklearn.externals.six.moves import cStringIO as StringIO @@ -977,6 +978,83 @@ def test_cross_val_predict_corner_case(): assert_array_almost_equal(expected_predictions, predictions) +def test_cross_val_predict_different_label_types(): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + classes = len(set(y)) + + # unordered integer labels + y_ = np.empty(y.shape) + for i, c in zip(range(3), [-1, 4, 6]): + y_[np.where(y == i)] = c + + # Modifies the dataset so that in a particular fold, produced by + # kfold.split, the training set is composed of only 2 classes instead of 3 + y_[:50] = -1 + y_[50:100] = 4 + X[:100], y_[:100] = shuffle(X[:100], y_[:100], random_state=0) + + # string labels + y_str = np.empty(y.shape, dtype=object) + for i, c in zip(range(3), iris.target_names): + y_str[np.where(y == i)] = c + + # Modifies the dataset so that in a particular fold, produced by + # kfold.split, the training set is composed of only 2 classes instead of 3 + y_str[:50] = iris.target_names[0] + y_str[50:100] = iris.target_names[1] + X[:100], y_str[:100] = shuffle(X[:100], y_str[:100], random_state=0) + + kfold = KFold(len(iris.target)) + + le = LabelEncoder() + est = LogisticRegression() + + methods = ['decision_function', 'predict_proba', 'predict_log_proba'] + for method in methods: + + # Testing labels as unordered integers + predictions = 
cross_val_predict(est, X, y_, method=method) + assert_equal(len(predictions), len(y_)) + + predictions = cross_val_predict(est, X, y_, method=method, + cv=kfold) + + expected_predictions = np.zeros([len(y_), classes]) + func = getattr(est, method) + + # Transforming the class labels for passing to the estimator + y_ = le.fit_transform(y_) + + # Naive loop (should be same as cross_val_predict): + for train, test in kfold.split(X, y_): + est.fit(X[train], y_[train]) + expected_predictions[test] = func(X[test]) + + assert_array_almost_equal(expected_predictions, predictions) + + # Testing labels as strings + predictions = cross_val_predict(est, X, y_str, method=method) + assert_equal(len(predictions), len(y_str)) + + predictions = cross_val_predict(est, X, y_str, method=method, + cv=kfold) + + expected_predictions = np.zeros([len(y_str), classes]) + func = getattr(est, method) + + # Transforming the class labels for passing to the estimator + y_str = le.fit_transform(y_str) + + # Naive loop (should be same as cross_val_predict): + for train, test in kfold.split(X, y_str): + est.fit(X[train], y_str[train]) + expected_predictions[test] = func(X[test]) + + assert_array_almost_equal(expected_predictions, predictions) + + def test_score_memmap(): # Ensure a scalar score of memmap type is accepted iris = load_iris() From 978d3d717cc6f270de75811a3e5bd5200f225e55 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Mon, 19 Dec 2016 15:36:25 +0530 Subject: [PATCH 12/20] FIX: resolved errors --- .../model_selection/tests/test_validation.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 8a325658fde41..5940f4eef5a9b 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1006,7 +1006,7 @@ def test_cross_val_predict_different_label_types(): y_str[50:100] = iris.target_names[1] 
X[:100], y_str[:100] = shuffle(X[:100], y_str[:100], random_state=0) - kfold = KFold(len(iris.target)) + kfold = KFold(classes) le = LabelEncoder() est = LogisticRegression() @@ -1030,7 +1030,14 @@ def test_cross_val_predict_different_label_types(): # Naive loop (should be same as cross_val_predict): for train, test in kfold.split(X, y_): est.fit(X[train], y_[train]) - expected_predictions[test] = func(X[test]) + expected_predictions_ = func(X[test]) + # To avoid 2 dimensional indexing + exp_pred_test = np.zeros((len(test), classes)) + if method is 'decision_function' and len(est.classes_) == 2: + exp_pred_test[:, est.classes_[-1]] = expected_predictions_ + else: + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test assert_array_almost_equal(expected_predictions, predictions) @@ -1050,7 +1057,14 @@ def test_cross_val_predict_different_label_types(): # Naive loop (should be same as cross_val_predict): for train, test in kfold.split(X, y_str): est.fit(X[train], y_str[train]) - expected_predictions[test] = func(X[test]) + expected_predictions_ = func(X[test]) + # To avoid 2 dimensional indexing + exp_pred_test = np.zeros((len(test), classes)) + if method is 'decision_function' and len(est.classes_) == 2: + exp_pred_test[:, est.classes_[-1]] = expected_predictions_ + else: + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test assert_array_almost_equal(expected_predictions, predictions) From be449959444fe1da8fba076ee6382474e024291d Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Tue, 20 Dec 2016 18:14:11 +0530 Subject: [PATCH 13/20] FIX: replaced `is` with `==` for comparison --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index a685c58ea72a1..cf9b494b676eb 100644 --- a/sklearn/model_selection/_validation.py +++ 
b/sklearn/model_selection/_validation.py @@ -484,7 +484,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, if method in ['decision_function', 'predict_proba', 'predict_log_proba']: true_classes = np.unique(y) predictions_ = np.zeros((X_test.shape[0], true_classes.shape[0])) - if method is 'decision_function' and len(estimator.classes_) == 2: + if method == 'decision_function' and len(estimator.classes_) == 2: predictions_[:, estimator.classes_[-1]] = predictions else: predictions_[:, estimator.classes_] = predictions From 97c85c8010ed91b3ee2420e200dad07cf7e2679d Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Tue, 20 Dec 2016 18:17:56 +0530 Subject: [PATCH 14/20] FIX: update test name --- sklearn/model_selection/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 5940f4eef5a9b..22c3c248b05b2 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -937,7 +937,7 @@ def test_cross_val_predict_with_method(): assert_array_almost_equal(expected_predictions, predictions) -def test_cross_val_predict_corner_case(): +def test_cross_val_predict_class_subset(): iris = load_iris() X, y = iris.data, iris.target X, y = shuffle(X, y, random_state=0) From 4b8d1eb18e60dace31f4e9bb3d3e90afe8c8289b Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Wed, 21 Dec 2016 18:03:34 +0530 Subject: [PATCH 15/20] TST: improved the tests --- .../model_selection/tests/test_validation.py | 119 ++++++------------ 1 file changed, 38 insertions(+), 81 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 22c3c248b05b2..654cdd14a0f4d 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -936,33 +936,44 @@ def 
test_cross_val_predict_with_method(): cv=kfold) assert_array_almost_equal(expected_predictions, predictions) + # Test alternative representations of y + predictions_y1 = cross_val_predict(est, X, y+1, method=method, + cv=kfold) + assert_array_equal(predictions, predictions_y1) + + predictions_y2 = cross_val_predict(est, X, y-2, method=method, + cv=kfold) + assert_array_equal(predictions, predictions_y2) + + predictions_ystr = cross_val_predict(est, X, y.astype('U'), + method=method, cv=kfold) + assert_array_equal(predictions, predictions_ystr) + def test_cross_val_predict_class_subset(): - iris = load_iris() - X, y = iris.data, iris.target - X, y = shuffle(X, y, random_state=0) - classes = len(set(y)) - # Modifies the dataset so that in a particular fold, produced by - # kfold.split, the training set is composed of only 2 classes instead of 3 - y[:50] = 0 - y[50:100] = 1 - X[:100], y[:100] = shuffle(X[:100], y[:100], random_state=0) + X = np.arange(8).reshape(4, 2) + y = np.array([0, 0, 1, 2]) + classes = 3 - kfold = KFold(len(iris.target)) + kfold3 = KFold(n_splits=3) + kfold4 = KFold(n_splits=4) + + le = LabelEncoder() - methods = ['predict_proba', 'predict_log_proba'] + methods = ['decision_function', 'predict_proba', 'predict_log_proba'] for method in methods: est = LogisticRegression() - predictions = cross_val_predict(est, X, y, method=method) - assert_equal(len(predictions), len(y)) - expected_predictions = np.zeros([len(y), classes]) func = getattr(est, method) + # Test with n_splits=3 + predictions = cross_val_predict(est, X, y, method=method, + cv=kfold3) + # Naive loop (should be same as cross_val_predict): - for train, test in kfold.split(X, y): + for train, test in kfold3.split(X, y): est.fit(X[train], y[train]) expected_predictions_ = func(X[test]) # To avoid 2 dimensional indexing @@ -973,63 +984,15 @@ def test_cross_val_predict_class_subset(): exp_pred_test[:, est.classes_] = expected_predictions_ expected_predictions[test] = exp_pred_test - 
predictions = cross_val_predict(est, X, y, method=method, - cv=kfold) assert_array_almost_equal(expected_predictions, predictions) - -def test_cross_val_predict_different_label_types(): - iris = load_iris() - X, y = iris.data, iris.target - X, y = shuffle(X, y, random_state=0) - classes = len(set(y)) - - # unordered integer labels - y_ = np.empty(y.shape) - for i, c in zip(range(3), [-1, 4, 6]): - y_[np.where(y == i)] = c - - # Modifies the dataset so that in a particular fold, produced by - # kfold.split, the training set is composed of only 2 classes instead of 3 - y_[:50] = -1 - y_[50:100] = 4 - X[:100], y_[:100] = shuffle(X[:100], y_[:100], random_state=0) - - # string labels - y_str = np.empty(y.shape, dtype=object) - for i, c in zip(range(3), iris.target_names): - y_str[np.where(y == i)] = c - - # Modifies the dataset so that in a particular fold, produced by - # kfold.split, the training set is composed of only 2 classes instead of 3 - y_str[:50] = iris.target_names[0] - y_str[50:100] = iris.target_names[1] - X[:100], y_str[:100] = shuffle(X[:100], y_str[:100], random_state=0) - - kfold = KFold(classes) - - le = LabelEncoder() - est = LogisticRegression() - - methods = ['decision_function', 'predict_proba', 'predict_log_proba'] - for method in methods: - - # Testing labels as unordered integers - predictions = cross_val_predict(est, X, y_, method=method) - assert_equal(len(predictions), len(y_)) - - predictions = cross_val_predict(est, X, y_, method=method, - cv=kfold) - - expected_predictions = np.zeros([len(y_), classes]) - func = getattr(est, method) - - # Transforming the class labels for passing to the estimator - y_ = le.fit_transform(y_) + # Test with n_splits=4 + predictions = cross_val_predict(est, X, y, method=method, + cv=kfold4) # Naive loop (should be same as cross_val_predict): - for train, test in kfold.split(X, y_): - est.fit(X[train], y_[train]) + for train, test in kfold4.split(X, y): + est.fit(X[train], y[train]) expected_predictions_ = 
func(X[test]) # To avoid 2 dimensional indexing exp_pred_test = np.zeros((len(test), classes)) @@ -1041,22 +1004,16 @@ def test_cross_val_predict_different_label_types(): assert_array_almost_equal(expected_predictions, predictions) - # Testing labels as strings - predictions = cross_val_predict(est, X, y_str, method=method) - assert_equal(len(predictions), len(y_str)) - - predictions = cross_val_predict(est, X, y_str, method=method, - cv=kfold) - - expected_predictions = np.zeros([len(y_str), classes]) - func = getattr(est, method) + # Testing unordered labels + y = [1, 1, -4, 6] + predictions = cross_val_predict(est, X, y, method=method, + cv=kfold3) - # Transforming the class labels for passing to the estimator - y_str = le.fit_transform(y_str) + y = le.fit_transform(y) # Naive loop (should be same as cross_val_predict): - for train, test in kfold.split(X, y_str): - est.fit(X[train], y_str[train]) + for train, test in kfold3.split(X, y): + est.fit(X[train], y[train]) expected_predictions_ = func(X[test]) # To avoid 2 dimensional indexing exp_pred_test = np.zeros((len(test), classes)) From 1e2773c88253f7c85c74b4750ba0457304404105 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 22 Dec 2016 11:23:50 +0530 Subject: [PATCH 16/20] ENH: added function to expected predictions --- .../model_selection/tests/test_validation.py | 68 +++++++------------ 1 file changed, 26 insertions(+), 42 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 654cdd14a0f4d..9ffa60ba4ae37 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -950,6 +950,25 @@ def test_cross_val_predict_with_method(): assert_array_equal(predictions, predictions_ystr) +def get_expected_predictions(X, y, cv, classes, est, method): + + expected_predictions = np.zeros([len(y), classes]) + func = getattr(est, method) + + for train, test in cv.split(X, y): + 
est.fit(X[train], y[train]) + expected_predictions_ = func(X[test]) + # To avoid 2 dimensional indexing + exp_pred_test = np.zeros((len(test), classes)) + if method is 'decision_function' and len(est.classes_) == 2: + exp_pred_test[:, est.classes_[-1]] = expected_predictions_ + else: + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test + + return expected_predictions + + def test_cross_val_predict_class_subset(): X = np.arange(8).reshape(4, 2) @@ -965,64 +984,29 @@ def test_cross_val_predict_class_subset(): for method in methods: est = LogisticRegression() - expected_predictions = np.zeros([len(y), classes]) - func = getattr(est, method) - # Test with n_splits=3 predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) - # Naive loop (should be same as cross_val_predict): - for train, test in kfold3.split(X, y): - est.fit(X[train], y[train]) - expected_predictions_ = func(X[test]) - # To avoid 2 dimensional indexing - exp_pred_test = np.zeros((len(test), classes)) - if method is 'decision_function' and len(est.classes_) == 2: - exp_pred_test[:, est.classes_[-1]] = expected_predictions_ - else: - exp_pred_test[:, est.classes_] = expected_predictions_ - expected_predictions[test] = exp_pred_test - + # Runs a naive loop (should be same as cross_val_predict): + expected_predictions = get_expected_predictions(X, y, kfold3, classes, + est, method) assert_array_almost_equal(expected_predictions, predictions) # Test with n_splits=4 predictions = cross_val_predict(est, X, y, method=method, cv=kfold4) - - # Naive loop (should be same as cross_val_predict): - for train, test in kfold4.split(X, y): - est.fit(X[train], y[train]) - expected_predictions_ = func(X[test]) - # To avoid 2 dimensional indexing - exp_pred_test = np.zeros((len(test), classes)) - if method is 'decision_function' and len(est.classes_) == 2: - exp_pred_test[:, est.classes_[-1]] = expected_predictions_ - else: - exp_pred_test[:, est.classes_] = 
expected_predictions_ - expected_predictions[test] = exp_pred_test - + expected_predictions = get_expected_predictions(X, y, kfold4, classes, + est, method) assert_array_almost_equal(expected_predictions, predictions) # Testing unordered labels y = [1, 1, -4, 6] predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) - y = le.fit_transform(y) - - # Naive loop (should be same as cross_val_predict): - for train, test in kfold3.split(X, y): - est.fit(X[train], y[train]) - expected_predictions_ = func(X[test]) - # To avoid 2 dimensional indexing - exp_pred_test = np.zeros((len(test), classes)) - if method is 'decision_function' and len(est.classes_) == 2: - exp_pred_test[:, est.classes_[-1]] = expected_predictions_ - else: - exp_pred_test[:, est.classes_] = expected_predictions_ - expected_predictions[test] = exp_pred_test - + expected_predictions = get_expected_predictions(X, y, kfold3, classes, + est, method) assert_array_almost_equal(expected_predictions, predictions) From 920452262cce79558398e3e523fcd776ffd36361 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 29 Dec 2016 12:45:38 +0530 Subject: [PATCH 17/20] FIX: remove flake8 errors --- sklearn/model_selection/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 9ffa60ba4ae37..c074653b2d3b3 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -937,11 +937,11 @@ def test_cross_val_predict_with_method(): assert_array_almost_equal(expected_predictions, predictions) # Test alternative representations of y - predictions_y1 = cross_val_predict(est, X, y+1, method=method, + predictions_y1 = cross_val_predict(est, X, y + 1, method=method, cv=kfold) assert_array_equal(predictions, predictions_y1) - predictions_y2 = cross_val_predict(est, X, y-2, method=method, + predictions_y2 = 
cross_val_predict(est, X, y - 2, method=method, cv=kfold) assert_array_equal(predictions, predictions_y2) From fd21dceae632b30c709fc68bffc1e2bf90c43e98 Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 29 Dec 2016 12:58:50 +0530 Subject: [PATCH 18/20] FIX: replace unicode conversion with str conversion --- sklearn/model_selection/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index c074653b2d3b3..d81a12eb337d3 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -945,7 +945,7 @@ def test_cross_val_predict_with_method(): cv=kfold) assert_array_equal(predictions, predictions_y2) - predictions_ystr = cross_val_predict(est, X, y.astype('U'), + predictions_ystr = cross_val_predict(est, X, y.astype('str'), method=method, cv=kfold) assert_array_equal(predictions, predictions_ystr) From e8af5e03d5fa395b38f50180b94ec43c45ccc90b Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Thu, 5 Jan 2017 09:35:10 +0530 Subject: [PATCH 19/20] FIX: replaced 'true_classes' with 'n_classes --- sklearn/model_selection/_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index cf9b494b676eb..e8e33a4260f38 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -482,8 +482,8 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, func = getattr(estimator, method) predictions = func(X_test) if method in ['decision_function', 'predict_proba', 'predict_log_proba']: - true_classes = np.unique(y) - predictions_ = np.zeros((X_test.shape[0], true_classes.shape[0])) + n_classes = len(set(y)) + predictions_ = np.zeros((X_test.shape[0], n_classes)) if method == 'decision_function' and len(estimator.classes_) == 2: 
predictions_[:, estimator.classes_[-1]] = predictions else: From 933ef9bf6e3fb27b560bf0ed748415f7c59baf2e Mon Sep 17 00:00:00 2001 From: Aman Dalmia Date: Fri, 6 Jan 2017 21:51:55 +0530 Subject: [PATCH 20/20] DOC: added whats_new entry --- doc/whats_new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index cd6789bb1f805..367220a15d29b 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -73,6 +73,10 @@ Enhancements Bug fixes ......... + - :func:`model_selection.cross_val_predict` now returns output of the + correct shape for all values of the argument ``method``. + :issue:`7863` by :user:`Aman Dalmia <dalmia>`. + - Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not exactly implement Benjamini-Hochberg procedure. It formerly may have selected fewer features than it should.