[MRG + 1] Fix the cross_val_predict function for method='predict_proba' by dalmia · Pull Request #7889 · scikit-learn/scikit-learn · GitHub

[MRG + 1] Fix the cross_val_predict function for method='predict_proba' #7889


Merged · 20 commits · Jan 7, 2017
4 changes: 4 additions & 0 deletions doc/whats_new.rst
@@ -73,6 +73,10 @@ Enhancements
Bug fixes
.........

- :func:`model_selection.cross_val_predict` now returns output of the
correct shape for all values of the argument ``method``.
:issue:`7863` by :user:`Aman Dalmia <dalmia>`.

- Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not
  exactly implement the Benjamini-Hochberg procedure. It may formerly have
  selected fewer features than it should have.
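To illustrate the entry above, here is a minimal usage sketch (written for this writeup, not code from the PR). It assumes the iris dataset and an unshuffled 3-fold split, so every training fold is missing one of the three classes:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_predict

X, y = load_iris(return_X_y=True)

# iris is ordered by class, so an unshuffled 3-fold split trains each
# fold on only two of the three classes.
proba = cross_val_predict(LogisticRegression(), X, y,
                          cv=KFold(n_splits=3), method='predict_proba')

# With the fix the result always has shape (n_samples, n_classes), with
# columns ordered by the sorted class labels; classes a fold never saw
# get probability 0 in that fold's rows.
print(proba.shape)  # (150, 3)

Previously each fold returned only the columns for the classes it was trained on, so the stacked output could be misshapen or misaligned across folds.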
17 changes: 16 additions & 1 deletion sklearn/model_selection/_validation.py
@@ -28,6 +28,7 @@
from ..metrics.scorer import check_scoring
from ..exceptions import FitFailedWarning
from ._split import check_cv
from ..preprocessing import LabelEncoder

__all__ = ['cross_val_score', 'cross_val_predict', 'permutation_test_score',
'learning_curve', 'validation_curve']
@@ -365,7 +366,9 @@ def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
as in '2*n_jobs'

method : string, optional, default: 'predict'
Invokes the passed method name of the passed estimator.
Invokes the passed method name of the passed estimator. For
method='predict_proba', the columns correspond to the classes
in sorted order.

Returns
-------
@@ -392,6 +395,10 @@
raise AttributeError('{} not implemented in estimator'
.format(method))

if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
    le = LabelEncoder()
    y = le.fit_transform(y)
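For context, a standalone sketch (not part of the diff) of what this LabelEncoder pass does: arbitrary labels are mapped to the integers 0..n_classes-1 in sorted order, which is what later allows estimator.classes_ to be used directly as column indices in _fit_and_predict.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Labels are encoded by their position in the sorted unique labels
print(le.fit_transform([1, 1, -4, 6]))  # [1 1 0 2]
print(le.classes_)                      # [-4  1  6]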

# We clone the estimator to make sure that all the folds are
# independent, and that it is pickle-able.
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
@@ -474,6 +481,14 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params,
estimator.fit(X_train, y_train, **fit_params)
func = getattr(estimator, method)
predictions = func(X_test)
if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
Member: I wonder if we should be inspecting the shape of the prediction, rather than doing this based on name.

Contributor Author: The problem is that predict_proba and predict_log_proba return a 2D array when there are two classes in the training set, but decision_function returns a 1D array. So I chose to go by method name to differentiate the two.
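A quick sketch of the shape difference described in that comment (illustrative; any binary classifier exposing both methods behaves the same way):

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.arange(8).reshape(4, 2)
y = np.array([0, 0, 1, 1])

clf = LogisticRegression().fit(X, y)
print(clf.predict_proba(X).shape)      # (4, 2): one column per class
print(clf.decision_function(X).shape)  # (4,): 1D for binary problems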

    n_classes = len(set(y))
    predictions_ = np.zeros((X_test.shape[0], n_classes))
    if method == 'decision_function' and len(estimator.classes_) == 2:
        predictions_[:, estimator.classes_[-1]] = predictions
    else:
        predictions_[:, estimator.classes_] = predictions
    predictions = predictions_
return predictions, test
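To make the padding step above concrete, here is a standalone numpy sketch (the class indices and probabilities are made up for illustration). Suppose the full problem has three encoded classes but this fold's estimator only saw classes 0 and 2:

import numpy as np

n_classes = 3                        # classes present in the full y
fold_classes = np.array([0, 2])      # estimator.classes_ for this fold
fold_proba = np.array([[0.9, 0.1],   # predict_proba output: one column
                       [0.3, 0.7]])  # per class seen during training

# Scatter the fold's columns into a full-width array; classes the fold
# never saw keep probability 0.
padded = np.zeros((fold_proba.shape[0], n_classes))
padded[:, fold_classes] = fold_proba
print(padded)
# [[0.9 0.  0.1]
#  [0.3 0.  0.7]]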


74 changes: 74 additions & 0 deletions sklearn/model_selection/tests/test_validation.py
@@ -51,6 +51,7 @@
from sklearn.cluster import KMeans

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

from sklearn.externals.six.moves import cStringIO as StringIO
@@ -935,6 +936,79 @@ def test_cross_val_predict_with_method():
                                        cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test alternative representations of y
        predictions_y1 = cross_val_predict(est, X, y + 1, method=method,
                                           cv=kfold)
        assert_array_equal(predictions, predictions_y1)

        predictions_y2 = cross_val_predict(est, X, y - 2, method=method,
                                           cv=kfold)
        assert_array_equal(predictions, predictions_y2)

        predictions_ystr = cross_val_predict(est, X, y.astype('str'),
                                             method=method, cv=kfold)
        assert_array_equal(predictions, predictions_ystr)


def get_expected_predictions(X, y, cv, classes, est, method):

    expected_predictions = np.zeros([len(y), classes])
    func = getattr(est, method)

    for train, test in cv.split(X, y):
        est.fit(X[train], y[train])
        expected_predictions_ = func(X[test])
        # To avoid 2 dimensional indexing
        exp_pred_test = np.zeros((len(test), classes))
        if method == 'decision_function' and len(est.classes_) == 2:
            exp_pred_test[:, est.classes_[-1]] = expected_predictions_
        else:
            exp_pred_test[:, est.classes_] = expected_predictions_
        expected_predictions[test] = exp_pred_test

    return expected_predictions


def test_cross_val_predict_class_subset():

    X = np.arange(8).reshape(4, 2)
    y = np.array([0, 0, 1, 2])
    classes = 3

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)

    le = LabelEncoder()

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        # Test with n_splits=3
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)

        # Runs a naive loop (should be same as cross_val_predict):
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test with n_splits=4
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold4)
        expected_predictions = get_expected_predictions(X, y, kfold4, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels
        y = [1, 1, -4, 6]
Member: Do you want to explicitly test a use case where classes_ is guaranteed not to be sorted, but cross_val_predict returns output in sorted order?

Contributor Author: As mentioned here, classes_ should indeed be sorted. Please have a look.

Member: Fair enough! Thanks for checking :)

Member: But wait, could we have a mock estimator that does not have classes_ sorted? @jnothman Is it worth having such a test?

Member: Never mind, this has been discussed before. Sorry for not checking... +1 for merge with the whatsnew entry...
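The claim that classes_ comes back sorted is easy to check directly (an illustrative sketch; scikit-learn classifiers effectively derive classes_ from the sorted unique training labels):

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.arange(8).reshape(4, 2)
y = np.array([1, 1, -4, 6])  # deliberately unordered labels

clf = LogisticRegression().fit(X, y)
print(clf.classes_)  # [-4  1  6], sorted regardless of order in y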

        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)
        y = le.fit_transform(y)
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)


def test_score_memmap():
# Ensure a scalar score of memmap type is accepted