-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
[MRG + 1] Fix the cross_val_predict function for method='predict_proba' #7889
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3e139f1
ab85f94
e835854
295a1e7
c16fbe5
e53b850
3f076e2
69f9207
db0c861
d0d0925
96ce58e
978d3d7
be44995
97c85c8
4b8d1eb
1e2773c
9204522
fd21dce
e8af5e0
933ef9b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,7 @@ | |
from sklearn.cluster import KMeans | ||
|
||
from sklearn.preprocessing import Imputer | ||
from sklearn.preprocessing import LabelEncoder | ||
from sklearn.pipeline import Pipeline | ||
|
||
from sklearn.externals.six.moves import cStringIO as StringIO | ||
|
@@ -935,6 +936,79 @@ def test_cross_val_predict_with_method(): | |
cv=kfold) | ||
assert_array_almost_equal(expected_predictions, predictions) | ||
|
||
# Test alternative representations of y | ||
predictions_y1 = cross_val_predict(est, X, y + 1, method=method, | ||
cv=kfold) | ||
assert_array_equal(predictions, predictions_y1) | ||
|
||
predictions_y2 = cross_val_predict(est, X, y - 2, method=method, | ||
cv=kfold) | ||
assert_array_equal(predictions, predictions_y2) | ||
|
||
predictions_ystr = cross_val_predict(est, X, y.astype('str'), | ||
method=method, cv=kfold) | ||
assert_array_equal(predictions, predictions_ystr) | ||
|
||
|
||
def get_expected_predictions(X, y, cv, classes, est, method):
    """Compute reference cross-validated predictions with a naive loop.

    For every ``(train, test)`` split produced by ``cv.split(X, y)`` the
    estimator is refit on the training rows and ``method`` is called on the
    test rows.  The per-fold output is scattered into a zero-initialised
    array with one column per class, so classes absent from a fold's
    training set keep a prediction of 0.

    Parameters
    ----------
    X : ndarray
        Samples; indexed with the integer index arrays yielded by ``cv``.
    y : ndarray
        Targets.  ``est.classes_`` is used directly as column indices, so
        the labels are assumed to be integers in ``[0, classes)``.
    cv : object
        Splitter exposing ``split(X, y)``.
    classes : int
        Total number of classes, i.e. the number of output columns.
    est : object
        Estimator exposing ``fit``, ``classes_`` (after fitting) and the
        prediction method named by ``method``.
    method : str
        Name of the prediction method to call on ``est``.

    Returns
    -------
    ndarray of shape (len(y), classes)
        The assembled per-class predictions.
    """
    expected_predictions = np.zeros([len(y), classes])
    func = getattr(est, method)

    for train, test in cv.split(X, y):
        est.fit(X[train], y[train])
        expected_predictions_ = func(X[test])
        # To avoid 2 dimensional indexing
        exp_pred_test = np.zeros((len(test), classes))
        # Compare strings by value: ``is`` only worked by accident of
        # literal interning and fails for equal-but-distinct str objects.
        if method == 'decision_function' and len(est.classes_) == 2:
            # With two training classes the output is a single 1D column;
            # store it in the column of the last (second) class.
            exp_pred_test[:, est.classes_[-1]] = expected_predictions_
        else:
            exp_pred_test[:, est.classes_] = expected_predictions_
        expected_predictions[test] = exp_pred_test

    return expected_predictions
|
||
|
||
def test_cross_val_predict_class_subset():
    """Check cross_val_predict against a naive loop on a tiny dataset.

    The data has 4 samples and 3 classes, two of which occur only once, so
    holding out one of those samples leaves its class out of the training
    data.  For each prediction method the output of ``cross_val_predict``
    is compared with ``get_expected_predictions`` using 3 and 4 splits, and
    then once more with unordered, non-contiguous labels.
    """
    X = np.arange(8).reshape(4, 2)
    y = np.array([0, 0, 1, 2])
    classes = 3

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)

    le = LabelEncoder()

    # Unordered labels get their own names so that ``y`` is not rebound
    # inside the loop; otherwise every method after the first would run the
    # n_splits=3/4 checks on the encoded unordered targets instead of ``y``.
    y_unordered = [1, 1, -4, 6]
    # The naive loop indexes columns by label, so it needs the labels
    # encoded as 0..classes-1.
    y_encoded = le.fit_transform(y_unordered)

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        # Test with n_splits=3
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)

        # Runs a naive loop (should be same as cross_val_predict):
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test with n_splits=4
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold4)
        expected_predictions = get_expected_predictions(X, y, kfold4, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels
        predictions = cross_val_predict(est, X, y_unordered, method=method,
                                        cv=kfold3)
        expected_predictions = get_expected_predictions(X, y_encoded, kfold3,
                                                        classes, est, method)
        assert_array_almost_equal(expected_predictions, predictions)
|
||
|
||
def test_score_memmap(): | ||
# Ensure a scalar score of memmap type is accepted | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we should be inspecting the shape of the prediction, rather than doing this based on name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The problem is that `predict_proba` and `predict_log_proba` return a 2D array when there are 2 classes in the training set, but `decision_function` returns a 1D array. So, I chose to go by method name to differentiate the two.