-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[MRG+2] add validation for non-empty input data #4214
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
from itertools import product | ||
|
||
from sklearn.utils import as_float_array, check_array, check_symmetric | ||
from sklearn.utils import check_X_y | ||
|
||
from sklearn.utils.estimator_checks import NotAnArray | ||
|
||
|
@@ -19,12 +20,12 @@ | |
from sklearn.svm import SVR | ||
|
||
from sklearn.datasets import make_blobs | ||
from sklearn.utils import as_float_array, check_array | ||
from sklearn.utils.estimator_checks import NotAnArray | ||
from sklearn.utils.validation import ( | ||
NotFittedError, | ||
has_fit_parameter, | ||
check_is_fitted) | ||
NotFittedError, | ||
10000 | has_fit_parameter, | |
check_is_fitted) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. STY: One import per line? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. I did not intend to change those import lines as part of this PR but apparently my editor fixed the indenting without asking me :) |
||
|
||
from sklearn.utils.testing import assert_raise_message | ||
|
||
|
||
def test_as_float_array(): | ||
|
@@ -177,7 +178,7 @@ def test_check_array(): | |
Xs = [X_csc, X_coo, X_dok, X_int, X_float] | ||
accept_sparses = [['csr', 'coo'], ['coo', 'dok']] | ||
for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses, | ||
copys): | ||
copys): | ||
X_checked = check_array(X, dtype=dtype, accept_sparse=accept_sparse, | ||
copy=copy) | ||
if dtype is not None: | ||
|
@@ -210,6 +211,55 @@ def test_check_array(): | |
assert_true(isinstance(result, np.ndarray)) | ||
|
||
|
||
def test_check_array_min_samples_and_features_messages(): | ||
# empty list is considered 2D by default: | ||
msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required." | ||
assert_raise_message(ValueError, msg, check_array, []) | ||
|
||
# If considered a 1D collection when ensure_2d=False, then the minimum | ||
# number of samples will break: | ||
msg = "0 sample(s) (shape=(0,)) while a minimum of 1 is required." | ||
assert_raise_message(ValueError, msg, check_array, [], ensure_2d=False) | ||
|
||
# Invalid edge case when checking the default minimum sample of a scalar | ||
msg = "Singleton array array(42) cannot be considered a valid collection." | ||
assert_raise_message(TypeError, msg, check_array, 42, ensure_2d=False) | ||
|
||
# But this works if the input data is forced to look like a 2 array with | ||
# one sample and one feature: | ||
X_checked = check_array(42, ensure_2d=True) | ||
assert_array_equal(np.array([[42]]), X_checked) | ||
|
||
# Simulate a model that would need at least 2 samples to be well defined | ||
X = np.ones((1, 10)) | ||
y = np.ones(1) | ||
msg = "1 sample(s) (shape=(1, 10)) while a minimum of 2 is required." | ||
assert_raise_message(ValueError, msg, check_X_y, X, y, | ||
ensure_min_samples=2) | ||
|
||
# Simulate a model that would require at least 3 features (e.g. SelectKBest | ||
# with k=3) | ||
X = np.ones((10, 2)) | ||
y = np.ones(2) | ||
msg = "2 feature(s) (shape=(10, 2)) while a minimum of 3 is required." | ||
assert_raise_message(ValueError, msg, check_X_y, X, y, | ||
ensure_min_features=3) | ||
|
||
# Simulate a case where a pipeline stage as trimmed all the features of a | ||
# 2D dataset. | ||
X = np.empty(0).reshape(10, 0) | ||
y = np.ones(10) | ||
msg = "0 feature(s) (shape=(10, 0)) while a minimum of 1 is required." | ||
assert_raise_message(ValueError, msg, check_X_y, X, y) | ||
|
||
# nd-data is not checked for any minimum number of features by default: | ||
X = np.ones((10, 0, 28, 28)) | ||
y = np.ones(10) | ||
X_checked, y_checked = check_X_y(X, y, allow_nd=True) | ||
assert_array_equal(X, X_checked) | ||
assert_array_equal(y, y_checked) | ||
|
||
|
||
def test_has_fit_parameter(): | ||
assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight")) | ||
assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight")) | ||
|
@@ -274,6 +324,6 @@ def test_check_is_fitted(): | |
|
||
ard.fit(*make_blobs()) | ||
svr.fit(*make_blobs()) | ||
|
||
assert_equal(None, check_is_fitted(ard, "coef_")) | ||
assert_equal(None, check_is_fitted(svr, "support_")) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we keep this exception?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The problem is that
self.constant
is often a scalar. If I don not disable theensure_min_samples
check it raises aTypeError
because the number of samples is undefined in that case (see_num_samples
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW, I thought about making
_num_samples
return 1 for scalar / singleton arrays but I thought treating that edge-case that way might hide bugs. This is why I decided to do it that way. Let me know what you think.Singleton arrays are weird beasts. I think they are a design mistake in numpy but I did not want this PR to change the behavior of the
Dummy
models.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks good as it is.