8000 [MRG+2] add validation for non-empty input data by ogrisel · Pull Request #4214 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG+2] add validation for non-empty input data #4214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 14, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sklearn/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ def fit(self, X, y, sample_weight=None):

self.constant = check_array(self.constant,
accept_sparse=['csr', 'csc', 'coo'],
ensure_2d=False)
ensure_2d=False, ensure_min_samples=0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we keep this exception?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that self.constant is often a scalar. If I do not disable the ensure_min_samples check, it raises a TypeError because the number of samples is undefined in that case (see _num_samples).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I thought about making _num_samples return 1 for scalar / singleton arrays but I thought treating that edge-case that way might hide bugs. This is why I decided to do it that way. Let me know what you think.

Singleton arrays are weird beasts. I think they are a design mistake in numpy but I did not want this PR to change the behavior of the Dummy models.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks good as it is.


if self.output_2d_ and self.constant.shape[0] != y.shape[1]:
raise ValueError(
Expand Down
64 changes: 57 additions & 7 deletions sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from itertools import product

from sklearn.utils import as_float_array, check_array, check_symmetric
from sklearn.utils import check_X_y

from sklearn.utils.estimator_checks import NotAnArray

Expand All @@ -19,12 +20,12 @@
from sklearn.svm import SVR

from sklearn.datasets import make_blobs
from sklearn.utils import as_float_array, check_array
from sklearn.utils.estimator_checks import NotAnArray
from sklearn.utils.validation import (
NotFittedError,
has_fit_parameter,
check_is_fitted)
NotFittedError,
has_fit_parameter,
check_is_fitted)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

STY: One import per line?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. I did not intend to change those import lines as part of this PR but apparently my editor fixed the indenting without asking me :)


from sklearn.utils.testing import assert_raise_message


def test_as_float_array():
Expand Down Expand Up @@ -177,7 +178,7 @@ def test_check_array():
Xs = [X_csc, X_coo, X_dok, X_int, X_float]
accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
copys):
copys):
X_checked = check_array(X, dtype=dtype, accept_sparse=accept_sparse,
copy=copy)
if dtype is not None:
Expand Down Expand Up @@ -210,6 +211,55 @@ def test_check_array():
assert_true(isinstance(result, np.ndarray))


def test_check_array_min_samples_and_features_messages():
    """Check the error messages raised by the ``ensure_min_samples`` and
    ``ensure_min_features`` validation parameters of ``check_array`` and
    ``check_X_y``.

    Each assertion pins the exact ValueError / TypeError message so that
    the reported shape and threshold stay informative for users.
    """
    # empty list is considered 2D by default:
    msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_array, [])

    # If considered a 1D collection when ensure_2d=False, then the minimum
    # number of samples will break:
    msg = "0 sample(s) (shape=(0,)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_array, [], ensure_2d=False)

    # Invalid edge case when checking the default minimum sample of a scalar
    msg = "Singleton array array(42) cannot be considered a valid collection."
    assert_raise_message(TypeError, msg, check_array, 42, ensure_2d=False)

    # But this works if the input data is forced to look like a 2d array with
    # one sample and one feature:
    X_checked = check_array(42, ensure_2d=True)
    assert_array_equal(np.array([[42]]), X_checked)

    # Simulate a model that would need at least 2 samples to be well defined
    X = np.ones((1, 10))
    y = np.ones(1)
    msg = "1 sample(s) (shape=(1, 10)) while a minimum of 2 is required."
    assert_raise_message(ValueError, msg, check_X_y, X, y,
                         ensure_min_samples=2)

    # Simulate a model that would require at least 3 features (e.g. SelectKBest
    # with k=3)
    X = np.ones((10, 2))
    y = np.ones(2)
    msg = "2 feature(s) (shape=(10, 2)) while a minimum of 3 is required."
    assert_raise_message(ValueError, msg, check_X_y, X, y,
                         ensure_min_features=3)

    # Simulate a case where a pipeline stage has trimmed all the features of a
    # 2D dataset.
    X = np.empty(0).reshape(10, 0)
    y = np.ones(10)
    msg = "0 feature(s) (shape=(10, 0)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_X_y, X, y)

    # nd-data is not checked for any minimum number of features by default:
    X = np.ones((10, 0, 28, 28))
    y = np.ones(10)
    X_checked, y_checked = check_X_y(X, y, allow_nd=True)
    assert_array_equal(X, X_checked)
    assert_array_equal(y, y_checked)


def test_has_fit_parameter():
assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight"))
assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight"))
Expand Down Expand Up @@ -274,6 +324,6 @@ def test_check_is_fitted():

ard.fit(*make_blobs())
svr.fit(*make_blobs())

assert_equal(None, check_is_fitted(ard, "coef_"))
assert_equal(None, check_is_fitted(svr, "support_"))
57 changes: 50 additions & 7 deletions sklearn/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,13 @@ def _num_samples(x):
x = np.asarray(x)
else:
raise TypeError("Expected sequence or array-like, got %r" % x)
return x.shape[0] if hasattr(x, 'shape') else len(x)
if hasattr(x, 'shape'):
if len(x.shape) == 0:
raise TypeError("Singleton array %r cannot be considered"
" a valid collection." % x)
return x.shape[0]
else:
return len(x)


def check_consistent_length(*arrays):
Expand Down Expand Up @@ -222,10 +228,11 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, order, copy,


def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
force_all_finite=True, ensure_2d=True, allow_nd=False):
force_all_finite=True, ensure_2d=True, allow_nd=False,
ensure_min_samples=1, ensure_min_features=1):
"""Input validation on an array, list, sparse matrix or similar.

By default, the input is converted to an at least 2nd numpy array.
By default, the input is converted to an at least 2d numpy array.

Parameters
----------
Expand Down Expand Up @@ -257,6 +264,16 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
allow_nd : boolean (default=False)
Whether to allow X.ndim > 2.

ensure_min_samples : int (default=1)
Make sure that the array has a minimum number of samples in its first
axis (rows for a 2D array). Setting to 0 disables this check.

ensure_min_features : int (default=1)
Make sure that the 2D array has some minimum number of features
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when ``ensure_2d`` is True and
``allow_nd`` is False. Setting to 0 disables this check.

Returns
-------
X_converted : object
Expand All @@ -278,12 +295,26 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
if force_all_finite:
_assert_all_finite(array)

if ensure_min_samples > 0:
n_samples = _num_samples(array)
if n_samples < ensure_min_samples:
raise ValueError("Found array with %d sample(s) (shape=%r) while a"
" minimum of %d is required."
% (n_samples, array.shape, ensure_min_samples))

if ensure_min_features > 0 and ensure_2d and not allow_nd:
n_features = array.shape[1]
if n_features < ensure_min_features:
raise ValueError("Found array with %d feature(s) (shape=%r) while"
" a minimum of %d is required."
% (n_features, array.shape, ensure_min_features))
return array


def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
force_all_finite=True, ensure_2d=True, allow_nd=False,
multi_output=False):
multi_output=False, ensure_min_samples=1,
ensure_min_features=1):
"""Input validation for standard estimators.

Checks X and y for consistent length, enforces X 2d and y 1d.
Expand Down Expand Up @@ -327,13 +358,24 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
Whether to allow 2-d y (array or sparse matrix). If false, y will be
validated as a vector.

ensure_min_samples : int (default=1)
Make sure that X has a minimum number of samples in its first
axis (rows for a 2D array).

ensure_min_features : int (default=1)
Make sure that the 2D X has some minimum number of features
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when ``ensure_2d`` is True and
``allow_nd`` is False.

Returns
-------
X_converted : object
The converted and validated X.
"""
X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
ensure_2d, allow_nd)
ensure_2d, allow_nd, ensure_min_samples,
ensure_min_features)
if multi_output:
y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False)
else:
Expand All @@ -353,7 +395,7 @@ def column_or_1d(y, warn=False):
y : array-like

warn : boolean, default False
To control display of warnings.
To control display of warnings.

Returns
-------
Expand Down Expand Up @@ -406,6 +448,7 @@ def check_random_state(seed):
raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
' instance' % seed)


def has_fit_parameter(estimator, parameter):
"""Checks whether the estimator's fit method supports the given parameter.

Expand Down Expand Up @@ -512,4 +555,4 @@ def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
attributes = [attributes]

if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
raise NotFittedError(msg % {'name' : type(estimator).__name__})
raise NotFittedError(msg % {'name': type(estimator).__name__})
0