[MRG+2] Fix empty input data common checks by ogrisel · Pull Request #4245 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG+2] Fix empty input data common checks #4245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,9 @@ API changes summary
- Estimators will treat input with dtype object as numeric when possible.
By `Andreas Müller`_


- Estimators now raise `ValueError` consistently when fitted on empty
data (less than 1 sample or less than 1 feature for 2D input).
By `Olivier Grisel`_.

.. _changes_0_15_2:

Expand Down
8 changes: 3 additions & 5 deletions sklearn/ensemble/forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,8 @@ def fit(self, X, y, sample_weight=None):
self : object
Returns self.
"""
# Convert data
# ensure_2d=False because there are actually unit test checking we fail
# for 1d. FIXME make this consistent in the future.
X = check_array(X, dtype=DTYPE, ensure_2d=False, accept_sparse="csc")
# Validate or convert input data
X = check_array(X, dtype=DTYPE, accept_sparse="csc")
if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@amueller I removed ensure_2d=False here and no unit test check failed contrary to what the FIXME inline comment says. Do remember which tests used to fail?

This change is technically not required for this PR. I can revert it if you prefer to deal with this case in another PR such as the one tackling #4252 for instance.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was a tree-specific test.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or rather forest-specific.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It no longer fails, apparently.

# ensemble sorts the indices.
Expand All @@ -207,7 +205,7 @@ def fit(self, X, y, sample_weight=None):
if y.ndim == 2 and y.shape[1] == 1:
warn("A column-vector y was passed when a 1d array was"
" expected. Please change the shape of y to "
"(n_samples, ), for example using ravel().",
"(n_samples,), for example using ravel().",
DataConversionWarning, stacklevel=2)

if y.ndim == 1:
Expand Down
5 changes: 2 additions & 3 deletions sklearn/kernel_approximation.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,10 +437,8 @@ def fit(self, X, y=None):
X : array-like, shape=(n_samples, n_feature)
Training data.
"""

X = check_array(X, accept_sparse='csr')
rnd = check_random_state(self.random_state)
if not sp.issparse(X):
X = np.asarray(X)
n_samples = X.shape[0]

# get basis vectors
Expand Down Expand Up @@ -487,6 +485,7 @@ def transform(self, X):
Transformed data.
"""
check_is_fitted(self, 'components_')
X = check_array(X, accept_sparse='csr')

kernel_params = self._get_kernel_params()
embedded = pairwise_kernels(X, self.components_,
Expand Down
2 changes: 2 additions & 0 deletions sklearn/linear_model/coordinate_descent.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,8 @@ def fit(self, X, y):
Target values
"""
y = np.asarray(y, dtype=np.float64)
if y.shape[0] == 0:
raise ValueError("y has 0 samples: %r" % y)

if hasattr(self, 'l1_ratio'):
model_str = 'ElasticNet'
Expand Down
6 changes: 6 additions & 0 deletions sklearn/preprocessing/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ..utils.fixes import in1d
from ..utils import deprecated, column_or_1d
from ..utils.validation import check_array
from ..utils.validation import _num_samples
from ..utils.multiclass import unique_labels
from ..utils.multiclass import type_of_target

Expand Down Expand Up @@ -315,6 +316,8 @@ def fit(self, y):
if 'multioutput' in self.y_type_:
raise ValueError("Multioutput target data is not supported with "
"label binarization")
if _num_samples(y) == 0:
raise ValueError('y has 0 samples: %r' % y)

self.sparse_input_ = sp.issparse(y)
self.classes_ = unique_labels(y)
Expand Down Expand Up @@ -465,6 +468,9 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
# XXX Workaround that will be removed when list of list format is
# dropped
y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None)
else:
if _num_samples(y) == 0:
raise ValueError('y has 0 samples: %r' % y)
if neg_label >= pos_label:
raise ValueError("neg_label={0} must be strictly less than "
"pos_label={1}.".format(neg_label, pos_label))
Expand Down
5 changes: 5 additions & 0 deletions sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
check_regressors_pickle,
check_transformer_pickle,
check_transformers_unfitted,
check_estimators_empty_data_messages,
check_estimators_nan_inf,
check_estimators_unfitted,
check_classifiers_one_label,
Expand Down Expand Up @@ -99,6 +100,10 @@ def test_non_meta_estimators():
yield check_fit_score_takes_y, name, Estimator
yield check_dtype_object, name, Estimator

# Check that all estimator yield informative messages when
# trained on empty datasets
yield check_estimators_empty_data_messages, name, Estimator

if name not in CROSS_DECOMPOSITION + ['SpectralEmbedding']:
# SpectralEmbedding is non-deterministic,
# see issue #4236
Expand Down
19 changes: 19 additions & 0 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from sklearn.externals.six.moves import zip
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
Expand Down Expand Up @@ -346,6 +347,24 @@ def check_estimators_dtypes(name, Estimator):
pass


def check_estimators_empty_data_messages(name, Estimator):
    """Common check: fitting on empty data must raise a ValueError."""
    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator, 1)

    # Zero-sample case: the exact wording depends on whether X or y is
    # validated first, so only the exception type is asserted here.
    X_no_samples = np.empty(0).reshape(0, 3)
    assert_raises(ValueError, estimator.fit, X_no_samples, [])

    # Zero-feature case: the y below is valid for both classifiers and
    # regressors, and unsupervised estimators simply ignore it.
    X_no_features = np.empty(0).reshape(3, 0)
    y = multioutput_estimator_convert_y_2d(name, np.array([1, 0, 1]))
    msg = "0 feature(s) (shape=(3, 0)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, estimator.fit, X_no_features, y)


def check_estimators_nan_inf(name, Estimator):
rnd = np.random.RandomState(0)
X_train_finite = rnd.uniform(size=(10, 3))
Expand Down
10 changes: 10 additions & 0 deletions sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ def test_check_array_min_samples_and_features_messages():
assert_raise_message(ValueError, msg, check_X_y, X, y,
ensure_min_samples=2)

# The same message is raised if the data has 2 dimensions even if this is
# not mandatory
assert_raise_message(ValueError, msg, check_X_y, X, y,
ensure_min_samples=2, ensure_2d=False)

# Simulate a model that would require at least 3 features (e.g. SelectKBest
# with k=3)
X = np.ones((10, 2))
Expand All @@ -245,6 +250,11 @@ def test_check_array_min_samples_and_features_messages():
assert_raise_message(ValueError, msg, check_X_y, X, y,
ensure_min_features=3)

# Only the feature check is enabled whenever the number of dimensions is 2
# even if allow_nd is enabled:
assert_raise_message(ValueError, msg, check_X_y, X, y,
ensure_min_features=3, allow_nd=True)

# Simulate a case where a pipeline stage as trimmed all the features of a
# 2D dataset.
X = np.empty(0).reshape(10, 0)
Expand Down
22 changes: 14 additions & 8 deletions sklearn/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,9 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None, copy=Fal
ensure_min_features : int (default=1)
Make sure that the 2D array has some minimum number of features
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when ``ensure_2d`` is True and
``allow_nd`` is False. Setting to 0 disables this check.
This check is only enforced when the input data has effectively 2
dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
disables this check.

Returns
-------
Expand Down Expand Up @@ -347,7 +348,8 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None, copy=Fal
" minimum of %d is required."
% (n_samples, shape_repr, ensure_min_samples))

if ensure_min_features > 0 and ensure_2d and not allow_nd:

if ensure_min_features > 0 and array.ndim == 2:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this modification tested?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a bunch of common checks that would fail otherwise. I can add a couple of unittest in test_validation.py to make it more explicit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done in 7d7b95a

n_features = array.shape[1]
if n_features < ensure_min_features:
raise ValueError("Found array with %d feature(s) (shape=%s) while"
Expand Down Expand Up @@ -411,13 +413,16 @@ def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None, copy=False,
axis (rows for a 2D array).

ensure_min_features : int (default=1)
Make sure that the 2D X has some minimum number of features
Make sure that the 2D array has some minimum number of features
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when ``ensure_2d`` is True and
``allow_nd`` is False.
This check is only enforced when X has effectively 2 dimensions or
is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
this check.

y_numeric : boolean (default=False)
Whether to ensure that y has a numeric type. If dtype of y is object,
it is converted to float64. Should only be used for regression algorithms.
it is converted to float64. Should only be used for regression
algorithms.

Returns
-------
Expand All @@ -428,7 +433,8 @@ def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None, copy=False,
ensure_2d, allow_nd, ensure_min_samples,
ensure_min_features)
if multi_output:
y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, dtype=None)
y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
dtype=None)
else:
y = column_or_1d(y, warn=True)
_assert_all_finite(y)
Expand Down
0