diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index 9f6a07a664e10..2bb4fdb94ca53 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -435,7 +435,7 @@ def fit(self, X, y, sample_weight=None):
 
             self.constant = check_array(self.constant,
                                         accept_sparse=['csr', 'csc', 'coo'],
-                                        ensure_2d=False)
+                                        ensure_2d=False, ensure_min_samples=0)
 
             if self.output_2d_ and self.constant.shape[0] != y.shape[1]:
                 raise ValueError(
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 91d43bc482e4c..affeae899c254 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -8,6 +8,7 @@
 from itertools import product
 
 from sklearn.utils import as_float_array, check_array, check_symmetric
+from sklearn.utils import check_X_y
 from sklearn.utils.estimator_checks import NotAnArray
 
 
@@ -19,12 +20,12 @@
 from sklearn.svm import SVR
 from sklearn.datasets import make_blobs
 
-from sklearn.utils import as_float_array, check_array
-from sklearn.utils.estimator_checks import NotAnArray
 from sklearn.utils.validation import (
-        NotFittedError,
-        has_fit_parameter,
-        check_is_fitted)
+    NotFittedError,
+    has_fit_parameter,
+    check_is_fitted)
+
+from sklearn.utils.testing import assert_raise_message
 
 
 def test_as_float_array():
@@ -177,7 +178,7 @@ def test_check_array():
     Xs = [X_csc, X_coo, X_dok, X_int, X_float]
     accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
     for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
-                                                  copys):
+                                                 copys):
         X_checked = check_array(X, dtype=dtype, accept_sparse=accept_sparse,
                                 copy=copy)
         if dtype is not None:
@@ -210,6 +211,55 @@ def test_check_array():
     assert_true(isinstance(result, np.ndarray))
 
 
+def test_check_array_min_samples_and_features_messages():
+    # empty list is considered 2D by default:
+    msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."
+    assert_raise_message(ValueError, msg, check_array, [])
+
+    # If considered a 1D collection when ensure_2d=False, then the minimum
+    # number of samples will break:
+    msg = "0 sample(s) (shape=(0,)) while a minimum of 1 is required."
+    assert_raise_message(ValueError, msg, check_array, [], ensure_2d=False)
+
+    # Invalid edge case when checking the default minimum sample of a scalar
+    msg = "Singleton array array(42) cannot be considered a valid collection."
+    assert_raise_message(TypeError, msg, check_array, 42, ensure_2d=False)
+
+    # But this works if the input data is forced to look like a 2d array with
+    # one sample and one feature:
+    X_checked = check_array(42, ensure_2d=True)
+    assert_array_equal(np.array([[42]]), X_checked)
+
+    # Simulate a model that would need at least 2 samples to be well defined
+    X = np.ones((1, 10))
+    y = np.ones(1)
+    msg = "1 sample(s) (shape=(1, 10)) while a minimum of 2 is required."
+    assert_raise_message(ValueError, msg, check_X_y, X, y,
+                         ensure_min_samples=2)
+
+    # Simulate a model that would require at least 3 features (e.g. SelectKBest
+    # with k=3)
+    X = np.ones((10, 2))
+    y = np.ones(2)
+    msg = "2 feature(s) (shape=(10, 2)) while a minimum of 3 is required."
+    assert_raise_message(ValueError, msg, check_X_y, X, y,
+                         ensure_min_features=3)
+
+    # Simulate a case where a pipeline stage has trimmed all the features of a
+    # 2D dataset.
+    X = np.empty(0).reshape(10, 0)
+    y = np.ones(10)
+    msg = "0 feature(s) (shape=(10, 0)) while a minimum of 1 is required."
+    assert_raise_message(ValueError, msg, check_X_y, X, y)
+
+    # nd-data is not checked for any minimum number of features by default:
+    X = np.ones((10, 0, 28, 28))
+    y = np.ones(10)
+    X_checked, y_checked = check_X_y(X, y, allow_nd=True)
+    assert_array_equal(X, X_checked)
+    assert_array_equal(y, y_checked)
+
+
 def test_has_fit_parameter():
     assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight"))
     assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight"))
@@ -274,6 +324,6 @@ def test_check_is_fitted():
 
     ard.fit(*make_blobs())
     svr.fit(*make_blobs())
-    
+
     assert_equal(None, check_is_fitted(ard, "coef_"))
     assert_equal(None, check_is_fitted(svr, "support_"))
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 34e829c34338e..cc954b1ff51b0 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -110,7 +110,13 @@ def _num_samples(x):
             x = np.asarray(x)
         else:
             raise TypeError("Expected sequence or array-like, got %r" % x)
-    return x.shape[0] if hasattr(x, 'shape') else len(x)
+    if hasattr(x, 'shape'):
+        if len(x.shape) == 0:
+            raise TypeError("Singleton array %r cannot be considered"
+                            " a valid collection." % x)
+        return x.shape[0]
+    else:
+        return len(x)
 
 
 def check_consistent_length(*arrays):
@@ -222,10 +228,11 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, order, copy,
 
 
 def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
-                force_all_finite=True, ensure_2d=True, allow_nd=False):
+                force_all_finite=True, ensure_2d=True, allow_nd=False,
+                ensure_min_samples=1, ensure_min_features=1):
     """Input validation on an array, list, sparse matrix or similar.
 
-    By default, the input is converted to an at least 2nd numpy array.
+    By default, the input is converted to an at least 2d numpy array.
 
     Parameters
     ----------
@@ -257,6 +264,16 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
     allow_nd : boolean (default=False)
         Whether to allow X.ndim > 2.
 
+    ensure_min_samples : int (default=1)
+        Make sure that the array has a minimum number of samples in its first
+        axis (rows for a 2D array). Setting to 0 disables this check.
+
+    ensure_min_features : int (default=1)
+        Make sure that the 2D array has some minimum number of features
+        (columns). The default value of 1 rejects empty datasets.
+        This check is only enforced when ``ensure_2d`` is True and
+        ``allow_nd`` is False. Setting to 0 disables this check.
+
     Returns
     -------
     X_converted : object
@@ -278,12 +295,26 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False,
         if force_all_finite:
             _assert_all_finite(array)
 
+    if ensure_min_samples > 0:
+        n_samples = _num_samples(array)
+        if n_samples < ensure_min_samples:
+            raise ValueError("Found array with %d sample(s) (shape=%r) while a"
+                             " minimum of %d is required."
+                             % (n_samples, array.shape, ensure_min_samples))
+
+    if ensure_min_features > 0 and ensure_2d and not allow_nd:
+        n_features = array.shape[1]
+        if n_features < ensure_min_features:
+            raise ValueError("Found array with %d feature(s) (shape=%r) while"
+                             " a minimum of %d is required."
+                             % (n_features, array.shape, ensure_min_features))
     return array
 
 
 def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
               force_all_finite=True, ensure_2d=True, allow_nd=False,
-              multi_output=False):
+              multi_output=False, ensure_min_samples=1,
+              ensure_min_features=1):
     """Input validation for standard estimators.
 
     Checks X and y for consistent length, enforces X 2d and y 1d.
@@ -327,13 +358,24 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False,
         Whether to allow 2-d y (array or sparse matrix). If false, y will be
         validated as a vector.
 
+    ensure_min_samples : int (default=1)
+        Make sure that X has a minimum number of samples in its first
+        axis (rows for a 2D array).
+
+    ensure_min_features : int (default=1)
+        Make sure that the 2D X has some minimum number of features
+        (columns). The default value of 1 rejects empty datasets.
+        This check is only enforced when ``ensure_2d`` is True and
+        ``allow_nd`` is False.
+
     Returns
     -------
     X_converted : object
         The converted and validated X.
     """
     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
-                    ensure_2d, allow_nd)
+                    ensure_2d, allow_nd, ensure_min_samples,
+                    ensure_min_features)
     if multi_output:
         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False)
     else:
@@ -353,7 +395,7 @@ def column_or_1d(y, warn=False):
     y : array-like
 
     warn : boolean, default False
-       To control display of warnings. 
+       To control display of warnings.
 
     Returns
     -------
@@ -406,6 +448,7 @@ def check_random_state(seed):
     raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                      ' instance' % seed)
 
+
 def has_fit_parameter(estimator, parameter):
     """Checks whether the estimator's fit method supports the given
     parameter.
@@ -512,4 +555,4 @@ def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
         attributes = [attributes]
 
     if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
-        raise NotFittedError(msg % {'name' : type(estimator).__name__})
+        raise NotFittedError(msg % {'name': type(estimator).__name__})
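
For reviewers, a short usage sketch of the behaviour this patch introduces. It is illustrative only and not part of the diff; it assumes the check_array/check_X_y keywords added above and the error messages asserted in test_check_array_min_samples_and_features_messages.

    import numpy as np
    from sklearn.utils import check_array, check_X_y

    # A scalar is only accepted when it can be promoted to a 2d array with one
    # sample and one feature; as a bare 1D collection it is rejected.
    print(check_array(42, ensure_2d=True))   # [[42]]
    try:
        check_array(42, ensure_2d=False)
    except TypeError as exc:
        print(exc)  # Singleton array array(42) cannot be considered ...

    # An estimator needing at least two samples can now fail early with a
    # readable message instead of a downstream shape error.
    X, y = np.ones((1, 10)), np.ones(1)
    try:
        check_X_y(X, y, ensure_min_samples=2)
    except ValueError as exc:
        print(exc)  # Found array with 1 sample(s) ... minimum of 2 is required.

    # ensure_min_samples=0 opts out of the sample-count check; the dummy.py
    # hunk above presumably relies on this so that a user-supplied constant
    # (possibly a scalar) is not rejected as an empty collection.
    check_array(np.array(1.0), ensure_2d=False, ensure_min_samples=0)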