diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index c6d44665e72bf..2ce000a207163 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -432,8 +432,8 @@ def transform(self, X): """ check_is_fitted(self) - X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + X = self._validate_data(X, copy=self.copy, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan", reset=False) X *= self.scale_ X += self.min_ @@ -760,9 +760,10 @@ def partial_fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ + first_call = not hasattr(self, "n_samples_seen_") X = self._validate_data(X, accept_sparse=('csr', 'csc'), estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + force_all_finite='allow-nan', reset=first_call) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -1097,9 +1098,10 @@ def transform(self, X): Transformed array. """ check_is_fitted(self) - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + copy=self.copy, reset=False, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): inplace_column_scale(X, 1.0 / self.scale_) @@ -1398,9 +1400,10 @@ def transform(self, X): Transformed array. """ check_is_fitted(self) - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + copy=self.copy, estimator=self, + dtype=FLOAT_DTYPES, reset=False, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_scaling: @@ -1735,8 +1738,8 @@ def transform(self, X): """ check_is_fitted(self) - X = check_array(X, order='F', dtype=FLOAT_DTYPES, - accept_sparse=('csr', 'csc')) + X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, + accept_sparse=('csr', 'csc')) n_samples, n_features = X.shape @@ -2038,7 +2041,7 @@ def transform(self, X, copy=None): Transformed array. """ copy = copy if copy is not None else self.copy - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr', reset=False) return normalize(X, norm=self.norm, axis=1, copy=copy) def _more_tags(self): @@ -2195,7 +2198,11 @@ def transform(self, X, copy=None): Transformed array. """ copy = copy if copy is not None else self.copy - return binarize(X, threshold=self.threshold, copy=copy) + # TODO: This should be refactored because binarize also calls + # check_array + X = self._validate_data(X, accept_sparse=['csr', 'csc'], copy=copy, + reset=False) + return binarize(X, threshold=self.threshold, copy=False) def _more_tags(self): return {'stateless': True} @@ -2291,7 +2298,7 @@ def transform(self, K, copy=True): """ check_is_fitted(self) - K = check_array(K, copy=copy, dtype=FLOAT_DTYPES) + K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False) K_pred_cols = (np.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, np.newaxis] @@ -2689,16 +2696,7 @@ def _transform_col(self, X_col, quantiles, inverse): def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): """Check inputs before fit and transform.""" - # In theory reset should be equal to `in_fit`, but there are tests - # checking the input number of feature and they expect a specific - # string, which is not the same one raised by check_n_features. So we - # don't check n_features_in_ here for now (it's done with adhoc code in - # the estimator anyway). - # TODO: set reset=in_fit when addressing reset in - # predict/transform/etc. - reset = True - - X = self._validate_data(X, reset=reset, + X = self._validate_data(X, reset=in_fit, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -2718,16 +2716,6 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False, return X - def _check_is_fitted(self, X): - """Check the inputs before transforming.""" - check_is_fitted(self) - # check that the dimension of X are adequate with the fitted data - if X.shape[1] != self.quantiles_.shape[1]: - raise ValueError('X does not have the same number of features as' - ' the previously fitted data. Got {} instead of' - ' {}.'.format(X.shape[1], - self.quantiles_.shape[1])) - def _transform(self, X, inverse=False): """Forward and inverse transform. @@ -2777,8 +2765,8 @@ def transform(self, X): Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) The projected data. """ + check_is_fitted(self) X = self._check_inputs(X, in_fit=False, copy=self.copy) - self._check_is_fitted(X) return self._transform(X, inverse=False) @@ -2798,9 +2786,9 @@ def inverse_transform(self, X): Xt : {ndarray, sparse matrix} of (n_samples, n_features) The projected data. """ + check_is_fitted(self) X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, copy=self.copy) - self._check_is_fitted(X) return self._transform(X, inverse=True) @@ -3262,6 +3250,10 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, ---------- X : array-like of shape (n_samples, n_features) + in_fit : bool + Whether or not `_check_input` is called from `fit` or other + methods, e.g. `predict`, `transform`, etc. + check_positive : bool, default=False If True, check that all data is positive and non-zero (only if ``self.method=='box-cox'``). @@ -3273,7 +3265,8 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, If True, check that the transformation method is valid. """ X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, - copy=self.copy, force_all_finite='allow-nan') + copy=self.copy, force_all_finite='allow-nan', + reset=in_fit) with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 0dddebbf2823c..a628533ac13d0 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -289,12 +289,7 @@ def transform(self, X): # check input and attribute dtypes dtype = (np.float64, np.float32) if self.dtype is None else self.dtype - Xt = check_array(X, copy=True, dtype=dtype) - - n_features = self.n_bins_.shape[0] - if Xt.shape[1] != n_features: - raise ValueError("Incorrect number of features. Expecting {}, " - "received {}.".format(n_features, Xt.shape[1])) + Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False) bin_edges = self.bin_edges_ for jj in range(Xt.shape[1]): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a25809c3d514b..b4ffbe2ce40f6 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1310,12 +1310,8 @@ def test_quantile_transform_check_error(): X_bad_feat = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) - err_msg = ("X does not have the same number of features as the previously" - " fitted " "data. Got 2 instead of 3.") - with pytest.raises(ValueError, match=err_msg): - transformer.transform(X_bad_feat) - err_msg = ("X does not have the same number of features " - "as the previously fitted data. Got 2 instead of 3.") + err_msg = ("X has 2 features, but QuantileTransformer is expecting " + "3 features as input.") with pytest.raises(ValueError, match=err_msg): transformer.inverse_transform(X_bad_feat) @@ -2434,7 +2430,8 @@ def test_power_transformer_shape_exception(method): # Exceptions should be raised for arrays with different num_columns # than during fitting - wrong_shape_message = 'Input data has a different number of features' + wrong_shape_message = (r"X has \d+ features, but PowerTransformer is " + r"expecting \d+ features") with pytest.raises(ValueError, match=wrong_shape_message): pt.transform(X[:, 0:1]) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index fa25a125d9349..9d607c82d5831 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -101,14 +101,6 @@ def test_fit_transform_n_bins_array(strategy, expected): assert bin_edges.shape == (n_bins + 1, ) -def test_invalid_n_features(): - est = KBinsDiscretizer(n_bins=3).fit(X) - bad_X = np.arange(25).reshape(5, -1) - err_msg = "Incorrect number of features. Expecting 4, received 5" - with pytest.raises(ValueError, match=err_msg): - est.transform(bad_X) - - @pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile']) def test_same_min_max(strategy): warnings.simplefilter("always") diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 46129b1ec084e..53e4702265d50 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -352,7 +352,6 @@ def test_search_cv(estimator, check, request): 'naive_bayes', 'neighbors', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', 'svm',