ENH Checks n_features_in_ in preprocessing module by thomasjpfan · Pull Request #18577 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Checks n_features_in_ in preprocessing module #18577

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Oct 21, 2020
Merged
67 changes: 30 additions & 37 deletions sklearn/preprocessing/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,8 +432,8 @@ def transform(self, X):
"""
check_is_fitted(self)

X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES,
force_all_finite="allow-nan")
X = self._validate_data(X, copy=self.copy, dtype=FLOAT_DTYPES,
force_all_finite="allow-nan", reset=False)

X *= self.scale_
X += self.min_
Expand Down Expand Up @@ -760,9 +760,10 @@ def partial_fit(self, X, y=None, sample_weight=None):
self : object
Fitted scaler.
"""
first_call = not hasattr(self, "n_samples_seen_")
X = self._validate_data(X, accept_sparse=('csr', 'csc'),
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')
force_all_finite='allow-nan', reset=first_call)

if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X,
Expand Down Expand Up @@ -1097,9 +1098,10 @@ def transform(self, X):
Transformed array.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')
X = self._validate_data(X, accept_sparse=('csr', 'csc'),
copy=self.copy, reset=False,
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')

if sparse.issparse(X):
inplace_column_scale(X, 1.0 / self.scale_)
Expand Down Expand Up @@ -1398,9 +1400,10 @@ def transform(self, X):
Transformed array.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
estimator=self, dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')
X = self._validate_data(X, accept_sparse=('csr', 'csc'),
copy=self.copy, estimator=self,
dtype=FLOAT_DTYPES, reset=False,
force_all_finite='allow-nan')

if sparse.issparse(X):
if self.with_scaling:
Expand Down Expand Up @@ -1735,8 +1738,8 @@ def transform(self, X):
"""
check_is_fitted(self)

X = check_array(X, order='F', dtype=FLOAT_DTYPES,
accept_sparse=('csr', 'csc'))
X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False,
accept_sparse=('csr', 'csc'))

n_samples, n_features = X.shape

Expand Down Expand Up @@ -2038,7 +2041,7 @@ def transform(self, X, copy=None):
Transformed array.
"""
copy = copy if copy is not None else self.copy
X = check_array(X, accept_sparse='csr')
X = self._validate_data(X, accept_sparse='csr', reset=False)
return normalize(X, norm=self.norm, axis=1, copy=copy)

def _more_tags(self):
Expand Down Expand Up @@ -2195,7 +2198,11 @@ def transform(self, X, copy=None):
Transformed array.
"""
copy = copy if copy is not None else self.copy
return binarize(X, threshold=self.threshold, copy=copy)
# TODO: This should be refactored because binarize also calls
# check_array
X = self._validate_data(X, accept_sparse=['csr', 'csc'], copy=copy,
reset=False)
return binarize(X, threshold=self.threshold, copy=False)

def _more_tags(self):
return {'stateless': True}
Expand Down Expand Up @@ -2291,7 +2298,7 @@ def transform(self, K, copy=True):
"""
check_is_fitted(self)

K = check_array(K, copy=copy, dtype=FLOAT_DTYPES)
K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False)

K_pred_cols = (np.sum(K, axis=1) /
self.K_fit_rows_.shape[0])[:, np.newaxis]
Expand Down Expand Up @@ -2689,16 +2696,7 @@ def _transform_col(self, X_col, quantiles, inverse):
def _check_inputs(self, X, in_fit, accept_sparse_negative=False,
copy=False):
"""Check inputs before fit and transform."""
# In theory reset should be equal to `in_fit`, but there are tests
# checking the input number of feature and they expect a specific
# string, which is not the same one raised by check_n_features. So we
# don't check n_features_in_ here for now (it's done with adhoc code in
# the estimator anyway).
# TODO: set reset=in_fit when addressing reset in
# predict/transform/etc.
reset = True

X = self._validate_data(X, reset=reset,
X = self._validate_data(X, reset=in_fit,
accept_sparse='csc', copy=copy,
dtype=FLOAT_DTYPES,
force_all_finite='allow-nan')
Expand All @@ -2718,16 +2716,6 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False,

return X

def _check_is_fitted(self, X):
"""Check the inputs before transforming."""
check_is_fitted(self)
# check that the dimension of X are adequate with the fitted data
if X.shape[1] != self.quantiles_.shape[1]:
raise ValueError('X does not have the same number of features as'
' the previously fitted data. Got {} instead of'
' {}.'.format(X.shape[1],
self.quantiles_.shape[1]))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch!


def _transform(self, X, inverse=False):
"""Forward and inverse transform.

Expand Down Expand Up @@ -2777,8 +2765,8 @@ def transform(self, X):
Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
The projected data.
"""
check_is_fitted(self)
X = self._check_inputs(X, in_fit=False, copy=self.copy)
self._check_is_fitted(X)

return self._transform(X, inverse=False)

Expand All @@ -2798,9 +2786,9 @@ def inverse_transform(self, X):
Xt : {ndarray, sparse matrix} of (n_samples, n_features)
The projected data.
"""
check_is_fitted(self)
X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True,
copy=self.copy)
self._check_is_fitted(X)

return self._transform(X, inverse=True)

Expand Down Expand Up @@ -3262,6 +3250,10 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False,
----------
X : array-like of shape (n_samples, n_features)

in_fit : bool
Whether or not `_check_input` is called from `fit` or other
methods, e.g. `predict`, `transform`, etc.

check_positive : bool, default=False
If True, check that all data is positive and non-zero (only if
``self.method=='box-cox'``).
Expand All @@ -3273,7 +3265,8 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False,
If True, check that the transformation method is valid.
"""
X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES,
copy=self.copy, force_all_finite='allow-nan')
copy=self.copy, force_all_finite='allow-nan',
reset=in_fit)

with np.warnings.catch_warnings():
np.warnings.filterwarnings(
Expand Down
7 changes: 1 addition & 6 deletions sklearn/preprocessing/_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,12 +289,7 @@ def transform(self, X):

# check input and attribute dtypes
dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
Xt = check_array(X, copy=True, dtype=dtype)

n_features = self.n_bins_.shape[0]
if Xt.shape[1] != n_features:
raise ValueError("Incorrect number of features. Expecting {}, "
"received {}.".format(n_features, Xt.shape[1]))
Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)

bin_edges = self.bin_edges_
for jj in range(Xt.shape[1]):
Expand Down
11 changes: 4 additions & 7 deletions sklearn/preprocessing/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,12 +1310,8 @@ def test_quantile_transform_check_error():

X_bad_feat = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
[0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]])
err_msg = ("X does not have the same number of features as the previously"
" fitted " "data. Got 2 instead of 3.")
with pytest.raises(ValueError, match=err_msg):
transformer.transform(X_bad_feat)
err_msg = ("X does not have the same number of features "
"as the previously fitted data. Got 2 instead of 3.")
err_msg = ("X has 2 features, but QuantileTransformer is expecting "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we remove this check since we have a common check?

"3 features as input.")
with pytest.raises(ValueError, match=err_msg):
transformer.inverse_transform(X_bad_feat)

Expand Down Expand Up @@ -2434,7 +2430,8 @@ def test_power_transformer_shape_exception(method):

# Exceptions should be raised for arrays with different num_columns
# than during fitting
wrong_shape_message = 'Input data has a different number of features'
wrong_shape_message = (r"X has \d+ features, but PowerTransformer is "
r"expecting \d+ features")

with pytest.raises(ValueError, match=wrong_shape_message):
pt.transform(X[:, 0:1])
Expand Down
8 changes: 0 additions & 8 deletions sklearn/preprocessing/tests/test_discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,6 @@ def test_fit_transform_n_bins_array(strategy, expected):
assert bin_edges.shape == (n_bins + 1, )


def test_invalid_n_features():
est = KBinsDiscretizer(n_bins=3).fit(X)
bad_X = np.arange(25).reshape(5, -1)
err_msg = "Incorrect number of features. Expecting 4, received 5"
with pytest.raises(ValueError, match=err_msg):
est.transform(bad_X)


@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_same_min_max(strategy):
warnings.simplefilter("always")
Expand Down
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,6 @@ def test_search_cv(estimator, check, request):
'naive_bayes',
'neighbors',
'pipeline',
'preprocessing',
'random_projection',
'semi_supervised',
'svm',
Expand Down
0