From 439703308052c5ce772d6deece3632becda7538b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 8 Oct 2020 20:31:48 -0400 Subject: [PATCH 1/9] ENH Checks n_features_in_ in preprocessing module --- sklearn/preprocessing/_data.py | 63 ++++++++----------- sklearn/preprocessing/_discretization.py | 7 +-- sklearn/preprocessing/tests/test_data.py | 40 +++++++++--- .../tests/test_discretization.py | 3 +- sklearn/tests/test_common.py | 1 - 5 files changed, 60 insertions(+), 54 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index c6d44665e72bf..35b059153d316 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -432,8 +432,8 @@ def transform(self, X): """ check_is_fitted(self) - X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + X = self._validate_data(X, copy=self.copy, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan", reset=False) X *= self.scale_ X += self.min_ @@ -760,9 +760,10 @@ def partial_fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ + first_call = not hasattr(self, "n_samples_seen_") X = self._validate_data(X, accept_sparse=('csr', 'csc'), estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + force_all_finite='allow-nan', reset=first_call) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -1097,9 +1098,10 @@ def transform(self, X): Transformed array. """ check_is_fitted(self) - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + copy=self.copy, reset=False, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): inplace_column_scale(X, 1.0 / self.scale_) @@ -1398,9 +1400,10 @@ def transform(self, X): Transformed array. """ check_is_fitted(self) - X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + copy=self.copy, estimator=self, + dtype=FLOAT_DTYPES, reset=False, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_scaling: @@ -1735,8 +1738,8 @@ def transform(self, X): """ check_is_fitted(self) - X = check_array(X, order='F', dtype=FLOAT_DTYPES, - accept_sparse=('csr', 'csc')) + X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, + accept_sparse=('csr', 'csc')) n_samples, n_features = X.shape @@ -2038,7 +2041,7 @@ def transform(self, X, copy=None): Transformed array. """ copy = copy if copy is not None else self.copy - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr', reset=False) return normalize(X, norm=self.norm, axis=1, copy=copy) def _more_tags(self): @@ -2195,7 +2198,11 @@ def transform(self, X, copy=None): Transformed array. 
""" copy = copy if copy is not None else self.copy - return binarize(X, threshold=self.threshold, copy=copy) + # TODO: This should be refactored because binarize also calls + # check_array + X = self._validate_data(X, accept_sparse=['csr', 'csc'], copy=copy, + reset=False) + return binarize(X, threshold=self.threshold, copy=False) def _more_tags(self): return {'stateless': True} @@ -2291,7 +2298,7 @@ def transform(self, K, copy=True): """ check_is_fitted(self) - K = check_array(K, copy=copy, dtype=FLOAT_DTYPES) + K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False) K_pred_cols = (np.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, np.newaxis] @@ -2689,16 +2696,7 @@ def _transform_col(self, X_col, quantiles, inverse): def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): """Check inputs before fit and transform.""" - # In theory reset should be equal to `in_fit`, but there are tests - # checking the input number of feature and they expect a specific - # string, which is not the same one raised by check_n_features. So we - # don't check n_features_in_ here for now (it's done with adhoc code in - # the estimator anyway). - # TODO: set reset=in_fit when addressing reset in - # predict/transform/etc. - reset = True - - X = self._validate_data(X, reset=reset, + X = self._validate_data(X, reset=in_fit, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -2718,16 +2716,6 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False, return X - def _check_is_fitted(self, X): - """Check the inputs before transforming.""" - check_is_fitted(self) - # check that the dimension of X are adequate with the fitted data - if X.shape[1] != self.quantiles_.shape[1]: - raise ValueError('X does not have the same number of features as' - ' the previously fitted data. Got {} instead of' - ' {}.'.format(X.shape[1], - self.quantiles_.shape[1])) - def _transform(self, X, inverse=False): """Forward and inverse transform. @@ -2777,8 +2765,8 @@ def transform(self, X): Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) The projected data. """ + check_is_fitted(self) X = self._check_inputs(X, in_fit=False, copy=self.copy) - self._check_is_fitted(X) return self._transform(X, inverse=False) @@ -2798,9 +2786,9 @@ def inverse_transform(self, X): Xt : {ndarray, sparse matrix} of (n_samples, n_features) The projected data. """ + check_is_fitted(self) X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, copy=self.copy) - self._check_is_fitted(X) return self._transform(X, inverse=True) @@ -3273,7 +3261,8 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, If True, check that the transformation method is valid. """ X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, - copy=self.copy, force_all_finite='allow-nan') + copy=self.copy, force_all_finite='allow-nan', + reset=in_fit) with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 0dddebbf2823c..a628533ac13d0 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -289,12 +289,7 @@ def transform(self, X): # check input and attribute dtypes dtype = (np.float64, np.float32) if self.dtype is None else self.dtype - Xt = check_array(X, copy=True, dtype=dtype) - - n_features = self.n_bins_.shape[0] - if Xt.shape[1] != n_features: - raise ValueError("Incorrect number of features. 
Expecting {}, " - "received {}.".format(n_features, Xt.shape[1])) + Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False) bin_edges = self.bin_edges_ for jj in range(Xt.shape[1]): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a25809c3d514b..17e85c89266c1 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1310,12 +1310,10 @@ def test_quantile_transform_check_error(): X_bad_feat = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) - err_msg = ("X does not have the same number of features as the previously" - " fitted " "data. Got 2 instead of 3.") + err_msg = ("X has 2 features, but QuantileTransformer is expecting " + "3 features as input.") with pytest.raises(ValueError, match=err_msg): transformer.transform(X_bad_feat) - err_msg = ("X does not have the same number of features " - "as the previously fitted data. Got 2 instead of 3.") with pytest.raises(ValueError, match=err_msg): transformer.inverse_transform(X_bad_feat) @@ -1919,7 +1917,8 @@ def test_maxabs_scaler_partial_fit(): n_samples_seen=scaler_incr.n_samples_seen_) -def test_normalizer_l1(): +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +def test_normalizer_norms(norm): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) X_sparse_unpruned = sparse.csr_matrix(X_dense) @@ -1939,11 +1938,13 @@ def test_normalizer_l1(): for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='l1', copy=True) + normalizer.n_features_in_ = X.shape[1] X_norm = normalizer.transform(X) assert X_norm is not X X_norm1 = toarray(X_norm) normalizer = Normalizer(norm='l1', copy=False) + normalizer.n_features_in_ = X.shape[1] X_norm = normalizer.transform(X) assert X_norm is X X_norm2 = toarray(X_norm) @@ -1957,7 +1958,9 @@ def test_normalizer_l1(): # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) - X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) + normalizer = Normalizer(norm='l2', copy=False) + normalizer.n_features_in_ = X.shape[1] + X_norm = normalizer.transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) @@ -1988,11 +1991,13 @@ def test_normalizer_l2(): for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='l2', copy=True) + normalizer.n_features_in_ = X.shape[1] X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm='l2', copy=False) + normalizer.n_features_in_ = X.shape[1] X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) @@ -2005,7 +2010,9 @@ def test_normalizer_l2(): # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) - X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) + normalizer = Normalizer(norm='l2', copy=False) + normalizer.n_features_in_ = X.shape[1] + X_norm = normalizer.transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) @@ -2036,11 +2043,13 @@ def test_normalizer_max(): for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='max', copy=True) + normalizer.n_features_in_ = X.shape[1] X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm='max', copy=False) 
+ normalizer.n_features_in_ = X.shape[1] X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) @@ -2054,7 +2063,9 @@ def test_normalizer_max(): # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) - X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) + normalizer = Normalizer(norm='l2', copy=False) + normalizer.n_features_in_ = X.shape[1] + X_norm = normalizer.transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) @@ -2079,6 +2090,7 @@ def test_normalizer_max_sign(): for X in (X_dense, X_all_neg, X_all_neg_sparse): normalizer = Normalizer(norm='max') + normalizer.n_features_in_ = X.shape[1] X_norm = normalizer.transform(X) assert X_norm is not X X_norm = toarray(X_norm) @@ -2144,6 +2156,7 @@ def test_binarizer(): X = init(X_.copy()) binarizer = Binarizer(threshold=2.0, copy=True) + binarizer.n_features_in_ = 3 X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 4 assert np.sum(X_bin == 1) == 2 @@ -2157,6 +2170,7 @@ def test_binarizer(): assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=True) + binarizer.n_features_in_ = 3 X_bin = binarizer.transform(X) assert X_bin is not X X_bin = toarray(X_bin) @@ -2164,11 +2178,13 @@ def test_binarizer(): assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=False) + binarizer.n_features_in_ = 3 X_bin = binarizer.transform(X) if init is not list: assert X_bin is X binarizer = Binarizer(copy=False) + binarizer.n_features_in_ = 3 X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) X_bin = binarizer.transform(X_float) if init is not list: @@ -2179,6 +2195,7 @@ def test_binarizer(): assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(threshold=-0.5, copy=True) + binarizer.n_features_in_ = 3 for init in (np.array, list): X = init(X_.copy()) @@ -2434,7 +2451,8 @@ def test_power_transformer_shape_exception(method): # Exceptions should be raised for arrays with different num_columns # than during fitting - wrong_shape_message = 'Input data has a different number of features' + wrong_shape_message = (r"X has \d+ features, but PowerTransformer is " + r"expecting \d+ features") with pytest.raises(ValueError, match=wrong_shape_message): pt.transform(X[:, 0:1]) @@ -2459,6 +2477,7 @@ def test_power_transformer_lambda_zero(): # Test the lambda = 0 case pt.lambdas_ = np.array([0]) + pt.n_features_in_ = 1 X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) @@ -2469,6 +2488,7 @@ def test_power_transformer_lambda_one(): X = np.abs(X_2d)[:, 0:1] pt.lambdas_ = np.array([1]) + pt.n_features_in_ = 1 X_trans = pt.transform(X) assert_array_almost_equal(X_trans, X) @@ -2491,10 +2511,12 @@ def test_optimization_power_transformer(method, lmbda): X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) pt = PowerTransformer(method=method, standardize=False) + pt.n_features_in_ = 1 pt.lambdas_ = [lmbda] X_inv = pt.inverse_transform(X) pt = PowerTransformer(method=method, standardize=False) + pt.n_features_in_ = 1 X_inv_trans = pt.fit_transform(X_inv) assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index fa25a125d9349..901f43e407cb8 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -104,7 +104,8 @@ def test_fit_transform_n_bins_array(strategy, 
expected): def test_invalid_n_features(): est = KBinsDiscretizer(n_bins=3).fit(X) bad_X = np.arange(25).reshape(5, -1) - err_msg = "Incorrect number of features. Expecting 4, received 5" + err_msg = ("X has 5 features, but KBinsDiscretizer is expecting 4 " + "features as input") with pytest.raises(ValueError, match=err_msg): est.transform(bad_X) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index b84b66d1fb919..d74e5df221a0c 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -307,7 +307,6 @@ def test_strict_mode_parametrize_with_checks(estimator, check): 'naive_bayes', 'neighbors', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', 'svm', From 1c6fe9d1ca53096b51ca2fb9779fb3a32a3775b0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 8 Oct 2020 20:34:24 -0400 Subject: [PATCH 2/9] REV Lower diffs --- sklearn/preprocessing/tests/test_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 17e85c89266c1..a086974aa6ead 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1917,8 +1917,7 @@ def test_maxabs_scaler_partial_fit(): n_samples_seen=scaler_incr.n_samples_seen_) -@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) -def test_normalizer_norms(norm): +def test_normalizer_l1(): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) X_sparse_unpruned = sparse.csr_matrix(X_dense) From 3dddafc211bf2c0742d58584e77a4457154329f1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 8 Oct 2020 22:39:38 -0400 Subject: [PATCH 3/9] DOC Uses fit_transform --- doc/modules/preprocessing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 024dd074e2e41..6430f6470703b 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -693,7 +693,7 @@ as each sample is treated independently of others:: It is possible to adjust the threshold of the binarizer:: >>> binarizer = preprocessing.Binarizer(threshold=1.1) - >>> binarizer.transform(X) + >>> binarizer.fit_transform(X) array([[0., 0., 1.], [1., 0., 0.], [0., 0., 0.]]) @@ -791,5 +791,5 @@ error with a ``filterwarnings``:: ... category=UserWarning, append=False) For a full code example that demonstrates using a :class:`FunctionTransformer` -to extract features from text data see +to extract features from text data see :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` From c70679efaf1bbab2305a8b42558504e559982dc0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 9 Oct 2020 15:25:47 -0400 Subject: [PATCH 4/9] ENH Do not check when n_features_in_ is not defined --- sklearn/base.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index d9fc4b7092971..d96fa35d3b0af 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -370,18 +370,16 @@ def _check_n_features(self, X, reset): if reset: self.n_features_in_ = n_features - else: - if not hasattr(self, 'n_features_in_'): - raise RuntimeError( - "The reset parameter is False but there is no " - "n_features_in_ attribute. Is this estimator fitted?" 
- ) - if n_features != self.n_features_in_: - raise ValueError( - 'X has {} features, but {} is expecting {} features ' - 'as input.'.format(n_features, self.__class__.__name__, - self.n_features_in_) - ) + return + + fitted_n_features_in = getattr(self, 'n_features_in_', None) + if fitted_n_features_in is None: + return + + if n_features != self.n_features_in_: + raise ValueError( + f"X has {n_features} features, but {self.__class__.__name__} " + f"is expecting {self.n_features_in_} features as input.") def _validate_data(self, X, y='no_validation', reset=True, validate_separately=False, **check_params): From be4bb32e9b5b36031e990c82aded9139c183b7e9 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 9 Oct 2020 15:32:04 -0400 Subject: [PATCH 5/9] REV Reduces diff --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/tests/test_data.py | 28 +++--------------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 697629ad4ad56..79d57913a9565 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -721,7 +721,7 @@ as each sample is treated independently of others:: It is possible to adjust the threshold of the binarizer:: >>> binarizer = preprocessing.Binarizer(threshold=1.1) - >>> binarizer.fit_transform(X) + >>> binarizer.transform(X) array([[0., 0., 1.], [1., 0., 0.], [0., 0., 0.]]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index a086974aa6ead..58858195dbdb1 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1937,13 +1937,11 @@ def test_normalizer_l1(): for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='l1', copy=True) - normalizer.n_features_in_ = X.shape[1] X_norm = normalizer.transform(X) assert X_norm is not X X_norm1 = toarray(X_norm) normalizer = Normalizer(norm='l1', copy=False) - normalizer.n_features_in_ = X.shape[1] X_norm = normalizer.transform(X) assert X_norm is X X_norm2 = toarray(X_norm) @@ -1957,9 +1955,7 @@ def test_normalizer_l1(): # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) - normalizer = Normalizer(norm='l2', copy=False) - normalizer.n_features_in_ = X.shape[1] - X_norm = normalizer.transform(X) + X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) @@ -1990,13 +1986,11 @@ def test_normalizer_l2(): for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='l2', copy=True) - normalizer.n_features_in_ = X.shape[1] X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm='l2', copy=False) - normalizer.n_features_in_ = X.shape[1] X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) @@ -2009,9 +2003,7 @@ def test_normalizer_l2(): # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) - normalizer = Normalizer(norm='l2', copy=False) - normalizer.n_features_in_ = X.shape[1] - X_norm = normalizer.transform(X) + X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) @@ -2042,13 +2034,11 @@ def test_normalizer_max(): for X in (X_dense, X_sparse_pruned, 
X_sparse_unpruned): normalizer = Normalizer(norm='max', copy=True) - normalizer.n_features_in_ = X.shape[1] X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm='max', copy=False) - normalizer.n_features_in_ = X.shape[1] X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) @@ -2062,9 +2052,7 @@ def test_normalizer_max(): # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) - normalizer = Normalizer(norm='l2', copy=False) - normalizer.n_features_in_ = X.shape[1] - X_norm = normalizer.transform(X) + X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) @@ -2089,7 +2077,6 @@ def test_normalizer_max_sign(): for X in (X_dense, X_all_neg, X_all_neg_sparse): normalizer = Normalizer(norm='max') - normalizer.n_features_in_ = X.shape[1] X_norm = normalizer.transform(X) assert X_norm is not X X_norm = toarray(X_norm) @@ -2155,7 +2142,6 @@ def test_binarizer(): X = init(X_.copy()) binarizer = Binarizer(threshold=2.0, copy=True) - binarizer.n_features_in_ = 3 X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 4 assert np.sum(X_bin == 1) == 2 @@ -2169,7 +2155,6 @@ def test_binarizer(): assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=True) - binarizer.n_features_in_ = 3 X_bin = binarizer.transform(X) assert X_bin is not X X_bin = toarray(X_bin) @@ -2177,13 +2162,11 @@ def test_binarizer(): assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=False) - binarizer.n_features_in_ = 3 X_bin = binarizer.transform(X) if init is not list: assert X_bin is X binarizer = Binarizer(copy=False) - binarizer.n_features_in_ = 3 X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) X_bin = binarizer.transform(X_float) if init is not list: @@ -2194,7 +2177,6 @@ def test_binarizer(): assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(threshold=-0.5, copy=True) - binarizer.n_features_in_ = 3 for init in (np.array, list): X = init(X_.copy()) @@ -2476,7 +2458,6 @@ def test_power_transformer_lambda_zero(): # Test the lambda = 0 case pt.lambdas_ = np.array([0]) - pt.n_features_in_ = 1 X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) @@ -2487,7 +2468,6 @@ def test_power_transformer_lambda_one(): X = np.abs(X_2d)[:, 0:1] pt.lambdas_ = np.array([1]) - pt.n_features_in_ = 1 X_trans = pt.transform(X) assert_array_almost_equal(X_trans, X) @@ -2510,12 +2490,10 @@ def test_optimization_power_transformer(method, lmbda): X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) pt = PowerTransformer(method=method, standardize=False) - pt.n_features_in_ = 1 pt.lambdas_ = [lmbda] X_inv = pt.inverse_transform(X) pt = PowerTransformer(method=method, standardize=False) - pt.n_features_in_ = 1 X_inv_trans = pt.fit_transform(X_inv) assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, From a2d8a1693ab2f1239f31264f3135a08275eed702 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 13 Oct 2020 23:24:24 +1100 Subject: [PATCH 6/9] DOC Whats new tweaks (#18604) Co-authored-by: Christian Lorentzen Co-authored-by: Nicolas Hug --- doc/whats_new/v0.24.rst | 206 ++++++++++++++++++++-------------------- 1 file changed, 103 insertions(+), 103 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index edd3afe62dbdf..e60f50a737073 100644 --- a/doc/whats_new/v0.24.rst +++ 
b/doc/whats_new/v0.24.rst @@ -82,17 +82,17 @@ Changelog :mod:`sklearn.compose` ...................... -- |FIX| :class:`compose.ColumnTransformer` will skip transformers the +- |Fix| :class:`compose.ColumnTransformer` will skip transformers the column selector is a list of bools that are False. :pr:`17616` by `Thomas Fan`_. -- |FIX| :class:`compose.ColumnTransformer` now displays the remainder in the +- |Fix| :class:`compose.ColumnTransformer` now displays the remainder in the diagram display. :pr:`18167` by `Thomas Fan`_. :mod:`sklearn.covariance` ......................... -- |API| Deprecates `cv_alphas_` in favor of `cv_results['alphas']` and +- |API| Deprecates `cv_alphas_` in favor of `cv_results_['alphas']` and `grid_scores_` in favor of split scores in `cv_results_` in :class:`covariance.GraphicalLassoCV`. `cv_alphas_` and `grid_scores_` will be removed in version 0.26. :pr:`16392` by `Thomas Fan`_. @@ -100,6 +100,16 @@ Changelog :mod:`sklearn.cross_decomposition` .................................. +- |Fix| Fixed a bug in :class:`cross_decomposition.PLSSVD` which would + sometimes return components in the reversed order of importance. + :pr:`17095` by `Nicolas Hug`_. + +- |Fix| Fixed a bug in :class:`cross_decomposition.PLSSVD`, + :class:`cross_decomposition.CCA`, and + :class:`cross_decomposition.PLSCanonical`, which would lead to incorrect + predictions for `est.transform(Y)` when the training data is single-target. + :pr:`17095` by `Nicolas Hug`_. + - |API| The bounds of the `n_components` parameter is now restricted: - into `[1, min(n_samples, n_features, n_targets)]`, for @@ -116,28 +126,18 @@ Changelog retrieved by calling `transform` on the training data. The `norm_y_weights` attribute will also be removed. :pr:`17095` by `Nicolas Hug`_. -- |Fix| Fixed a bug in :class:`cross_decomposition.PLSSVD` which would - sometimes return components in the reversed order of importance. - :pr:`17095` by `Nicolas Hug`_. - -- |Fix| Fixed a bug in :class:`cross_decomposition.PLSSVD`, - :class:`cross_decomposition.CCA`, and - :class:`cross_decomposition.PLSCanonical`, which would lead to incorrect - predictions for `est.transform(Y)` when the training data is single-target. - :pr:`17095` by `Nicolas Hug`_. - :mod:`sklearn.datasets` ....................... +- |Feature| :func:`datasets.fetch_openml` now validates md5 checksum of arff + files downloaded or cached to ensure data integrity. + :pr:`14800` by :user:`Shashank Singh ` and `Joel Nothman`_. + - |Enhancement| :func:`datasets.fetch_openml` now allows argument `as_frame` to be 'auto', which tries to convert returned data to pandas DataFrame unless data is sparse. :pr:`17396` by :user:`Jiaxiang `. -- |Feature| :func:`datasets.fetch_openml` now validates md5checksum of arff - files downloaded or cached to ensure data integrity. - :pr:`14800` by :user:`Shashank Singh ` and `Joel Nothman`_. - - |Enhancement| :func:`datasets.fetch_covtype` now now supports the optional argument `as_frame`; when it is set to True, the returned Bunch object's `data` and `frame` members are pandas DataFrames, and the `target` member is @@ -158,6 +158,16 @@ Changelog :mod:`sklearn.decomposition` ............................ +- |Enhancement| :func:`decomposition.FactorAnalysis` now supports the optional + argument `rotation`, which can take the value `None`, `'varimax'` or + `'quartimax'`. :pr:`11064` by :user:`Jona Sassenhagen `. 
+ +- |Enhancement| :class:`decomposition.NMF` now supports the optional parameter + `regularization`, which can take the values `None`, 'components', + 'transformation' or 'both', in accordance with + :func:`decomposition.NMF.non_negative_factorization`. + :pr:`17414` by :user:`Bharat Raghunathan `. + - |Fix| :class:`decomposition.KernelPCA` behaviour is now more consistent between 32-bits and 64-bits data input when the kernel has small positive eigenvalues. Small positive eigenvalues were not correctly discarded for @@ -170,16 +180,6 @@ Changelog redundant with the `dictionary` attribute and constructor parameter. :pr:`17679` by :user:`Xavier Dupré `. -- |Enhancement| :func:`decomposition.FactorAnalysis` now supports the optional - argument `rotation`, which can take the value `None`, `'varimax'` or - `'quartimax'.` :pr:`11064` by :user:`Jona Sassenhagen `. - -- |Enhancement| :class:`decomposition.NMF` now supports the optional parameter - `regularization`, which can take the values `None`, `components`, - `transformation` or `both`, in accordance with - :func:`decomposition.NMF.non_negative_factorization`. - :pr:`17414` by :user:`Bharat Raghunathan `. - - |Fix| :meth:`TruncatedSVD.fit_transform` consistently returns the same as :meth:`TruncatedSVD.fit` followed by :meth:`TruncatedSVD.transform`. :pr:`18528` by :user:`Albert Villanova del Moral ` and @@ -214,20 +214,20 @@ Changelog :pr:`18341` by `Olivier Grisel`_, `Nicolas Hug`_, `Thomas Fan`_, and :user:`Egor Smirnov `. -- |API|: The parameter ``n_classes_`` is now deprecated in +- |Fix| Fixed a bug in + :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` which can now accept data + with `uint8` dtype in `predict`. :pr:`18410` by `Nicolas Hug`_. + +- |API| The parameter ``n_classes_`` is now deprecated in :class:`ensemble.GradientBoostingRegressor` and returns `1`. :pr:`17702` by :user:`Simona Maggio `. -- |API|: Mean absolute error ('mae') is now deprecated for the parameter +- |API| Mean absolute error ('mae') is now deprecated for the parameter ``criterion`` in :class:`ensemble.GradientBoostingRegressor` and :class:`ensemble.GradientBoostingClassifier`. :pr:`18326` by :user:`Madhura Jayaratne `. -- |Fix|: Fixed a bug in - :class:`ensemble.HistGradientBoostingRegressor` and - :class:`ensemble.HistGradientBoostingClassifier` which can now accept data - with `uint8` dtype in `predict`. :pr:`18410` by `Nicolas Hug`_. - :mod:`sklearn.exceptions` ......................... @@ -255,18 +255,18 @@ Changelog attribute name/path or a `callable` for extracting feature importance from the estimator. :pr:`15361` by :user:`Venkatachalam N `. -- |Enhancement| :class:`feature_selection.RFE` supports the option for the - number of `n_features_to_select` to be given as a float representing the - percentage of features to select. - :pr:`17090` by :user:`Lisa Schwetlick ` and - :user:`Marija Vlajic Wheeler `. - - |Efficiency| Reduce memory footprint in :func:`feature_selection.mutual_info_classif` and :func:`feature_selection.mutual_info_regression` by calling :class:`neighbors.KDTree` for counting nearest neighbors. :pr:`17878` by :user:`Noel Rogers `. +- |Enhancement| :class:`feature_selection.RFE` supports the option for the + number of `n_features_to_select` to be given as a float representing the + percentage of features to select. + :pr:`17090` by :user:`Lisa Schwetlick ` and + :user:`Marija Vlajic Wheeler `. + :mod:`sklearn.gaussian_process` ............................... 
@@ -279,12 +279,6 @@ Changelog :mod:`sklearn.impute` ..................... -- |Fix| replace the default values in :class:`impute.IterativeImputer` - of `min_value` and `max_value` parameters to `-np.inf` and `np.inf`, - respectively instead of `None`. However, the behaviour of the class does not - change since `None` was defaulting to these values already. - :pr:`16493` by :user:`Darshan N `. - - |Feature| :class:`impute.SimpleImputer` now supports a list of strings when ``strategy='most_frequent'`` or ``strategy='constant'``. :pr:`17526` by :user:`Ayako YAGI ` and @@ -294,6 +288,12 @@ Changelog revert imputed data to original when instantiated with ``add_indicator=True``. :pr:`17612` by :user:`Srimukh Sripada `. +- |Fix| replace the default values in :class:`impute.IterativeImputer` + of `min_value` and `max_value` parameters to `-np.inf` and `np.inf`, + respectively instead of `None`. However, the behaviour of the class does not + change since `None` was defaulting to these values already. + :pr:`16493` by :user:`Darshan N `. + - |Fix| :class:`impute.IterativeImputer` will not attempt to set the estimator's `random_state` attribute, allowing to use it with more external classes. :pr:`15636` by :user:`David Cortes `. @@ -356,16 +356,16 @@ Changelog :mod:`sklearn.manifold` ....................... +- |Efficiency| Fixed :issue:`10493`. Improve Local Linear Embedding (LLE) + that raised `MemoryError` exception when used with large inputs. + :pr:`17997` by :user:`Bertrand Maisonneuve `. + - |Enhancement| Add `square_distances` parameter to :class:`manifold.TSNE`, which provides backward compatibility during deprecation of legacy squaring behavior. Distances will be squared by default in 0.26, and this parameter will be removed in 0.28. :pr:`17662` by :user:`Joshua Newton `. -- |Efficiency| Fixed :issue:`10493`. Improve Local Linear Embedding (LLE) - that raised `MemoryError` exception when used with large inputs. - :pr:`17997` by :user:`Bertrand Maisonneuve `. - - |Fix| :class:`manifold.MDS` now correctly sets its `_pairwise` attribute. :pr:`18278` by `Thomas Fan`_. @@ -387,10 +387,9 @@ Changelog some practical test cases were taken from PR :pr:`10711` by :user:`Mohamed Ali Jamaoui `. -- |Fix| Fixed a bug in - :func:`metrics.classification_report` which was raising AttributeError - when called with `output_dict=True` for 0-length values. - :pr:`17777` by :user:`Shubhanshu Mishra `. +- |Feature| :func:`metrics.plot_confusion_matrix` now supports making colorbar + optional in the matplotlib plot by setting `colorbar=False`. :pr:`17192` by + :user:`Avi Gupta ` - |Enhancement| Add `sample_weight` parameter to :func:`metrics.median_absolute_error`. :pr:`17225` by @@ -401,15 +400,16 @@ Changelog class to be used when computing the precision and recall statistics. :pr:`17569` by :user:`Guillaume Lemaitre `. -- |Feature| :func:`metrics.plot_confusion_matrix` now supports making colorbar - optional in the matplotlib plot by setting colorbar=False. :pr:`17192` by - :user:`Avi Gupta ` - - |Enhancement| Add `pos_label` parameter in :func:`metrics.plot_roc_curve` in order to specify the positive class to be used when computing the roc auc statistics. :pr:`17651` by :user:`Clara Matos `. +- |Fix| Fixed a bug in + :func:`metrics.classification_report` which was raising AttributeError + when called with `output_dict=True` for 0-length values. + :pr:`17777` by :user:`Shubhanshu Mishra `. 
+ - |Fix| Fixed a bug in :func:`metrics.jaccard_score` which recommended the `zero_division` parameter when called with no true or predicted samples. @@ -431,18 +431,7 @@ Changelog :mod:`sklearn.model_selection` .............................. -- |Enhancement| :class:`model_selection.TimeSeriesSplit` has two new keyword - arguments `test_size` and `gap`. `test_size` allows the out-of-sample - time series length to be fixed for all folds. `gap` removes a fixed number of - samples between the train and test set on each fold. - :pr:`13204` by :user:`Kyle Kosic `. - -- |Feature| :class:`model_selection.RandomizedSearchCV` and - :class:`model_selection.GridSearchCV` now have the method, ``score_samples`` - :pr:`17478` by :user:`Teon Brooks ` and - :user:`Mohamed Maskani `. - -- |Feature| Added (experimental) parameter search estimators +- |MajorFeature| Added (experimental) parameter search estimators :class:`model_selection.HalvingRandomSearchCV` and :class:`model_selection.HalvingGridSearchCV` which implement Successive Halving, and can be used as a drop-in replacements for @@ -450,15 +439,16 @@ Changelog :class:`model_selection.GridSearchCV`. :pr:`13900` by `Nicolas Hug`_, `Joel Nothman`_ and `Andreas Müller`_. -- |Fix| Fixed the `len` of :class:`model_selection.ParameterSampler` when - all distributions are lists and `n_iter` is more than the number of unique - parameter combinations. :pr:`18222` by `Nicolas Hug`_. +- |Feature| :class:`model_selection.RandomizedSearchCV` and + :class:`model_selection.GridSearchCV` now have the method ``score_samples`` + :pr:`17478` by :user:`Teon Brooks ` and + :user:`Mohamed Maskani `. -- |Fix| A fix to raise warning when one or more CV splits of - :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` results in non-finite scores. - :pr:`18266` by :user:`Subrat Sahu `, - :user:`Nirvan ` and :user:`Arthur Book `. +- |Enhancement| :class:`model_selection.TimeSeriesSplit` has two new keyword + arguments `test_size` and `gap`. `test_size` allows the out-of-sample + time series length to be fixed for all folds. `gap` removes a fixed number of + samples between the train and test set on each fold. + :pr:`13204` by :user:`Kyle Kosic `. - |Enhancement| :func:`model_selection.cross_val_score`, :func:`model_selection.cross_validate`, @@ -468,6 +458,16 @@ Changelog will be raised. :pr:`18343` by `Guillaume Lemaitre`_ and :user:`Devi Sandeep `. +- |Fix| Fixed the `len` of :class:`model_selection.ParameterSampler` when + all distributions are lists and `n_iter` is more than the number of unique + parameter combinations. :pr:`18222` by `Nicolas Hug`_. + +- |Fix| A fix to raise warning when one or more CV splits of + :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` results in non-finite scores. + :pr:`18266` by :user:`Subrat Sahu `, + :user:`Nirvan ` and :user:`Arthur Book `. + :mod:`sklearn.multiclass` ......................... @@ -485,11 +485,6 @@ Changelog :mod:`sklearn.multioutput` .......................... -- |Fix| A fix to accept tuples for the ``order`` parameter - in :class:`multioutput.ClassifierChain`. - :pr:`18124` by :user:`Gus Brocchini ` and - :user:`Amanda Dsouza `. - - |Enhancement| :class:`multioutput.MultiOutputClassifier` and :class:`multioutput.MultiOutputRegressor` now accepts the inputs with missing values. Hence, estimators which can handle missing @@ -497,21 +492,26 @@ Changelog estimators) can be used as a estimator for multiclass wrappers. 
:pr:`17987` by :user:`Venkatachalam N `. +- |Fix| A fix to accept tuples for the ``order`` parameter + in :class:`multioutput.ClassifierChain`. + :pr:`18124` by :user:`Gus Brocchini ` and + :user:`Amanda Dsouza `. + :mod:`sklearn.naive_bayes` .......................... -- |API|: The attributes ``coef_`` and ``intercept_`` are now deprecated in - :class:`naive_bayes.MultinomialNB`, :class:`naive_bayes.ComplementNB`, - :class:`naive_bayes.BernoulliNB` and :class:`naive_bayes.CategoricalNB`, - and will be removed in v0.26. :pr:`17427` by - :user:`Juan Carlos Alfaro Jiménez `. - - |Enhancement| Adds a parameter `min_categories` to :class:`naive_bayes.CategoricalNB` that allows a minimum number of categories per feature to be specified. This allows categories unseen during training to be accounted for. :pr:`16326` by :user:`George Armstrong `. +- |API| The attributes ``coef_`` and ``intercept_`` are now deprecated in + :class:`naive_bayes.MultinomialNB`, :class:`naive_bayes.ComplementNB`, + :class:`naive_bayes.BernoulliNB` and :class:`naive_bayes.CategoricalNB`, + and will be removed in v0.26. :pr:`17427` by + :user:`Juan Carlos Alfaro Jiménez `. + :mod:`sklearn.neighbors` ........................ @@ -587,6 +587,12 @@ Changelog which clips the transformed values of test data to ``feature_range``. :pr:`17833` by :user:`Yashika Sharma `. +- |Feature| Add ``sample_weight`` parameter to + :class:`preprocessing.StandardScaler` for when X is dense. Allows setting + individual weights for each sample. :pr:`18510` and + :pr:`18447` and :pr:`16066` by :user:`Maria Telenczuk ` and + :user:`Albert Villanova ` and :user:`panpiort8`. + - |Enhancement| Verbose output of :class:`model_selection.GridSearchCV` has been improved for readability. :pr:`16935` by :user:`Raghav Rajagopalan ` and :user:`Chiara Marmo `. @@ -600,16 +606,10 @@ Changelog :class:`preprocessing.KBinsDiscretizer`. :pr:`16335` by :user:`Arthur Imbert `. -- |Feature| Add ``sample_weight`` parameter to - :class:`preprocessing.StandardScaler` for when X is dense. Allows setting - individual weights for each sample. :pr:`18510` and - :pr:`18447` and :pr:`16066` by :user:`Maria Telenczuk ` and - :user:`Albert Villanova ` and :user:`` - :mod:`sklearn.svm` .................. -- |Enhancement| invoke scipy blas api for svm kernel function in ``fit``, +- |Enhancement| invoke SciPy BLAS API for SVM kernel function in ``fit``, ``predict`` and related methods of :class:`svm.SVC`, :class:`svm.NuSVC`, :class:`svm.SVR`, :class:`svm.NuSVR`, :class:`OneClassSVM`. :pr:`16530` by :user:`Shuhua Fan `. @@ -620,7 +620,7 @@ Changelog - |Enhancement| :func:`tree.plot_tree` now uses colors from the matplotlib configuration settings. :pr:`17187` by `Andreas Müller`_. -- |API|: The parameter ``X_idx_sorted`` is now deprecated in +- |API| The parameter ``X_idx_sorted`` is now deprecated in :meth:`tree.DecisionTreeClassifier.fit` and :meth:`tree.DecisionTreeRegressor.fit`, and has not effect. :pr:`17614` by :user:`Juan Carlos Alfaro Jiménez `. @@ -628,6 +628,11 @@ Changelog :mod:`sklearn.utils` .................... +- |Enhancement| Add ``check_methods_sample_order_invariance`` to + :func:`~utils.estimator_checks.check_estimator`, which checks that + estimator methods are invariant if applied to the same dataset + with different sample order :pr:`17598` by :user:`Jason Ngo `. + - |Fix| Raise ValueError with clear error message in :func:`check_array` for sparse DataFrames with mixed types. :pr:`17992` by :user:`Thomas J. 
Fan ` and @@ -637,11 +642,6 @@ Changelog with different endianness. :pr:`17644` by :user:`Qi Zhang `. -- |Enhancement| Add ``check_methods_sample_order_invariance`` to - :func:`~utils.estimator_checks.check_estimator`, which checks that - estimator methods are invariant if applied to the same dataset - with different sample order :pr:`17598` by :user:`Jason Ngo `. - Miscellaneous ............. @@ -653,4 +653,4 @@ Code and Documentation Contributors ----------------------------------- Thanks to everyone who has contributed to the maintenance and improvement of -the project since version 0.20, including: +the project since version 0.23, including: From 43df0de4c05f090d8ced4669a291a51cd355413e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 14 Oct 2020 14:58:18 +0200 Subject: [PATCH 7/9] Remove unrelated change to whats_new/0.24.rst --- doc/whats_new/v0.24.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index e60f50a737073..0804e45b77e42 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -450,6 +450,12 @@ Changelog samples between the train and test set on each fold. :pr:`13204` by :user:`Kyle Kosic `. +- |Enhancement| :func:`model_selection.permutation_test_score` and + :func:`model_selection.validation_curve` now accept fit_params + to pass additional estimator parameters. + :pr:`18527` by :user:`Gaurav Dhingra `, + :user:`Julien Jerphanion ` and :user:`Amanda Dsouza `. + - |Enhancement| :func:`model_selection.cross_val_score`, :func:`model_selection.cross_validate`, :class:`model_selection.GridSearchCV`, and From 442bebfde89bcc46bc214458acefa2069b24f478 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 14 Oct 2020 18:15:59 -0400 Subject: [PATCH 8/9] CLN Address comments --- sklearn/preprocessing/tests/test_data.py | 2 -- sklearn/preprocessing/tests/test_discretization.py | 9 --------- 2 files changed, 11 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 58858195dbdb1..b4ffbe2ce40f6 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1312,8 +1312,6 @@ def test_quantile_transform_check_error(): [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) err_msg = ("X has 2 features, but QuantileTransformer is expecting " "3 features as input.") - with pytest.raises(ValueError, match=err_msg): - transformer.transform(X_bad_feat) with pytest.raises(ValueError, match=err_msg): transformer.inverse_transform(X_bad_feat) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 901f43e407cb8..9d607c82d5831 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -101,15 +101,6 @@ def test_fit_transform_n_bins_array(strategy, expected): assert bin_edges.shape == (n_bins + 1, ) -def test_invalid_n_features(): - est = KBinsDiscretizer(n_bins=3).fit(X) - bad_X = np.arange(25).reshape(5, -1) - err_msg = ("X has 5 features, but KBinsDiscretizer is expecting 4 " - "features as input") - with pytest.raises(ValueError, match=err_msg): - est.transform(bad_X) - - @pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile']) def test_same_min_max(strategy): warnings.simplefilter("always") From f2a88f564ea9c468d09c717a0c9128996ece63af Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Oct 2020 18:09:32 +0200 Subject: [PATCH 9/9] DOC add information about in_fit 
in QuantileTransformer._check_input

---
 sklearn/preprocessing/_data.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 35b059153d316..2ce000a207163 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -3250,6 +3250,10 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False,
         ----------
         X : array-like of shape (n_samples, n_features)
 
+        in_fit : bool
+            Whether `_check_input` is called from `fit` (True) or from
+            other methods, e.g. `transform` or `inverse_transform` (False).
+
         check_positive : bool, default=False
             If True, check that all data is positive and non-zero (only if
             ``self.method=='box-cox'``).
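
The thread running through all nine patches is a single convention: methods
like `transform` and `inverse_transform` validate input through
`self._validate_data(..., reset=False)`, so that `_check_n_features`
(reworked in PATCH 4/9) compares `X.shape[1]` against the `n_features_in_`
recorded when `fit` called `_validate_data` with `reset=True`, and silently
skips the comparison when the attribute was never set — which is what lets
stateless transformers such as `Normalizer` and `Binarizer` keep working
without the `n_features_in_` monkey-patching that PATCH 5/9 reverts out of
the tests. A minimal, self-contained sketch of that contract follows;
`_NFeaturesMixin` and `TinyScaler` are hypothetical names used only for this
illustration, not scikit-learn classes, and the mixin merely mirrors the
logic added to `sklearn/base.py` in PATCH 4/9:

import numpy as np


class _NFeaturesMixin:
    """Simplified stand-in for BaseEstimator._check_n_features (PATCH 4/9)."""

    def _check_n_features(self, X, reset):
        n_features = X.shape[1]
        if reset:
            # fit records the fitted width (PATCH 1/9 does the same for
            # partial_fit via reset=first_call).
            self.n_features_in_ = n_features
            return
        if getattr(self, 'n_features_in_', None) is None:
            # Stateless estimators never set n_features_in_, so there is
            # nothing to compare against -- do not raise.
            return
        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {type(self).__name__} "
                f"is expecting {self.n_features_in_} features as input.")


class TinyScaler(_NFeaturesMixin):
    """Hypothetical estimator following the reset=True/False convention."""

    def fit(self, X):
        X = np.asarray(X, dtype=float)
        self._check_n_features(X, reset=True)    # reset only in fit
        self.scale_ = np.abs(X).max(axis=0)
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float)
        self._check_n_features(X, reset=False)   # enforce the fitted width
        return X / self.scale_


scaler = TinyScaler().fit(np.arange(15.0).reshape(5, 3))
scaler.transform(np.ones((2, 3)))                # OK: widths match
try:
    scaler.transform(np.ones((2, 4)))            # mismatch -> ValueError
except ValueError as exc:
    print(exc)

On a width mismatch this prints the same style of message the updated tests
assert against, e.g. "X has 4 features, but TinyScaler is expecting 3
features as input."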