From 40767dd03e8a688848bb5d0937dc5cddee4aac93 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 15 Nov 2022 12:25:57 +0100 Subject: [PATCH 01/13] MAINT test globally setting output via context manager --- sklearn/tests/test_common.py | 16 ++++++++ sklearn/utils/estimator_checks.py | 61 +++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e8f7f129a31ef..2ae197a10cbd1 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -76,6 +76,7 @@ check_transformer_get_feature_names_out_pandas, check_set_output_transform, check_set_output_transform_pandas, + check_global_ouptut_transform_pandas, ) @@ -541,3 +542,18 @@ def test_set_output_transform_pandas(estimator): _set_checking_parameters(estimator) with ignore_warnings(category=(FutureWarning)): check_set_output_transform_pandas(estimator.__class__.__name__, estimator) + + +@pytest.mark.parametrize( + "estimator", SET_OUTPUT_ESTIMATORS, ids=_get_check_estimator_ids +) +def test_global_output_transform_pandas(estimator): + name = estimator.__class__.__name__ + if not hasattr(estimator, "set_output"): + pytest.skip( + f"Skipping check_global_ouptut_transform_pandas for {name}: Does not" + " support set_output API yet" + ) + _set_checking_parameters(estimator) + with ignore_warnings(category=(FutureWarning)): + check_global_ouptut_transform_pandas(estimator.__class__.__name__, estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 7026159f16287..ba8c77b19bf2c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4221,3 +4221,64 @@ def fit_transform(est): X_trans_no_setting, columns=transformer.get_feature_names_out() ) pd.testing.assert_frame_equal(X_trans_pandas, expected_dataframe) + + +def check_global_ouptut_transform_pandas(name, transformer_orig): + """Check that setting globally the output of a transformer to pandas lead to the + right results.""" + try: + import pandas as pd + except ImportError: + raise SkipTest( + "pandas is not installed: not checking column name consistency for pandas" + ) + + tags = transformer_orig._get_tags() + if "2darray" not in tags["X_types"] or tags["no_validation"]: + return + + rng = np.random.RandomState(0) + transformer = clone(transformer_orig) + + X = rng.uniform(size=(20, 5)) + X = _enforce_estimator_tags_X(transformer_orig, X) + y = rng.randint(0, 2, size=20) + y = _enforce_estimator_tags_y(transformer_orig, y) + set_random_state(transformer) + + feature_names_in = [f"col{i}" for i in range(X.shape[1])] + df = pd.DataFrame(X, columns=feature_names_in) + + def fit_then_transform(est): + if name in CROSS_DECOMPOSITION: + return est.fit(df, y).transform(df, y) + return est.fit(df, y).transform(df) + + def fit_transform(est): + return est.fit_transform(df, y) + + transform_methods = [fit_then_transform, fit_transform] + + for transform_method in transform_methods: + transformer = clone(transformer) + X_trans_no_setting = transform_method(transformer) + with config_context(transform_output="pandas"): + # Auto wrapping only wraps the first array + if name in CROSS_DECOMPOSITION: + X_trans_no_setting = X_trans_no_setting[0] + + try: + X_trans_pandas = transform_method(transformer) + except ValueError as e: + # transformer does not support sparse data + assert str(e) == "Pandas output does not support sparse data.", e + return + + if name in CROSS_DECOMPOSITION: + X_trans_pandas = X_trans_pandas[0] + + assert isinstance(X_trans_pandas, pd.DataFrame) + expected_dataframe = pd.DataFrame( + X_trans_no_setting, columns=transformer.get_feature_names_out() + ) + pd.testing.assert_frame_equal(X_trans_pandas, expected_dataframe) From 4a14a024c116a9d428cc117e0b89fc1d847e8b8e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 17 Nov 2022 14:10:09 +0100 Subject: [PATCH 02/13] FIX intial imputer should work on NumPy array --- sklearn/impute/_iterative.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 73d73b5bd8a6d..de1babdbc19a9 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -6,6 +6,7 @@ from scipy import stats import numpy as np +from .. import config_context from ..base import clone from ..exceptions import ConvergenceWarning from ..preprocessing import normalize @@ -566,15 +567,16 @@ def _initial_imputation(self, X, in_fit=False): X_missing_mask = _get_mask(X, self.missing_values) mask_missing_values = X_missing_mask.copy() - if self.initial_imputer_ is None: - self.initial_imputer_ = SimpleImputer( - missing_values=self.missing_values, - strategy=self.initial_strategy, - keep_empty_features=self.keep_empty_features, - ) - X_filled = self.initial_imputer_.fit_transform(X) - else: - X_filled = self.initial_imputer_.transform(X) + with config_context(transform_output="default"): + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy, + keep_empty_features=self.keep_empty_features, + ) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) valid_mask = np.flatnonzero( np.logical_not(np.isnan(self.initial_imputer_.statistics_)) From 4858f39447a1d45cc2234091f063cd48b3d4fc2d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 17 Nov 2022 15:22:24 +0100 Subject: [PATCH 03/13] refactoring --- sklearn/utils/estimator_checks.py | 141 +++++++++++++++++------------- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index ba8c77b19bf2c..a3ef1cbcaaf09 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4162,6 +4162,57 @@ def fit_transform(est): assert_allclose_dense_sparse(X_trans_no_setting, X_trans_default) +def _output_from_fit_transform(transformer, name, X, df, y): + outputs = {} + + # fit then transform case: + cases = [ + "fit.transform/fit_df/transform_df", + "fit.transform/fit_df/transform_array", + "fit.transform/fit_array/transform_df", + "fit.transform/fit_array/transform_array", + ] + for case, data_fit, data_transform, target in zip( + cases, [df, df, X, X], [df, X, df, X], [y, y, y, y] + ): + transformer.fit(data_fit, target) + if name in CROSS_DECOMPOSITION: + X_trans = transformer.transform(data_transform, target)[0] + else: + X_trans = transformer.transform(data_transform) + outputs[case] = (X_trans, transformer.get_feature_names_out()) + + # fit_transform case: + cases = ["fit_transform/df", "fit_transform/array"] + for case, data, target in zip(cases, [df, X], [y, y]): + if name in CROSS_DECOMPOSITION: + X_trans = transformer.fit_transform(data, target)[0] + else: + X_trans = transformer.fit_transform(data, target) + outputs[case] = (X_trans, transformer.get_feature_names_out()) + + return outputs + + +def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): + import pandas as pd + + X_trans, feature_names_default = outputs_default + df_trans, feature_names_pandas = outputs_pandas + + assert isinstance(df_trans, pd.DataFrame) + expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas) + + try: + pd.testing.assert_frame_equal(df_trans, expected_dataframe) + except AssertionError as e: + raise AssertionError( + f"{name} does not generate a valid dataframe in the {case} " + "case. The generated dataframe is not equal to the expected " + f"dataframe. The error message is: {e}" + ) from e + + def check_set_output_transform_pandas(name, transformer_orig): # Check transformer.set_output configures the output of transform="pandas". try: @@ -4187,40 +4238,20 @@ def check_set_output_transform_pandas(name, transformer_orig): feature_names_in = [f"col{i}" for i in range(X.shape[1])] df = pd.DataFrame(X, columns=feature_names_in) - def fit_then_transform(est): - if name in CROSS_DECOMPOSITION: - return est.fit(df, y).transform(df, y) - return est.fit(df, y).transform(df) - - def fit_transform(est): - return est.fit_transform(df, y) - - transform_methods = [fit_then_transform, fit_transform] - - for transform_method in transform_methods: - transformer = clone(transformer).set_output(transform="default") - X_trans_no_setting = transform_method(transformer) - - # Auto wrapping only wraps the first array - if name in CROSS_DECOMPOSITION: - X_trans_no_setting = X_trans_no_setting[0] - - transformer.set_output(transform="pandas") - try: - X_trans_pandas = transform_method(transformer) - except ValueError as e: - # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e - return - - if name in CROSS_DECOMPOSITION: - X_trans_pandas = X_trans_pandas[0] + transformer_default = clone(transformer).set_output(transform="default") + outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) + transformer_pandas = clone(transformer).set_output(transform="pandas") + try: + outputs_pandas = _output_from_fit_transform(transformer_pandas, name, X, df, y) + except ValueError as e: + # transformer does not support sparse data + assert str(e) == "Pandas output does not support sparse data.", e + return - assert isinstance(X_trans_pandas, pd.DataFrame) - expected_dataframe = pd.DataFrame( - X_trans_no_setting, columns=transformer.get_feature_names_out() + for case in outputs_default: + _check_generated_dataframe( + name, case, outputs_default[case], outputs_pandas[case] ) - pd.testing.assert_frame_equal(X_trans_pandas, expected_dataframe) def check_global_ouptut_transform_pandas(name, transformer_orig): @@ -4249,36 +4280,20 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): feature_names_in = [f"col{i}" for i in range(X.shape[1])] df = pd.DataFrame(X, columns=feature_names_in) - def fit_then_transform(est): - if name in CROSS_DECOMPOSITION: - return est.fit(df, y).transform(df, y) - return est.fit(df, y).transform(df) - - def fit_transform(est): - return est.fit_transform(df, y) - - transform_methods = [fit_then_transform, fit_transform] - - for transform_method in transform_methods: - transformer = clone(transformer) - X_trans_no_setting = transform_method(transformer) + transformer_default = clone(transformer).set_output(transform="default") + outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) + transformer_pandas = clone(transformer) + try: with config_context(transform_output="pandas"): - # Auto wrapping only wraps the first array - if name in CROSS_DECOMPOSITION: - X_trans_no_setting = X_trans_no_setting[0] - - try: - X_trans_pandas = transform_method(transformer) - except ValueError as e: - # transformer does not support sparse data - assert str(e) == "Pandas output does not support sparse data.", e - return - - if name in CROSS_DECOMPOSITION: - X_trans_pandas = X_trans_pandas[0] - - assert isinstance(X_trans_pandas, pd.DataFrame) - expected_dataframe = pd.DataFrame( - X_trans_no_setting, columns=transformer.get_feature_names_out() + outputs_pandas = _output_from_fit_transform( + transformer_pandas, name, X, df, y ) - pd.testing.assert_frame_equal(X_trans_pandas, expected_dataframe) + except ValueError as e: + # transformer does not support sparse data + assert str(e) == "Pandas output does not support sparse data.", e + return + + for case in outputs_default: + _check_generated_dataframe( + name, case, outputs_default[case], outputs_pandas[case] + ) From 6518fdb552537444ec60c7d392b88d8cc06a4aa3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 17 Nov 2022 15:27:09 +0100 Subject: [PATCH 04/13] add some comments --- sklearn/utils/estimator_checks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index a3ef1cbcaaf09..71de91ced5c9b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4163,6 +4163,12 @@ def fit_transform(est): def _output_from_fit_transform(transformer, name, X, df, y): + """Generate output to set test `set_output` for different configuration: + + - calling either `fit.transform` or `fit_transform`; + - passing either a dataframe or a numpy array to fit; + - passing either a dataframe or a numpy array to transform. + """ outputs = {} # fit then transform case: @@ -4201,6 +4207,9 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): df_trans, feature_names_pandas = outputs_pandas assert isinstance(df_trans, pd.DataFrame) + # We always rely on the output of `get_feature_names_out` of the + # transformer used to generate the dataframe as a ground-truth of the + # columns. expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas) try: From b560b5998b2957e9a36b5b8355c22190dc64b98e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 17 Nov 2022 18:12:38 +0100 Subject: [PATCH 05/13] make IterativeImputer container agnostic --- sklearn/impute/_iterative.py | 59 +++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index de1babdbc19a9..58308c434f1d5 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -6,7 +6,6 @@ from scipy import stats import numpy as np -from .. import config_context from ..base import clone from ..exceptions import ConvergenceWarning from ..preprocessing import normalize @@ -363,8 +362,16 @@ def _impute_one_feature( missing_row_mask = mask_missing_values[:, feat_idx] if fit_mode: - X_train = _safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask) - y_train = _safe_indexing(X_filled[:, feat_idx], ~missing_row_mask) + X_train = _safe_indexing( + _safe_indexing(X_filled, neighbor_feat_idx, axis=1), + ~missing_row_mask, + axis=0, + ) + y_train = _safe_indexing( + _safe_indexing(X_filled, feat_idx, axis=1), + ~missing_row_mask, + axis=0, + ) estimator.fit(X_train, y_train) # if no missing values, don't predict @@ -372,7 +379,11 @@ def _impute_one_feature( return X_filled, estimator # get posterior samples if there is at least one missing value - X_test = _safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask) + X_test = _safe_indexing( + _safe_indexing(X_filled, neighbor_feat_idx, axis=1), + missing_row_mask, + axis=0, + ) if self.sample_posterior: mus, sigmas = estimator.predict(X_test, return_std=True) imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) @@ -403,7 +414,12 @@ def _impute_one_feature( ) # update the feature - X_filled[missing_row_mask, feat_idx] = imputed_values + if hasattr(self, "iloc"): + X_filled.iloc[missing_row_mask, feat_idx] = imputed_values + else: + print(X_filled[missing_row_mask, feat_idx].shape) + print(imputed_values.shape) + X_filled[missing_row_mask, feat_idx] = imputed_values return X_filled, estimator def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat): @@ -567,16 +583,15 @@ def _initial_imputation(self, X, in_fit=False): X_missing_mask = _get_mask(X, self.missing_values) mask_missing_values = X_missing_mask.copy() - with config_context(transform_output="default"): - if self.initial_imputer_ is None: - self.initial_imputer_ = SimpleImputer( - missing_values=self.missing_values, - strategy=self.initial_strategy, - keep_empty_features=self.keep_empty_features, - ) - X_filled = self.initial_imputer_.fit_transform(X) - else: - X_filled = self.initial_imputer_.transform(X) + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy, + keep_empty_features=self.keep_empty_features, + ) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) valid_mask = np.flatnonzero( np.logical_not(np.isnan(self.initial_imputer_.statistics_)) @@ -709,7 +724,7 @@ def fit_transform(self, X, y=None): Xt, estimator = self._impute_one_feature( Xt, mask_missing_values, - feat_idx, + int(feat_idx), neighbor_feat_idx, estimator=None, fit_mode=True, @@ -745,7 +760,11 @@ def fit_transform(self, X, y=None): "[IterativeImputer] Early stopping criterion not reached.", ConvergenceWarning, ) - Xt[~mask_missing_values] = X[~mask_missing_values] + if hasattr(Xt, "iloc"): + for feat_idx, feat_mask in enumerate(mask_missing_values.T): + Xt.iloc[~feat_mask, feat_idx] = X[~feat_mask, feat_idx] + else: + Xt[~mask_missing_values] = X[~mask_missing_values] return super()._concatenate_indicator(Xt, X_indicator) def transform(self, X): @@ -798,7 +817,11 @@ def transform(self, X): ) i_rnd += 1 - Xt[~mask_missing_values] = X[~mask_missing_values] + if hasattr(Xt, "iloc"): + for feat_idx, feat_mask in enumerate(mask_missing_values.T): + Xt.iloc[~feat_mask, feat_idx] = X[~feat_mask, feat_idx] + else: + Xt[~mask_missing_values] = X[~mask_missing_values] return super()._concatenate_indicator(Xt, X_indicator) def fit(self, X, y=None): From 66c4196fcdf65ce18b988999521973bd5c528bbc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 17 Nov 2022 18:16:49 +0100 Subject: [PATCH 06/13] remove debug --- sklearn/impute/_iterative.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 58308c434f1d5..214fa7a25d4fb 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -417,8 +417,6 @@ def _impute_one_feature( if hasattr(self, "iloc"): X_filled.iloc[missing_row_mask, feat_idx] = imputed_values else: - print(X_filled[missing_row_mask, feat_idx].shape) - print(imputed_values.shape) X_filled[missing_row_mask, feat_idx] = imputed_values return X_filled, estimator From 2c6c2482b9b90ff1405a8fa14248db8f6e3b22a8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 21 Nov 2022 11:58:34 +0100 Subject: [PATCH 07/13] MAINT introduce _safe_assign in IterativeImputer --- sklearn/impute/_iterative.py | 50 +++++++++++++++++++++++++----------- sklearn/utils/__init__.py | 30 ++++++++++++++++++++++ 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 214fa7a25d4fb..7a18c33a8daa8 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -9,7 +9,13 @@ from ..base import clone from ..exceptions import ConvergenceWarning from ..preprocessing import normalize -from ..utils import check_array, check_random_state, _safe_indexing, is_scalar_nan +from ..utils import ( + check_array, + check_random_state, + is_scalar_nan, + _safe_assign, + _safe_indexing, +) from ..utils.validation import FLOAT_DTYPES, check_is_fitted from ..utils.validation import _check_feature_names_in from ..utils._mask import _get_mask @@ -414,10 +420,12 @@ def _impute_one_feature( ) # update the feature - if hasattr(self, "iloc"): - X_filled.iloc[missing_row_mask, feat_idx] = imputed_values - else: - X_filled[missing_row_mask, feat_idx] = imputed_values + _safe_assign( + X_filled, + imputed_values, + row_indexer=missing_row_mask, + column_indexer=feat_idx, + ) return X_filled, estimator def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat): @@ -758,11 +766,17 @@ def fit_transform(self, X, y=None): "[IterativeImputer] Early stopping criterion not reached.", ConvergenceWarning, ) - if hasattr(Xt, "iloc"): - for feat_idx, feat_mask in enumerate(mask_missing_values.T): - Xt.iloc[~feat_mask, feat_idx] = X[~feat_mask, feat_idx] - else: - Xt[~mask_missing_values] = X[~mask_missing_values] + for feat_idx, feat_mask in enumerate(mask_missing_values.T): + _safe_assign( + Xt, + _safe_indexing( + _safe_indexing(X, feat_idx, axis=1), + ~feat_mask, + axis=0, + ), + row_indexer=~feat_mask, + column_indexer=feat_idx, + ) return super()._concatenate_indicator(Xt, X_indicator) def transform(self, X): @@ -815,11 +829,17 @@ def transform(self, X): ) i_rnd += 1 - if hasattr(Xt, "iloc"): - for feat_idx, feat_mask in enumerate(mask_missing_values.T): - Xt.iloc[~feat_mask, feat_idx] = X[~feat_mask, feat_idx] - else: - Xt[~mask_missing_values] = X[~mask_missing_values] + for feat_idx, feat_mask in enumerate(mask_missing_values.T): + _safe_assign( + Xt, + _safe_indexing( + _safe_indexing(X, feat_idx, axis=1), + ~feat_mask, + axis=0, + ), + row_indexer=~feat_mask, + column_indexer=feat_idx, + ) return super()._concatenate_indicator(Xt, X_indicator) def fit(self, X, y=None): diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index bed2556bf3194..bcc63eecac051 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -362,6 +362,36 @@ def _safe_indexing(X, indices, *, axis=0): return _list_indexing(X, indices, indices_dtype) +def _safe_assign(X, values, *, row_indexer=None, column_indexer=None): + """Safe assignment to a numpy array, sparse matrix, or pandas dataframe. + + Parameters + ---------- + X : {ndarray, sparse-matrix, dataframe} + Array to be modified. It is expected to be 2-dimensional. + + values : ndarray + The values to be assigned to `X`. + + row_indexer : array-like, dtype={int, bool}, default=None + A 1-dimensional array to select the rows of interest. If `None`, all + rows are selected. + + column_indexer : array-like, dtype={int, bool}, default=None + A 1-dimensional array to select the columns of interest. If `None`, all + columns are selected. + """ + row_indexer = slice(None, None, None) if row_indexer is None else row_indexer + column_indexer = ( + slice(None, None, None) if column_indexer is None else column_indexer + ) + + if hasattr(X, "iloc"): + X.iloc[row_indexer, column_indexer] = values + else: + X[row_indexer, column_indexer] = values + + def _get_column_indices(X, key): """Get feature column indices for input data X and key. From a845b757e49a2b59f7db36844d717f591d8ddf77 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 21 Nov 2022 12:05:14 +0100 Subject: [PATCH 08/13] iter --- sklearn/preprocessing/_encoders.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f61d9ac8f5c9c..51d83448f84fa 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,7 +10,7 @@ from scipy import sparse from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin -from ..utils import check_array, is_scalar_nan +from ..utils import check_array, is_scalar_nan, _safe_indexing from ..utils.validation import check_is_fitted from ..utils.validation import _check_feature_names_in from ..utils._param_validation import Interval, StrOptions, Hidden @@ -67,11 +67,7 @@ def _check_X(self, X, force_all_finite=True): return X_columns, n_samples, n_features def _get_feature(self, X, feature_idx): - if hasattr(X, "iloc"): - # pandas dataframes - return X.iloc[:, feature_idx] - # numpy arrays, sparse arrays - return X[:, feature_idx] + return _safe_indexing(X, feature_idx, axis=1) def _fit( self, X, handle_unknown="error", force_all_finite=True, return_counts=False From 65ae84a93bf55c2f2b4872e83ae9c73996c0f0b1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 21 Nov 2022 18:17:51 +0100 Subject: [PATCH 09/13] iter --- sklearn/impute/_iterative.py | 2 +- sklearn/utils/__init__.py | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 7a18c33a8daa8..048336ea551c3 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -730,7 +730,7 @@ def fit_transform(self, X, y=None): Xt, estimator = self._impute_one_feature( Xt, mask_missing_values, - int(feat_idx), + feat_idx, neighbor_feat_idx, estimator=None, fit_mode=True, diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index bcc63eecac051..d04945dd86f2a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -186,12 +186,7 @@ def _array_indexing(array, key, key_dtype, axis): def _pandas_indexing(X, key, key_dtype, axis): """Index a pandas dataframe or a series.""" - if hasattr(key, "shape"): - # Work-around for indexing with read-only key in pandas - # FIXME: solved in pandas 0.25 - key = np.asarray(key) - key = key if key.flags.writeable else key.copy() - elif isinstance(key, tuple): + if isinstance(key, tuple): key = list(key) if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)): From c71afea1da319de78f87633301c6b606ad6404db Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 21 Nov 2022 19:04:14 +0100 Subject: [PATCH 10/13] TST add test for _safe_assign --- sklearn/utils/tests/test_utils.py | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 4fc483e5a85b2..848985f267c92 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -23,6 +23,7 @@ from sklearn.utils import safe_mask from sklearn.utils import column_or_1d from sklearn.utils import _safe_indexing +from sklearn.utils import _safe_assign from sklearn.utils import shuffle from sklearn.utils import gen_even_slices from sklearn.utils import _message_with_time, _print_elapsed_time @@ -742,3 +743,37 @@ def test_to_object_array(sequence): assert isinstance(out, np.ndarray) assert out.dtype.kind == "O" assert out.ndim == 1 + + +@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"]) +def test_safe_assign(array_type): + """Check that `_safe_assign` works as expected.""" + rng = np.random.RandomState(0) + X_array = rng.randn(10, 5) + + row_indexer = [1, 2] + values = rng.randn(len(row_indexer), X_array.shape[1]) + X = _convert_container(X_array, array_type) + _safe_assign(X, values, row_indexer=row_indexer) + + assigned_portion = _safe_indexing(X, row_indexer, axis=0) + assert_allclose_dense_sparse( + assigned_portion, _convert_container(values, array_type) + ) + + column_indexer = [1, 2] + values = rng.randn(X_array.shape[0], len(column_indexer)) + X = _convert_container(X_array, array_type) + _safe_assign(X, values, column_indexer=column_indexer) + + assigned_portion = _safe_indexing(X, column_indexer, axis=1) + assert_allclose_dense_sparse( + assigned_portion, _convert_container(values, array_type) + ) + + row_indexer, column_indexer = None, None + values = rng.randn(*X.shape) + X = _convert_container(X_array, array_type) + _safe_assign(X, values, column_indexer=column_indexer) + + assert_allclose_dense_sparse(X, _convert_container(values, array_type)) From f6c0383b2bc952888b395ff4d741c5a0658cc0f0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 21 Nov 2022 19:12:04 +0100 Subject: [PATCH 11/13] iter --- sklearn/utils/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index d04945dd86f2a..8af238991286e 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -186,7 +186,14 @@ def _array_indexing(array, key, key_dtype, axis): def _pandas_indexing(X, key, key_dtype, axis): """Index a pandas dataframe or a series.""" - if isinstance(key, tuple): + if hasattr(key, "shape") and not np.isscalar(key): + # Surprisingly `np.int64` will have a `shape` attribute while being + # a scalar. + # Work-around for indexing with read-only key in pandas + # FIXME: solved in pandas 0.25 + key = np.asarray(key) + key = key if key.flags.writeable else key.copy() + elif isinstance(key, tuple): key = list(key) if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)): From a642ec97a67931b643d28e2d3d5b0ae17cef46f5 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 23 Nov 2022 14:32:12 +0100 Subject: [PATCH 12/13] address review comments + simplify --- sklearn/impute/_iterative.py | 46 ++++++++++++----------- sklearn/inspection/_partial_dependence.py | 6 +-- sklearn/preprocessing/_encoders.py | 5 +-- sklearn/utils/__init__.py | 14 ++----- sklearn/utils/estimator_checks.py | 33 +++++++++------- 5 files changed, 50 insertions(+), 54 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 048336ea551c3..1d918bc0c4643 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -31,6 +31,26 @@ ) +def _assign_where(X1, X2, cond): + """Assign X2 to X1 where cond is True. + + Parameters + ---------- + X1 : ndarray or dataframe of shape (n_samples, n_features) + Data. + + X2 : ndarray of shape (n_samples, n_features) + Data to be assigned. + + cond : ndarray of shape (n_samples, n_features) + Boolean mask to assign data. + """ + if hasattr(X1, "mask"): # pandas dataframes + X1.mask(cond=cond, other=X2, inplace=True) + else: # ndarrays + X1[cond] = X2[cond] + + class IterativeImputer(_BaseImputer): """Multivariate imputer that estimates each feature from all the others. @@ -766,17 +786,8 @@ def fit_transform(self, X, y=None): "[IterativeImputer] Early stopping criterion not reached.", ConvergenceWarning, ) - for feat_idx, feat_mask in enumerate(mask_missing_values.T): - _safe_assign( - Xt, - _safe_indexing( - _safe_indexing(X, feat_idx, axis=1), - ~feat_mask, - axis=0, - ), - row_indexer=~feat_mask, - column_indexer=feat_idx, - ) + _assign_where(Xt, X, cond=~mask_missing_values) + return super()._concatenate_indicator(Xt, X_indicator) def transform(self, X): @@ -829,17 +840,8 @@ def transform(self, X): ) i_rnd += 1 - for feat_idx, feat_mask in enumerate(mask_missing_values.T): - _safe_assign( - Xt, - _safe_indexing( - _safe_indexing(X, feat_idx, axis=1), - ~feat_mask, - axis=0, - ), - row_indexer=~feat_mask, - column_indexer=feat_idx, - ) + _assign_where(Xt, X, cond=~mask_missing_values) + return super()._concatenate_indicator(Xt, X_indicator) def fit(self, X, y=None): diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index ebb7a11e16835..9bc70a50ce197 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -16,6 +16,7 @@ from ..utils import check_array from ..utils import check_matplotlib_support # noqa from ..utils import _safe_indexing +from ..utils import _safe_assign from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted @@ -149,10 +150,7 @@ def _partial_dependence_brute(est, grid, features, X, response_method): X_eval = X.copy() for new_values in grid: for i, variable in enumerate(features): - if hasattr(X_eval, "iloc"): - X_eval.iloc[:, variable] = new_values[i] - else: - X_eval[:, variable] = new_values[i] + _safe_assign(X_eval, new_values[i], column_indexer=variable) try: # Note: predictions is of shape diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 51d83448f84fa..4d90d993e0dc7 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -58,7 +58,7 @@ def _check_X(self, X, force_all_finite=True): X_columns = [] for i in range(n_features): - Xi = self._get_feature(X, feature_idx=i) + Xi = _safe_indexing(X, indices=i, axis=1) Xi = check_array( Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation ) @@ -66,9 +66,6 @@ def _check_X(self, X, force_all_finite=True): return X_columns, n_samples, n_features - def _get_feature(self, X, feature_idx): - return _safe_indexing(X, feature_idx, axis=1) - def _fit( self, X, handle_unknown="error", force_all_finite=True, return_counts=False ): diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8af238991286e..97db16e73868f 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -35,6 +35,7 @@ indexable, check_symmetric, check_scalar, + _is_arraylike_not_scalar, ) from .. import get_config from ._bunch import Bunch @@ -186,15 +187,8 @@ def _array_indexing(array, key, key_dtype, axis): def _pandas_indexing(X, key, key_dtype, axis): """Index a pandas dataframe or a series.""" - if hasattr(key, "shape") and not np.isscalar(key): - # Surprisingly `np.int64` will have a `shape` attribute while being - # a scalar. - # Work-around for indexing with read-only key in pandas - # FIXME: solved in pandas 0.25 + if _is_arraylike_not_scalar(key): key = np.asarray(key) - key = key if key.flags.writeable else key.copy() - elif isinstance(key, tuple): - key = list(key) if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)): # using take() instead of iloc[] ensures the return value is a "proper" @@ -388,9 +382,9 @@ def _safe_assign(X, values, *, row_indexer=None, column_indexer=None): slice(None, None, None) if column_indexer is None else column_indexer ) - if hasattr(X, "iloc"): + if hasattr(X, "iloc"): # pandas dataframe X.iloc[row_indexer, column_indexer] = values - else: + else: # numpy array or sparse matrix X[row_indexer, column_indexer] = values diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 71de91ced5c9b..944a4f28e1c6b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4163,7 +4163,7 @@ def fit_transform(est): def _output_from_fit_transform(transformer, name, X, df, y): - """Generate output to set test `set_output` for different configuration: + """Generate output to test `set_output` for different configuration: - calling either `fit.transform` or `fit_transform`; - passing either a dataframe or a numpy array to fit; @@ -4173,28 +4173,33 @@ def _output_from_fit_transform(transformer, name, X, df, y): # fit then transform case: cases = [ - "fit.transform/fit_df/transform_df", - "fit.transform/fit_df/transform_array", - "fit.transform/fit_array/transform_df", - "fit.transform/fit_array/transform_array", + ("fit.transform/df/df", df, df), + ("fit.transform/df/array", df, X), + ("fit.transform/array/df", X, df), + ("fit.transform/array/array", X, X), ] - for case, data_fit, data_transform, target in zip( - cases, [df, df, X, X], [df, X, df, X], [y, y, y, y] - ): - transformer.fit(data_fit, target) + for ( + case, + data_fit, + data_transform, + ) in cases: + transformer.fit(data_fit, y) if name in CROSS_DECOMPOSITION: - X_trans = transformer.transform(data_transform, target)[0] + X_trans, _ = transformer.transform(data_transform, y) else: X_trans = transformer.transform(data_transform) outputs[case] = (X_trans, transformer.get_feature_names_out()) # fit_transform case: - cases = ["fit_transform/df", "fit_transform/array"] - for case, data, target in zip(cases, [df, X], [y, y]): + cases = [ + ("fit_transform/df", df), + ("fit_transform/array", X), + ] + for case, data in cases: if name in CROSS_DECOMPOSITION: - X_trans = transformer.fit_transform(data, target)[0] + X_trans, _ = transformer.fit_transform(data, y) else: - X_trans = transformer.fit_transform(data, target) + X_trans = transformer.fit_transform(data, y) outputs[case] = (X_trans, transformer.get_feature_names_out()) return outputs From eb044cd1f18de360e73059087f004eba9e979a39 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 23 Nov 2022 16:53:45 +0100 Subject: [PATCH 13/13] TST add test for _assign_where --- sklearn/impute/tests/test_base.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sklearn/impute/tests/test_base.py b/sklearn/impute/tests/test_base.py index 837575765f884..fedfdebb20a1f 100644 --- a/sklearn/impute/tests/test_base.py +++ b/sklearn/impute/tests/test_base.py @@ -2,8 +2,11 @@ import numpy as np -from sklearn.impute._base import _BaseImputer from sklearn.utils._mask import _get_mask +from sklearn.utils._testing import _convert_container, assert_allclose + +from sklearn.impute._base import _BaseImputer +from sklearn.impute._iterative import _assign_where @pytest.fixture @@ -87,3 +90,20 @@ def test_base_no_precomputed_mask_transform(data): imputer.transform(data) with pytest.raises(ValueError, match=err_msg): imputer.fit_transform(data) + + +@pytest.mark.parametrize("X1_type", ["array", "dataframe"]) +def test_assign_where(X1_type): + """Check the behaviour of the private helpers `_assign_where`.""" + rng = np.random.RandomState(0) + + n_samples, n_features = 10, 5 + X1 = _convert_container(rng.randn(n_samples, n_features), constructor_name=X1_type) + X2 = rng.randn(n_samples, n_features) + mask = rng.randint(0, 2, size=(n_samples, n_features)).astype(bool) + + _assign_where(X1, X2, mask) + + if X1_type == "dataframe": + X1 = X1.to_numpy() + assert_allclose(X1[mask], X2[mask])