MAINT test globally setting output via context manager (#24932) · scikit-learn/scikit-learn@af16e59 · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit af16e59

Browse files
MAINT test globally setting output via context manager (#24932)
Co-authored-by: jeremie du boisberranger <jeremiedbb@yahoo.fr>
1 parent f99b8b1 commit af16e59

File tree

8 files changed

+276
-53
lines changed

8 files changed

+276
-53
lines changed

sklearn/impute/_iterative.py

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,13 @@
99
from ..base import clone
1010
from ..exceptions import ConvergenceWarning
1111
from ..preprocessing import normalize
12-
from ..utils import check_array, check_random_state, _safe_indexing, is_scalar_nan
12+
from ..utils import (
13+
check_array,
14+
check_random_state,
15+
is_scalar_nan,
16+
_safe_assign,
17+
_safe_indexing,
18+
)
1319
from ..utils.validation import FLOAT_DTYPES, check_is_fitted
1420
from ..utils.validation import _check_feature_names_in
1521
from ..utils._mask import _get_mask
@@ -25,6 +31,26 @@
2531
)
2632

2733

34+
def _assign_where(X1, X2, cond):
35+
"""Assign X2 to X1 where cond is True.
36+
37+
Parameters
38+
----------
39+
X1 : ndarray or dataframe of shape (n_samples, n_features)
40+
Data.
41+
42+
X2 : ndarray of shape (n_samples, n_features)
43+
Data to be assigned.
44+
45+
cond : ndarray of shape (n_samples, n_features)
46+
Boolean mask to assign data.
47+
"""
48+
if hasattr(X1, "mask"): # pandas dataframes
49+
X1.mask(cond=cond, other=X2, inplace=True)
50+
else: # ndarrays
51+
X1[cond] = X2[cond]
52+
53+
2854
class IterativeImputer(_BaseImputer):
2955
"""Multivariate imputer that estimates each feature from all the others.
3056
@@ -362,16 +388,28 @@ def _impute_one_feature(
362388

363389
missing_row_mask = mask_missing_values[:, feat_idx]
364390
if fit_mode:
365-
X_train = _safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask)
366-
y_train = _safe_indexing(X_filled[:, feat_idx], ~missing_row_mask)
391+
X_train = _safe_indexing(
392+
_safe_indexing(X_filled, neighbor_feat_idx, axis=1),
393+
~missing_row_mask,
394+
axis=0,
395+
)
396+
y_train = _safe_indexing(
397+
_safe_indexing(X_filled, feat_idx, axis=1),
398+
~missing_row_mask,
399+
axis=0,
400+
)
367401
estimator.fit(X_train, y_train)
368402

369403
# if no missing values, don't predict
370404
if np.sum(missing_row_mask) == 0:
371405
return X_filled, estimator
372406

373407
# get posterior samples if there is at least one missing value
374-
X_test = _safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask)
408+
X_test = _safe_indexing(
409+
_safe_indexing(X_filled, neighbor_feat_idx, axis=1),
410+
missing_row_mask,
411+
axis=0,
412+
)
375413
if self.sample_posterior:
376414
mus, sigmas = estimator.predict(X_test, return_std=True)
377415
imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
@@ -402,7 +440,12 @@ def _impute_one_feature(
402440
)
403441

404442
# update the feature
405-
X_filled[missing_row_mask, feat_idx] = imputed_values
443+
_safe_assign(
444+
X_filled,
445+
imputed_values,
446+
row_indexer=missing_row_mask,
447+
column_indexer=feat_idx,
448+
)
406449
return X_filled, estimator
407450

408451
def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat):
@@ -743,7 +786,8 @@ def fit_transform(self, X, y=None):
743786
"[IterativeImputer] Early stopping criterion not reached.",
744787
ConvergenceWarning,
745788
)
746-
Xt[~mask_missing_values] = X[~mask_missing_values]
789+
_assign_where(Xt, X, cond=~mask_missing_values)
790+
747791
return super()._concatenate_indicator(Xt, X_indicator)
748792

749793
def transform(self, X):
@@ -796,7 +840,8 @@ def transform(self, X):
796840
)
797841
i_rnd += 1
798842

799-
Xt[~mask_missing_values] = X[~mask_missing_values]
843+
_assign_where(Xt, X, cond=~mask_missing_values)
844+
800845
return super()._concatenate_indicator(Xt, X_indicator)
801846

802847
def fit(self, X, y=None):

sklearn/impute/tests/test_base.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22

33
import numpy as np
44

5-
from sklearn.impute._base import _BaseImputer
65
from sklearn.utils._mask import _get_mask
6+
from sklearn.utils._testing import _convert_container, assert_allclose
7+
8+
from sklearn.impute._base import _BaseImputer
9+
from sklearn.impute._iterative import _assign_where
710

811

912
@pytest.fixture
@@ -87,3 +90,20 @@ def test_base_no_precomputed_mask_transform(data):
8790
imputer.transform(data)
8891
with pytest.raises(ValueError, match=err_msg):
8992
imputer.fit_transform(data)
93+
94+
95+
@pytest.mark.parametrize("X1_type", ["array", "dataframe"])
def test_assign_where(X1_type):
    """Check the behaviour of the private helper `_assign_where`.

    Entries selected by the mask must take the values of `X2`, and entries
    that are not selected must keep their initial values.
    """
    rng = np.random.RandomState(0)

    n_samples, n_features = 10, 5
    X1_init = rng.randn(n_samples, n_features)
    # Pass a copy to the container so that `X1_init` keeps the values from
    # before the in-place assignment, in case the container shares memory.
    X1 = _convert_container(X1_init.copy(), constructor_name=X1_type)
    X2 = rng.randn(n_samples, n_features)
    mask = rng.randint(0, 2, size=(n_samples, n_features)).astype(bool)

    _assign_where(X1, X2, mask)

    if X1_type == "dataframe":
        X1 = X1.to_numpy()
    assert_allclose(X1[mask], X2[mask])
    # Without this check, an implementation overwriting every entry of X1
    # with X2 would still pass.
    assert_allclose(X1[~mask], X1_init[~mask])

sklearn/inspection/_partial_dependence.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from ..utils import check_array
1717
from ..utils import check_matplotlib_support # noqa
1818
from ..utils import _safe_indexing
19+
from ..utils import _safe_assign
1920
from ..utils import _determine_key_type
2021
from ..utils import _get_column_indices
2122
from ..utils.validation import check_is_fitted
@@ -149,10 +150,7 @@ def _partial_dependence_brute(est, grid, features, X, response_method):
149150
X_eval = X.copy()
150151
for new_values in grid:
151152
for i, variable in enumerate(features):
152-
if hasattr(X_eval, "iloc"):
153-
X_eval.iloc[:, variable] = new_values[i]
154-
else:
155-
X_eval[:, variable] = new_values[i]
153+
_safe_assign(X_eval, new_values[i], column_indexer=variable)
156154

157155
try:
158156
# Note: predictions is of shape

sklearn/preprocessing/_encoders.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from scipy import sparse
1111

1212
from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
13-
from ..utils import check_array, is_scalar_nan
13+
from ..utils import check_array, is_scalar_nan, _safe_indexing
1414
from ..utils.validation import check_is_fitted
1515
from ..utils.validation import _check_feature_names_in
1616
from ..utils._param_validation import Interval, StrOptions, Hidden
@@ -58,21 +58,14 @@ def _check_X(self, X, force_all_finite=True):
5858
X_columns = []
5959

6060
for i in range(n_features):
61-
Xi = self._get_feature(X, feature_idx=i)
61+
Xi = _safe_indexing(X, indices=i, axis=1)
6262
Xi = check_array(
6363
Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation
6464
)
6565
X_columns.append(Xi)
6666

6767
return X_columns, n_samples, n_features
6868

69-
def _get_feature(self, X, feature_idx):
70-
if hasattr(X, "iloc"):
71-
# pandas dataframes
72-
return X.iloc[:, feature_idx]
73-
# numpy arrays, sparse arrays
74-
return X[:, feature_idx]
75-
7669
def _fit(
7770
self, X, handle_unknown="error", force_all_finite=True, return_counts=False
7871
):

sklearn/tests/test_common.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
check_transformer_get_feature_names_out_pandas,
8080
check_set_output_transform,
8181
check_set_output_transform_pandas,
82+
check_global_ouptut_transform_pandas,
8283
)
8384

8485

@@ -544,3 +545,18 @@ def test_set_output_transform_pandas(estimator):
544545
_set_checking_parameters(estimator)
545546
with ignore_warnings(category=(FutureWarning)):
546547
check_set_output_transform_pandas(estimator.__class__.__name__, estimator)
548+
549+
550+
@pytest.mark.parametrize(
    "estimator", SET_OUTPUT_ESTIMATORS, ids=_get_check_estimator_ids
)
def test_global_output_transform_pandas(estimator):
    """Run `check_global_ouptut_transform_pandas` on each set_output estimator.

    Estimators that do not expose a `set_output` method are skipped.
    """
    name = estimator.__class__.__name__
    supports_set_output = hasattr(estimator, "set_output")
    if not supports_set_output:
        pytest.skip(
            f"Skipping check_global_ouptut_transform_pandas for {name}: Does not"
            " support set_output API yet"
        )

    _set_checking_parameters(estimator)
    # FutureWarning is silenced while the check runs.
    with ignore_warnings(category=(FutureWarning)):
        check_global_ouptut_transform_pandas(name, estimator)

sklearn/utils/__init__.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
indexable,
3636
check_symmetric,
3737
check_scalar,
38+
_is_arraylike_not_scalar,
3839
)
3940
from .. import get_config
4041
from ._bunch import Bunch
@@ -186,13 +187,8 @@ def _array_indexing(array, key, key_dtype, axis):
186187

187188
def _pandas_indexing(X, key, key_dtype, axis):
188189
"""Index a pandas dataframe or a series."""
189-
if hasattr(key, "shape"):
190-
# Work-around for indexing with read-only key in pandas
191-
# FIXME: solved in pandas 0.25
190+
if _is_arraylike_not_scalar(key):
192191
key = np.asarray(key)
193-
key = key if key.flags.writeable else key.copy()
194-
elif isinstance(key, tuple):
195-
key = list(key)
196192

197193
if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
198194
# using take() instead of iloc[] ensures the return value is a "proper"
@@ -362,6 +358,36 @@ def _safe_indexing(X, indices, *, axis=0):
362358
return _list_indexing(X, indices, indices_dtype)
363359

364360

361+
def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
362+
"""Safe assignment to a numpy array, sparse matrix, or pandas dataframe.
363+
364+
Parameters
365+
----------
366+
X : {ndarray, sparse-matrix, dataframe}
367+
Array to be modified. It is expected to be 2-dimensional.
368+
369+
values : ndarray
370+
The values to be assigned to `X`.
371+
372+
row_indexer : array-like, dtype={int, bool}, default=None
373+
A 1-dimensional array to select the rows of interest. If `None`, all
374+
rows are selected.
375+
376+
column_indexer : array-like, dtype={int, bool}, default=None
377+
A 1-dimensional array to select the columns of interest. If `None`, all
378+
columns are selected.
379+
"""
380+
row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
381+
column_indexer = (
382+
slice(None, None, None) if column_indexer is None else column_indexer
383+
)
384+
385+
if hasattr(X, "iloc"): # pandas dataframe
386+
X.iloc[row_indexer, column_indexer] = values
387+
else: # numpy array or sparse matrix
388+
X[row_indexer, column_indexer] = values
389+
390+
365391
def _get_column_indices(X, key):
366392
"""Get feature column indices for input data X and key.
367393

0 commit comments

Comments
 (0)
0