MAINT test globally setting output via context manager by glemaitre · Pull Request #24932 · scikit-learn/scikit-learn · GitHub

MAINT test globally setting output via context manager #24932


Merged · 15 commits · Nov 24, 2022
59 changes: 52 additions & 7 deletions sklearn/impute/_iterative.py
@@ -9,7 +9,13 @@
 from ..base import clone
 from ..exceptions import ConvergenceWarning
 from ..preprocessing import normalize
-from ..utils import check_array, check_random_state, _safe_indexing, is_scalar_nan
+from ..utils import (
+    check_array,
+    check_random_state,
+    is_scalar_nan,
+    _safe_assign,
+    _safe_indexing,
+)
 from ..utils.validation import FLOAT_DTYPES, check_is_fitted
 from ..utils.validation import _check_feature_names_in
 from ..utils._mask import _get_mask
@@ -25,6 +31,26 @@
 )


+def _assign_where(X1, X2, cond):
+    """Assign X2 to X1 where cond is True.
+
+    Parameters
+    ----------
+    X1 : ndarray or dataframe of shape (n_samples, n_features)
+        Data.
+
+    X2 : ndarray of shape (n_samples, n_features)
+        Data to be assigned.
+
+    cond : ndarray of shape (n_samples, n_features)
+        Boolean mask to assign data.
+    """
+    if hasattr(X1, "mask"):  # pandas dataframes
+        X1.mask(cond=cond, other=X2, inplace=True)
+    else:  # ndarrays
+        X1[cond] = X2[cond]
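Note: `DataFrame.mask(cond, other)` replaces the entries where `cond` is True with the matching entries of `other`, so both branches implement the same semantics. A minimal sketch of the helper's behaviour (not part of the diff, assumes pandas is installed):

import numpy as np
import pandas as pd

X1 = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
X2 = np.array([[10.0, 30.0], [20.0, 40.0]])
cond = np.array([[True, False], [False, True]])

# equivalent to the ndarray branch `X1[cond] = X2[cond]`
X1.mask(cond=cond, other=X2, inplace=True)
print(X1)  # column "a" becomes [10.0, 2.0], column "b" becomes [3.0, 40.0]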


class IterativeImputer(_BaseImputer):
"""Multivariate imputer that estimates each feature from all the others.

@@ -362,16 +388,28 @@ def _impute_one_feature(

         missing_row_mask = mask_missing_values[:, feat_idx]
         if fit_mode:
-            X_train = _safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask)
-            y_train = _safe_indexing(X_filled[:, feat_idx], ~missing_row_mask)
+            X_train = _safe_indexing(
+                _safe_indexing(X_filled, neighbor_feat_idx, axis=1),
+                ~missing_row_mask,
+                axis=0,
+            )
+            y_train = _safe_indexing(
+                _safe_indexing(X_filled, feat_idx, axis=1),
+                ~missing_row_mask,
+                axis=0,
+            )
             estimator.fit(X_train, y_train)

         # if no missing values, don't predict
         if np.sum(missing_row_mask) == 0:
             return X_filled, estimator

         # get posterior samples if there is at least one missing value
-        X_test = _safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask)
+        X_test = _safe_indexing(
+            _safe_indexing(X_filled, neighbor_feat_idx, axis=1),
+            missing_row_mask,
+            axis=0,
+        )
         if self.sample_posterior:
             mus, sigmas = estimator.predict(X_test, return_std=True)
             imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
@@ -402,7 +440,12 @@ def _impute_one_feature(
             )

         # update the feature
-        X_filled[missing_row_mask, feat_idx] = imputed_values
+        _safe_assign(
+            X_filled,
+            imputed_values,
+            row_indexer=missing_row_mask,
+            column_indexer=feat_idx,
+        )
         return X_filled, estimator
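For context, the nested `_safe_indexing` calls slice columns first (`axis=1`) and then rows (`axis=0`), so the same code path now works for ndarrays and dataframes alike. A rough standalone illustration with toy data, variable names mirroring the hunk above (`_safe_indexing` is a private scikit-learn helper):

import numpy as np
from sklearn.utils import _safe_indexing

X_filled = np.arange(12, dtype=float).reshape(4, 3)
neighbor_feat_idx = [0, 2]
missing_row_mask = np.array([False, True, False, True])

# columns first, then rows, as in the fit_mode branch
X_train = _safe_indexing(
    _safe_indexing(X_filled, neighbor_feat_idx, axis=1),
    ~missing_row_mask,
    axis=0,
)
assert X_train.shape == (2, 2)  # 2 non-missing rows x 2 neighbor features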

def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat):
@@ -743,7 +786,8 @@ def fit_transform(self, X, y=None):
"[IterativeImputer] Early stopping criterion not reached.",
ConvergenceWarning,
)
Xt[~mask_missing_values] = X[~mask_missing_values]
_assign_where(Xt, X, cond=~mask_missing_values)

return super()._concatenate_indicator(Xt, X_indicator)

def transform(self, X):
@@ -796,7 +840,8 @@ def transform(self, X):
                 )
             i_rnd += 1

-        Xt[~mask_missing_values] = X[~mask_missing_values]
+        _assign_where(Xt, X, cond=~mask_missing_values)
+
         return super()._concatenate_indicator(Xt, X_indicator)

def fit(self, X, y=None):
22 changes: 21 additions & 1 deletion sklearn/impute/tests/test_base.py
@@ -2,8 +2,11 @@

 import numpy as np

-from sklearn.impute._base import _BaseImputer
 from sklearn.utils._mask import _get_mask
+from sklearn.utils._testing import _convert_container, assert_allclose
+
+from sklearn.impute._base import _BaseImputer
+from sklearn.impute._iterative import _assign_where


@pytest.fixture
@@ -87,3 +90,20 @@ def test_base_no_precomputed_mask_transform(data):
     imputer.transform(data)
     with pytest.raises(ValueError, match=err_msg):
         imputer.fit_transform(data)
+
+
+@pytest.mark.parametrize("X1_type", ["array", "dataframe"])
+def test_assign_where(X1_type):
+    """Check the behaviour of the private helper `_assign_where`."""
+    rng = np.random.RandomState(0)
+
+    n_samples, n_features = 10, 5
+    X1 = _convert_container(rng.randn(n_samples, n_features), constructor_name=X1_type)
+    X2 = rng.randn(n_samples, n_features)
+    mask = rng.randint(0, 2, size=(n_samples, n_features)).astype(bool)
+
+    _assign_where(X1, X2, mask)
+
+    if X1_type == "dataframe":
+        X1 = X1.to_numpy()
+    assert_allclose(X1[mask], X2[mask])
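The test builds either container from the same data via the `_convert_container` testing utility; roughly (a sketch, not from the diff):

import numpy as np
from sklearn.utils._testing import _convert_container

data = np.arange(6, dtype=float).reshape(3, 2)
as_array = _convert_container(data, constructor_name="array")      # plain ndarray
as_frame = _convert_container(data, constructor_name="dataframe")  # pandas DataFrame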
6 changes: 2 additions & 4 deletions sklearn/inspection/_partial_dependence.py
@@ -16,6 +16,7 @@
 from ..utils import check_array
 from ..utils import check_matplotlib_support  # noqa
 from ..utils import _safe_indexing
+from ..utils import _safe_assign
 from ..utils import _determine_key_type
 from ..utils import _get_column_indices
 from ..utils.validation import check_is_fitted
@@ -149,10 +150,7 @@ def _partial_dependence_brute(est, grid, features, X, response_method):
     X_eval = X.copy()
     for new_values in grid:
         for i, variable in enumerate(features):
-            if hasattr(X_eval, "iloc"):
-                X_eval.iloc[:, variable] = new_values[i]
-            else:
-                X_eval[:, variable] = new_values[i]
+            _safe_assign(X_eval, new_values[i], column_indexer=variable)

         try:
             # Note: predictions is of shape
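Here `_safe_assign` with only `column_indexer` overwrites a whole column, folding the previous `iloc`-vs-ndarray branching into one call. A hedged sketch of the equivalent behaviour on both container types (toy data, illustrative names):

import numpy as np
import pandas as pd
from sklearn.utils import _safe_assign

X_np = np.zeros((3, 2))
X_df = pd.DataFrame(np.zeros((3, 2)), columns=["f0", "f1"])

# both containers end up with column 0 set to 1.0
_safe_assign(X_np, 1.0, column_indexer=0)
_safe_assign(X_df, 1.0, column_indexer=0)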
11 changes: 2 additions & 9 deletions sklearn/preprocessing/_encoders.py
@@ -10,7 +10,7 @@
 from scipy import sparse

 from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
-from ..utils import check_array, is_scalar_nan
+from ..utils import check_array, is_scalar_nan, _safe_indexing
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _check_feature_names_in
 from ..utils._param_validation import Interval, StrOptions, Hidden
@@ -58,21 +58,14 @@ def _check_X(self, X, force_all_finite=True):
         X_columns = []

         for i in range(n_features):
-            Xi = self._get_feature(X, feature_idx=i)
+            Xi = _safe_indexing(X, indices=i, axis=1)
             Xi = check_array(
                 Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation
             )
             X_columns.append(Xi)

         return X_columns, n_samples, n_features

-    def _get_feature(self, X, feature_idx):
-        if hasattr(X, "iloc"):
-            # pandas dataframes
-            return X.iloc[:, feature_idx]
-        # numpy arrays, sparse arrays
-        return X[:, feature_idx]
-
     def _fit(
         self, X, handle_unknown="error", force_all_finite=True, return_counts=False
     ):
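With a scalar index and `axis=1`, `_safe_indexing` returns a single column (a Series for dataframes, a 1-D ndarray otherwise), which is exactly what the removed `_get_feature` hand-rolled. For illustration (made-up data):

import pandas as pd
from sklearn.utils import _safe_indexing

X = pd.DataFrame({"color": ["red", "blue"], "size": ["S", "M"]})
col_df = _safe_indexing(X, 0, axis=1)             # pandas Series "color"
col_np = _safe_indexing(X.to_numpy(), 0, axis=1)  # 1-D ndarray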
16 changes: 16 additions & 0 deletions sklearn/tests/test_common.py
@@ -79,6 +79,7 @@
     check_transformer_get_feature_names_out_pandas,
     check_set_output_transform,
     check_set_output_transform_pandas,
+    check_global_ouptut_transform_pandas,
 )


@@ -544,3 +545,18 @@ def test_set_output_transform_pandas(estimator):
     _set_checking_parameters(estimator)
     with ignore_warnings(category=(FutureWarning)):
         check_set_output_transform_pandas(estimator.__class__.__name__, estimator)
+
+
+@pytest.mark.parametrize(
+    "estimator", SET_OUTPUT_ESTIMATORS, ids=_get_check_estimator_ids
+)
+def test_global_output_transform_pandas(estimator):
+    name = estimator.__class__.__name__
+    if not hasattr(estimator, "set_output"):
+        pytest.skip(
+            f"Skipping check_global_ouptut_transform_pandas for {name}: Does not"
+            " support set_output API yet"
+        )
+    _set_checking_parameters(estimator)
+    with ignore_warnings(category=(FutureWarning)):
+        check_global_ouptut_transform_pandas(estimator.__class__.__name__, estimator)
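This new common check exercises the global counterpart of the per-estimator `set_output` API: configuring pandas output for all transformers at once via the context manager. In user code (public API as of scikit-learn 1.2) this looks roughly like:

import sklearn
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

X, _ = load_iris(return_X_y=True, as_frame=True)

# inside the context, transform() returns a pandas DataFrame
with sklearn.config_context(transform_output="pandas"):
    X_scaled = StandardScaler().fit_transform(X)

print(type(X_scaled))  # <class 'pandas.core.frame.DataFrame'>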
38 changes: 32 additions & 6 deletions sklearn/utils/__init__.py
@@ -35,6 +35,7 @@
     indexable,
     check_symmetric,
     check_scalar,
+    _is_arraylike_not_scalar,
 )
from .. import get_config
from ._bunch import Bunch
@@ -186,13 +187,8 @@ def _array_indexing(array, key, key_dtype, axis):

 def _pandas_indexing(X, key, key_dtype, axis):
     """Index a pandas dataframe or a series."""
-    if hasattr(key, "shape"):
-        # Work-around for indexing with read-only key in pandas
-        # FIXME: solved in pandas 0.25
+    if _is_arraylike_not_scalar(key):
         key = np.asarray(key)
-        key = key if key.flags.writeable else key.copy()
-    elif isinstance(key, tuple):
-        key = list(key)

     if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
         # using take() instead of iloc[] ensures the return value is a "proper"
@@ -362,6 +358,36 @@ def _safe_indexing(X, indices, *, axis=0):
         return _list_indexing(X, indices, indices_dtype)


+def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
+    """Safe assignment to a numpy array, sparse matrix, or pandas dataframe.
+
+    Parameters
+    ----------
+    X : {ndarray, sparse-matrix, dataframe}
+        Array to be modified. It is expected to be 2-dimensional.
+
+    values : ndarray
+        The values to be assigned to `X`.
+
+    row_indexer : array-like, dtype={int, bool}, default=None
+        A 1-dimensional array to select the rows of interest. If `None`, all
+        rows are selected.
+
+    column_indexer : array-like, dtype={int, bool}, default=None
+        A 1-dimensional array to select the columns of interest. If `None`,
+        all columns are selected.
+    """
+    row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
+    column_indexer = (
+        slice(None, None, None) if column_indexer is None else column_indexer
+    )
+
+    if hasattr(X, "iloc"):  # pandas dataframe
+        X.iloc[row_indexer, column_indexer] = values
+    else:  # numpy array or sparse matrix
+        X[row_indexer, column_indexer] = values
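A small sketch of the new helper on both container types, using a boolean row mask and an integer column (values and names are illustrative, not from the diff):

import numpy as np
import pandas as pd
from sklearn.utils import _safe_assign

rows = np.array([True, False, True])
X_np = np.zeros((3, 2))
X_df = pd.DataFrame(np.zeros((3, 2)))

_safe_assign(X_np, 5.0, row_indexer=rows, column_indexer=1)
_safe_assign(X_df, 5.0, row_indexer=rows, column_indexer=1)
# both now hold 5.0 at rows 0 and 2 of column 1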


def _get_column_indices(X, key):
"""Get feature column indices for input data X and key.

Expand Down