Merge pull request #1 from SkuaD01/d01-19352-imputer · SkuaD01/scikit-learn@51f6e19 · GitHub

Commit 51f6e19

Merge pull request #1 from SkuaD01/d01-19352-imputer
D01 19352 imputer
2 parents 22d9d97 + 4be3bf2 commit 51f6e19

File tree

2 files changed: +36 -2 lines changed

sklearn/impute/_iterative.py
sklearn/impute/tests/test_impute.py


sklearn/impute/_iterative.py

Lines changed: 6 additions & 0 deletions

@@ -344,6 +344,12 @@ def _impute_one_feature(self,
                                  self._min_value[feat_idx],
                                  self._max_value[feat_idx])

+        # If estimator prediction returns a multi-dimensional numpy array we
+        # need to re-cast it to the wanted shape
+        shape_imputed_values = imputed_values.shape
+        if len(shape_imputed_values) > 1:
+            imputed_values = imputed_values.reshape(shape_imputed_values[0])
+
         # update the feature
         X_filled[missing_row_mask, feat_idx] = imputed_values
         return X_filled, estimator
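
For context (not part of the commit): a minimal numpy-only sketch of the failure the new block avoids. As the comment in the diff notes, PLSRegression.predict returned a 2-D array of shape (n_samples, 1) here, and assigning such an array into the 1-D column slice X_filled[missing_row_mask, feat_idx] raises a broadcast error; the added re-cast flattens it first. Variable names mirror the patch; the toy data is illustrative only.

import numpy as np

# Stand-in for the working matrix and the column currently being imputed
X_filled = np.zeros((4, 3))
missing_row_mask = np.array([True, False, True, False])
feat_idx = 0

# A 2-D prediction, as PLSRegression produced at the time of this change
imputed_values = np.array([[1.5], [2.5]])

try:
    # This is what failed before the patch: (2, 1) cannot broadcast into (2,)
    X_filled[missing_row_mask, feat_idx] = imputed_values
except ValueError as exc:
    print("without the reshape:", exc)

# The re-cast added to _impute_one_feature makes the assignment succeed
shape_imputed_values = imputed_values.shape
if len(shape_imputed_values) > 1:
    imputed_values = imputed_values.reshape(shape_imputed_values[0])
X_filled[missing_row_mask, feat_idx] = imputed_values
print(X_filled[:, feat_idx])  # [1.5 0.  2.5 0. ]

The reshape is applied only when the prediction has more than one dimension, so estimators that already return 1-D arrays are left untouched.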

sklearn/impute/tests/test_impute.py

Lines changed: 30 additions & 2 deletions

@@ -8,6 +8,7 @@

 import io

+from sklearn.cross_decomposition import PLSRegression
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_allclose_dense_sparse
 from sklearn.utils._testing import assert_array_equal
@@ -16,7 +17,7 @@
 # make IterativeImputer available
 from sklearn.experimental import enable_iterative_imputer  # noqa

-from sklearn.datasets import load_diabetes
+from sklearn.datasets import load_diabetes, fetch_california_housing
 from sklearn.impute import MissingIndicator
 from sklearn.impute import SimpleImputer, IterativeImputer
 from sklearn.dummy import DummyRegressor
@@ -633,7 +634,7 @@ def test_iterative_imputer_imputation_order(imputation_order):

 @pytest.mark.parametrize(
     "estimator",
-    [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()]
+    [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV(), PLSRegression()]
 )
 def test_iterative_imputer_estimators(estimator):
     rng = np.random.RandomState(0)
@@ -660,6 +661,33 @@ def test_iterative_imputer_estimators(estimator):
     assert len(set(hashes)) == len(hashes)


+def test_iterative_imputer_multiple_components():
+
+    rng = np.random.RandomState(42)
+
+    # get sample data from california housing dataset
+    X_california, y_california = fetch_california_housing(return_X_y=True)
+    n_samples, n_features = X_california.shape
+
+    # Add missing values in 75% of the lines
+    missing_rate = 0.75
+    n_missing_samples = int(n_samples * missing_rate)
+    missing_samples = np.zeros(n_samples, dtype=bool)
+    missing_samples[: n_missing_samples] = True
+
+    rng.shuffle(missing_samples)
+    missing_features = rng.randint(0, n_features, n_missing_samples)
+    X_missing = X_california.copy()
+    X_missing[missing_samples, missing_features] = np.nan
+
+    # PLSRegression returns multi-dimensional numpy array as
+    # opposed to other estimators which return 1-D array,
+    # but this should not cause issues in the imputer
+    imputer = IterativeImputer(estimator=PLSRegression(n_components=2))
+    X_imputed = imputer.fit_transform(X_missing)
+    assert X_imputed.shape == X_california.shape
+
+
 def test_iterative_imputer_clip():
     rng = np.random.RandomState(0)
     n = 100
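
Not part of the commit: the new test downloads the full California housing dataset (roughly 20,000 samples) via fetch_california_housing, so a quicker, self-contained smoke check of the same code path might look like the sketch below, using a small random matrix instead. With this change applied, fit_transform should complete without a broadcast error; the data and sizes here are illustrative assumptions.

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Small random matrix with roughly 20% of entries set to NaN (illustrative only)
rng = np.random.RandomState(42)
X = rng.rand(100, 5)
X[rng.rand(100, 5) < 0.2] = np.nan

# Same idea as test_iterative_imputer_multiple_components: PLSRegression's
# multi-dimensional predictions should not break the imputer
imputer = IterativeImputer(estimator=PLSRegression(n_components=2))
X_imputed = imputer.fit_transform(X)

assert X_imputed.shape == X.shape
assert not np.isnan(X_imputed).any()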
