8000 EHN make some meta-estimators lenient towards missing values (#17987) · scikit-learn/scikit-learn@8ba49f6 · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 8ba49f6

Browse files
venkyyuvyglemaitre
andauthored
EHN make some meta-estimators lenient towards missing values (#17987)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent e126a85 commit 8ba49f6

File tree

7 files changed

+114
-12
lines changed

7 files changed

+114
-12
lines changed

doc/whats_new/v0.24.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,12 @@ Changelog
361361
validity of the input is now delegated to the base estimator.
362362
:pr:`17233` by :user:`Zolisa Bleki <zoj613>`.
363363

364+
- |Enhancement| :class:`multiclass.OneVsOneClassifier` now accepts
365+
inputs with missing values. Hence, estimators which can handle
366+
missing values (e.g. a pipeline with an imputation step) can be used as
367+
an estimator for multiclass wrappers.
368+
:pr:`17987` by :user:`Venkatachalam N <venkyyuvy>`.
369+
364370
:mod:`sklearn.multioutput`
365371
..........................
366372

@@ -369,6 +375,13 @@ Changelog
369375
:pr:`18124` by :user:`Gus Brocchini <boldloop>` and
370376
:user:`Amanda Dsouza <amy12xx>`.
371377

378+
- |Enhancement| :class:`multioutput.MultiOutputClassifier` and
379+
:class:`multioutput.MultiOutputRegressor` now accept inputs
380+
with missing values. Hence, estimators which can handle missing
381+
values (e.g. a pipeline with an imputation step, or HistGradientBoosting
382+
estimators) can be used as an estimator for multioutput wrappers.
383+
:pr:`17987` by :user:`Venkatachalam N <venkyyuvy>`.
384+
372385
:mod:`sklearn.naive_bayes`
373386
..........................
374387

sklearn/ensemble/tests/test_common.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import numpy as np
12
import pytest
23

34
from sklearn.base import clone
@@ -6,14 +7,20 @@
67

78
from sklearn.datasets import make_classification
89
from sklearn.datasets import make_regression
9-
10+
from sklearn.datasets import load_iris, load_diabetes
11+
from sklearn.impute import SimpleImputer
1012
from sklearn.linear_model import LogisticRegression, LinearRegression
1113
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
14+
from sklearn.pipeline import make_pipeline
1215
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
1316

1417
from sklearn.ensemble import StackingClassifier, StackingRegressor
1518
from sklearn.ensemble import VotingClassifier, VotingRegressor
1619

20+
X, y = load_iris(return_X_y=True)
21+
22+
X_r, y_r = load_diabetes(return_X_y=True)
23+
1724

1825
@pytest.mark.parametrize(
1926
"X, y, estimator",
@@ -170,3 +177,28 @@ def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
170177
estimator.set_params(lr='drop')
171178
with pytest.raises(ValueError, match="All estimators are dropped."):
172179
estimator.fit(X, y)
180+
181+
182+
@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [(StackingClassifier, LogisticRegression, X, y),
     (StackingRegressor, LinearRegression, X_r, y_r),
     (VotingClassifier, LogisticRegression, X, y),
     (VotingRegressor, LinearRegression, X_r, y_r)]
)
# FIXME: we should move this test into `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble,
                                                       Estimator, X, y):
    # Smoke test: check that Voting and Stacking predictors delegate the
    # missing values validation to the underlying estimators. The test
    # passes as long as `fit` and `score` do not raise.

    # Work on a copy: the arrays received through the parametrization are
    # the module-level datasets and must not be modified in place.
    X = X.copy()
    # Use a seeded generator (rather than the global `np.random` state) so
    # that the positions of the injected NaNs are reproducible.
    rng = np.random.RandomState(42)
    mask = rng.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    # The imputer inside each pipeline is the only component handling NaNs.
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[('pipe1', pipe), ('pipe2', pipe)])
    ensemble.fit(X, y).score(X, y)

sklearn/multiclass.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def partial_fit(self, X, y, classes=None):
294294
if _check_partial_fit_first_call(self, classes):
295295
if not hasattr(self.estimator, "partial_fit"):
296296
raise ValueError(("Base estimator {0}, doesn't have "
297-
"partial_fit method").format(self.estimator))
297+
"partial_fit method").format(self.estimator))
298298
self.estimators_ = [clone(self.estimator) for _ in range
299299
(self.n_classes_)]
300300

@@ -307,8 +307,8 @@ def partial_fit(self, X, y, classes=None):
307307

308308
if len(np.setdiff1d(y, self.classes_)):
309309
raise ValueError(("Mini-batch contains {0} while classes " +
310-
"must be subset of {1}").format(np.unique(y),
311-
self.classes_))
310+
"must be subset of {1}").format(np.unique(y),
311+
self.classes_))
312312

313313
Y = self.label_binarizer_.transform(y)
314314
Y = Y.tocsc()
@@ -578,7 +578,8 @@ def fit(self, X, y):
578578
-------
579579
self
580580
"""
581-
X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
581+
X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'],
582+
force_all_finite=False)
582583
check_classification_targets(y)
583584

584585
self.classes_ = np.unique(y)
@@ -635,7 +636,8 @@ def partial_fit(self, X, y, classes=None):
635636
"must be subset of {1}".format(np.unique(y),
636637
self.classes_))
637638

638-
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
639+
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'],
640+
force_all_finite=False)
639641
check_classification_targets(y)
640642
combinations = itertools.combinations(range(self.n_classes_), 2)
641643
self.estimators_ = Parallel(

sklearn/multioutput.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
101101
self : object
102102
"""
103103
X, y = check_X_y(X, y,
104+
force_all_finite=False,
104105
multi_output=True,
105106
accept_sparse=True)
106107

@@ -153,7 +154,9 @@ def fit(self, X, y, sample_weight=None, **fit_params):
153154
raise ValueError("The base estimator should implement"
154155
" a fit method")
155156

156-
X, y = self._validate_data(X, y, multi_output=True, accept_sparse=True)
157+
X, y = self._validate_data(X, y,
158+
force_all_finite=False,
159+
multi_output=True, accept_sparse=True)
157160

158161
if is_classifier(self):
159162
check_classification_targets(y)
@@ -196,7 +199,7 @@ def predict(self, X):
196199
raise ValueError("The base estimator should implement"
197200
" a predict method")
198201

199-
X = check_array(X, accept_sparse=True)
202+
X = check_array(X, force_all_finite=False, accept_sparse=True)
200203

201204
y = Parallel(n_jobs=self.n_jobs)(
202205
delayed(e.predict)(X)

sklearn/tests/test_multiclass.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
SGDClassifier)
3030
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
3131
from sklearn.model_selection import GridSearchCV, cross_val_score
32-
from sklearn.pipeline import Pipeline
32+
from sklearn.pipeline import Pipeline, make_pipeline
33+
from sklearn.impute import SimpleImputer
3334
from sklearn import svm
3435
from sklearn import datasets
3536

@@ -776,3 +777,21 @@ def test_pairwise_cross_val_score():
776777
score_precomputed = cross_val_score(ovr_true, linear_kernel, y)
777778
score_linear = cross_val_score(ovr_false, X, y)
778779
assert_array_equal(score_precomputed, score_linear)
780+
781+
782+
@pytest.mark.parametrize("MultiClassClassifier",
                         [OneVsRestClassifier, OneVsOneClassifier])
# FIXME: we should move this test into `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiClassClassifier):
    # Smoke test to check that the OvR and OvO classifiers leave the
    # validation of missing values to the underlying pipeline or
    # classifier. The test passes as long as `fit` and `score` do not raise.
    rng = np.random.RandomState(42)
    # Copy the features: `iris` is a module-level object, so writing NaNs
    # into `iris.data` directly would leak into every other test using it.
    X, y = np.copy(iris.data), iris.target
    mask = rng.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    lr = make_pipeline(SimpleImputer(),
                       LogisticRegression(random_state=rng))

    MultiClassClassifier(lr).fit(X, y).score(X, y)

sklearn/tests/test_multioutput.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
from sklearn.utils import shuffle
3232
from sklearn.model_selection import GridSearchCV
3333
from sklearn.dummy import DummyRegressor, DummyClassifier
34+
from sklearn.pipeline import make_pipeline
35+
from sklearn.impute import SimpleImputer
3436

3537

3638
def test_multi_target_regression():
@@ -302,7 +304,7 @@ def test_multiclass_multioutput_estimator():
302304
multi_class_svc_ = clone(multi_class_svc) # create a clone
303305
multi_class_svc_.fit(X, y[:, i])
304306
assert (list(multi_class_svc_.predict(X)) ==
305-
list(predictions[:, i]))
307+
list(predictions[:, i]))
306308

307309

308310
def test_multiclass_multioutput_estimator_predict_proba():
@@ -463,7 +465,7 @@ def test_classifier_chain_vs_independent_models():
463465
Y_pred_chain = chain.predict(X_test)
464466

465467
assert (jaccard_score(Y_test, Y_pred_chain, average='samples') >
466-
jaccard_score(Y_test, Y_pred_ovr, average='samples'))
468+
jaccard_score(Y_test, Y_pred_ovr, average='samples'))
467469

468470

469471
def test_base_chain_fit_and_predict():
@@ -476,7 +478,7 @@ def test_base_chain_fit_and_predict():
476478
Y_pred = chain.predict(X)
477479
assert Y_pred.shape == Y.shape
478480
assert ([c.coef_.size for c in chain.estimators_] ==
479-
list(range(X.shape[1], X.shape[1] + Y.shape[1])))
481+
list(range(X.shape[1], X.shape[1] + Y.shape[1])))
480482

481483
Y_prob = chains[1].predict_proba(X)
482484
Y_binary = (Y_prob >= .5)
@@ -603,6 +605,26 @@ def fit(self, X, y, **fit_params):
603605
assert est.sample_weight_ is weight
604606

605607

608+
@pytest.mark.parametrize(
    'MultiOutputEstimator, Estimator',
    [(MultiOutputClassifier, LogisticRegression),
     (MultiOutputRegressor, Ridge)]
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiOutputEstimator, Estimator):
    # Smoke test: the multi-output wrappers should leave the validation of
    # missing values to the wrapped pipeline, regressor or classifier.
    # Nothing is asserted; the test passes if `fit` and `score` do not raise.
    random_state = np.random.RandomState(42)

    # Draw order (features, targets, mask) is kept so the generated data is
    # the same as with the tuple-assignment form.
    features = random_state.randn(50, 2)
    targets = random_state.binomial(1, 0.5, (50, 3))
    nan_mask = random_state.choice(
        [1, 0], features.shape, p=[.01, .99]
    ).astype(bool)
    features[nan_mask] = np.nan

    imputing_pipeline = make_pipeline(SimpleImputer(), Estimator())
    fitted = MultiOutputEstimator(imputing_pipeline).fit(features, targets)
    fitted.score(features, targets)
626+
627+
606628
@pytest.mark.parametrize("order_type", [list, np.array, tuple])
607629
def test_classifier_chain_tuple_order(order_type):
608630
X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]

sklearn/tests/test_pipeline.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from sklearn.feature_extraction.text import CountVectorizer
3737
from sklearn.experimental import enable_hist_gradient_boosting # noqa
3838
from sklearn.ensemble import HistGradientBoostingClassifier
39+
from sklearn.impute import SimpleImputer
3940

4041
iris = load_iris()
4142

@@ -1222,6 +1223,16 @@ def transform(self, X, y=None):
12221223
t.fit_transform(X, y, a=0)
12231224

12241225

1226+
def test_pipeline_missing_values_leniency():
    # Check that the pipeline leaves the missing values validation to
    # the underlying transformers and predictors.

    # Copy the features: `iris` is a module-level object, so writing NaNs
    # into `iris.data` directly would leak into every other test using it.
    X, y = np.copy(iris.data), iris.target
    # Seeded generator so that the NaN pattern, and therefore the score
    # compared against the threshold below, is reproducible.
    rng = np.random.RandomState(42)
    mask = rng.choice([1, 0], X.shape, p=[.1, .9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), LogisticRegression())
    assert pipe.fit(X, y).score(X, y) > 0.4
1234+
1235+
12251236
def test_feature_union_warns_unknown_transformer_weight():
12261237
# Warn user when transformer_weights containers a key not present in
12271238
# transformer_list

0 commit comments

Comments
 (0)
0