FIX delete feature_names_in_ when refitting on a ndarray (#21389) · scikit-learn/scikit-learn@cd927c0
Commit cd927c0

jeremiedbb authored and ogrisel committed

FIX delete feature_names_in_ when refitting on a ndarray (#21389)

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>

1 parent ae223ee commit cd927c0

File tree: 11 files changed (+65, -70 lines)

doc/whats_new/v1.0.rst (+8)

@@ -128,6 +128,14 @@ Fixed models
     where the underlying check for an attribute did not work with NumPy arrays.
     :pr:`21145` by :user:`Zahlii <Zahlii>`.
 
+Miscellaneous
+.............
+
+- |Fix| Fitting an estimator on a dataset that has no feature names, that was previously
+  fitted on a dataset with feature names no longer keeps the old feature names stored in
+  the `feature_names_in_` attribute. :pr:`21389` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 .. _changes_1_0:
 
 Version 1.0.0
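
A quick before/after demonstration of the behavior this changelog entry describes (any estimator applies; `StandardScaler` is just an illustrative choice):

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

X = np.arange(6.0).reshape(3, 2)
df = pd.DataFrame(X, columns=["a", "b"])

scaler = StandardScaler().fit(df)
print(scaler.feature_names_in_)              # ['a' 'b'], recorded from the DataFrame

scaler.fit(X)                                # refit on a plain ndarray
print(hasattr(scaler, "feature_names_in_"))  # False with this fix: stale names are dropped
```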

sklearn/base.py (+4)

@@ -421,6 +421,10 @@ def _check_feature_names(self, X, *, reset):
             feature_names_in = _get_feature_names(X)
             if feature_names_in is not None:
                 self.feature_names_in_ = feature_names_in
+            elif hasattr(self, "feature_names_in_"):
+                # Delete the attribute when the estimator is fitted on a new dataset
+                # that has no feature names.
+                delattr(self, "feature_names_in_")
             return
 
         fitted_feature_names = getattr(self, "feature_names_in_", None)
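
The hunk above is the core of the fix: on a refit, `_check_feature_names(reset=True)` now removes a stale `feature_names_in_` instead of silently keeping it. A minimal standalone sketch of the same pattern (illustrative names, not sklearn's actual class):

```python
class NamedInputSketch:
    """Toy mixin illustrating the delete-on-refit pattern from the hunk above."""

    def _check_feature_names(self, X, *, reset):
        if not reset:
            return
        columns = getattr(X, "columns", None)  # present on pandas DataFrames
        if columns is not None:
            self.feature_names_in_ = [str(c) for c in columns]
        elif hasattr(self, "feature_names_in_"):
            # Refit on data without names: drop the attribute entirely so
            # later hasattr() checks see a clean slate.
            delattr(self, "feature_names_in_")
```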

sklearn/cluster/_agglomerative.py (+18, -11)

@@ -915,6 +915,22 @@ def fit(self, X, y=None):
             Returns the fitted instance.
         """
         X = self._validate_data(X, ensure_min_samples=2, estimator=self)
+        return self._fit(X)
+
+    def _fit(self, X):
+        """Fit without validation
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
+            Training instances to cluster, or distances between instances if
+            ``affinity='precomputed'``.
+
+        Returns
+        -------
+        self : object
+            Returns the fitted instance.
+        """
         memory = check_memory(self.memory)
 
         if self.n_clusters is not None and self.n_clusters <= 0:

@@ -1218,17 +1234,8 @@ def fit(self, X, y=None):
         self : object
             Returns the transformer.
         """
-        X = self._validate_data(
-            X,
-            accept_sparse=["csr", "csc", "coo"],
-            ensure_min_features=2,
-            estimator=self,
-        )
-        # save n_features_in_ attribute here to reset it after, because it will
-        # be overridden in AgglomerativeClustering since we passed it X.T.
-        n_features_in_ = self.n_features_in_
-        AgglomerativeClustering.fit(self, X.T)
-        self.n_features_in_ = n_features_in_
+        X = self._validate_data(X, ensure_min_features=2, estimator=self)
+        super()._fit(X.T)
         return self
 
     @property
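
This refactoring retires the save-and-restore workaround for `n_features_in_`: `FeatureAgglomeration` now calls the validation-free `_fit` on `X.T`, so the attributes recorded by its own `_validate_data` call are never clobbered. A rough standalone sketch of the shape of the solution (toy classes, not the actual estimators):

```python
import numpy as np

class ClustererSketch:
    def fit(self, X):
        X = np.asarray(X)
        self.n_features_in_ = X.shape[1]  # input metadata recorded exactly once
        return self._fit(X)

    def _fit(self, X):
        # Actual work, no validation and no attribute resets.
        self.labels_ = np.zeros(X.shape[0], dtype=int)  # placeholder clustering
        return self

class FeatureClustererSketch(ClustererSketch):
    def fit(self, X):
        X = np.asarray(X)
        self.n_features_in_ = X.shape[1]
        super()._fit(X.T)  # cluster the features; n_features_in_ stays intact
        return self
```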

sklearn/decomposition/_lda.py (+4, -20)

@@ -684,20 +684,6 @@ def _unnormalized_transform(self, X):
         doc_topic_distr : ndarray of shape (n_samples, n_components)
             Document topic distribution for X.
         """
-        check_is_fitted(self)
-
-        # make sure feature size is the same in fitted model and in X
-        X = self._check_non_neg_array(
-            X, reset_n_features=True, whom="LatentDirichletAllocation.transform"
-        )
-        n_samples, n_features = X.shape
-        if n_features != self.components_.shape[1]:
-            raise ValueError(
-                "The provided data has %d dimensions while "
-                "the model was trained with feature size %d."
-                % (n_features, self.components_.shape[1])
-            )
-
         doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False)
 
         return doc_topic_distr

@@ -851,12 +837,6 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False)
         score : float
             Perplexity score.
         """
-        check_is_fitted(self)
-
-        X = self._check_non_neg_array(
-            X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity"
-        )
-
         if doc_topic_distr is None:
             doc_topic_distr = self._unnormalized_transform(X)
         else:

@@ -902,4 +882,8 @@ def perplexity(self, X, sub_sampling=False):
         score : float
             Perplexity score.
         """
+        check_is_fitted(self)
+        X = self._check_non_neg_array(
+            X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity"
+        )
         return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)
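
The relocation follows a validate-at-the-public-boundary pattern: `perplexity` now checks the fitted state and validates `X` once, while the private helpers (`_unnormalized_transform`, `_perplexity_precomp_distr`) assume already-validated input and no longer reset `n_features_in_`. A minimal standalone sketch of the idea (names are illustrative, not sklearn's):

```python
import numpy as np

class ModelSketch:
    def fit(self, X):
        self.components_ = np.asarray(X, dtype=float)
        return self

    def perplexity(self, X):
        # Public entry point: check fitted state and validate exactly once.
        if not hasattr(self, "components_"):
            raise RuntimeError("This ModelSketch instance is not fitted yet.")
        X = np.asarray(X, dtype=float)
        return self._perplexity_precomp(X)

    def _perplexity_precomp(self, X):
        # Private helper: trusts that the caller already validated X.
        return float(np.exp(-np.mean(X)))
```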

sklearn/ensemble/_bagging.py (+9, -9)

@@ -257,6 +257,15 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Fitted estimator.
         """
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=["csr", "csc"],
+            dtype=None,
+            force_all_finite=False,
+            multi_output=True,
+        )
         return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
 
     def _parallel_args(self):

@@ -295,15 +304,6 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
         """
         random_state = check_random_state(self.random_state)
 
-        # Convert data (X is required to be 2d and indexable)
-        X, y = self._validate_data(
-            X,
-            y,
-            accept_sparse=["csr", "csc"],
-            dtype=None,
-            force_all_finite=False,
-            multi_output=True,
-        )
        if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
 
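Moving `_validate_data` into the public `fit` means the private `_fit` (which `fit` calls, and which other entry points may reuse with pre-validated input) no longer resets `n_features_in_`/`feature_names_in_` a second time. Note the permissive flags (`dtype=None`, `force_all_finite=False`): bagging largely defers input constraints to the base estimator. A hedged usage demo:

```python
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=3, random_state=0)
clf.fit(X, y)               # X and y are validated here, once, in fit()
print(clf.n_features_in_)   # 1
```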

sklearn/ensemble/_forest.py (+2, -7)

@@ -68,6 +68,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
 from ..utils.fixes import _joblib_parallel_args
 from ..utils.multiclass import check_classification_targets, type_of_target
 from ..utils.validation import check_is_fitted, _check_sample_weight
+from ..utils.validation import _num_samples
 
 
 __all__ = [

@@ -2627,14 +2628,8 @@ def fit_transform(self, X, y=None, sample_weight=None):
         X_transformed : sparse matrix of shape (n_samples, n_out)
             Transformed dataset.
         """
-        X = self._validate_data(X, accept_sparse=["csc"])
-        if issparse(X):
-            # Pre-sort indices to avoid that each individual tree of the
-            # ensemble sorts the indices.
-            X.sort_indices()
-
         rnd = check_random_state(self.random_state)
-        y = rnd.uniform(size=X.shape[0])
+        y = rnd.uniform(size=_num_samples(X))
         super().fit(X, y, sample_weight=sample_weight)
 
         self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)
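
Since validation now happens inside `super().fit`, `fit_transform` can no longer assume `X` is already an ndarray when drawing the random targets, hence `_num_samples`, which counts samples on unvalidated input. A small illustration (note `_num_samples` is a private sklearn utility and may change without deprecation):

```python
import numpy as np
from scipy import sparse
from sklearn.utils.validation import _num_samples

print(_num_samples([[1, 2], [3, 4], [5, 6]]))   # 3: plain Python list
print(_num_samples(np.zeros((4, 2))))           # 4: ndarray
print(_num_samples(sparse.csr_matrix((5, 2))))  # 5: sparse matrix
```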

sklearn/linear_model/_ridge.py (+9, -10)

@@ -700,16 +700,6 @@ def fit(self, X, y, sample_weight=None):
             self.normalize, default=False, estimator_name=self.__class__.__name__
         )
 
-        _dtype = [np.float64, np.float32]
-        _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)
-        X, y = self._validate_data(
-            X,
-            y,
-            accept_sparse=_accept_sparse,
-            dtype=_dtype,
-            multi_output=True,
-            y_numeric=True,
-        )
         if self.solver == "lbfgs" and not self.positive:
             raise ValueError(
                 "'lbfgs' solver can be used only when positive=True. "

@@ -1008,6 +998,15 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Fitted estimator.
         """
+        _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=_accept_sparse,
+            dtype=[np.float64, np.float32],
+            multi_output=True,
+            y_numeric=True,
+        )
         return super().fit(X, y, sample_weight=sample_weight)
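Here validation appears to move out of the shared base-class `fit` and into the public subclass `fit` that calls `super().fit`, so input checking (and the `n_features_in_`/`feature_names_in_` bookkeeping it triggers) runs once at the outermost entry point. For context, the accepted sparse formats depend on the solver, which is what `_get_valid_accept_sparse` resolves; an illustrative usage demo:

```python
import numpy as np
from scipy import sparse
from sklearn.linear_model import Ridge

X = sparse.csr_matrix(np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]))
y = np.array([0.0, 1.0, 2.0])

reg = Ridge(solver="sparse_cg").fit(X, y)  # csr input is acceptable for sparse_cg
print(reg.n_features_in_)                  # 2
```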
sklearn/linear_model/_stochastic_gradient.py (+3, -10)

@@ -648,19 +648,12 @@ def _fit(
     ):
         self._validate_params()
         if hasattr(self, "classes_"):
-            self.classes_ = None
-
-        X, y = self._validate_data(
-            X,
-            y,
-            accept_sparse="csr",
-            dtype=np.float64,
-            order="C",
-            accept_large_sparse=False,
-        )
+            # delete the attribute otherwise _partial_fit thinks it's not the first call
+            delattr(self, "classes_")
 
         # labels can be encoded as float, int, or string literals
         # np.unique sorts in asc order; largest class id is positive class
+        y = self._validate_data(y=y)
         classes = np.unique(y)
 
         if self.warm_start and hasattr(self, "coef_"):
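
The diff replaces `self.classes_ = None` with `delattr` because, per its own comment, downstream code decides whether this is the first call by attribute presence, and an attribute set to `None` still exists. A tiny illustration of the difference (stand-in object, not an sklearn class):

```python
class EstimatorSketch:
    pass

est = EstimatorSketch()
est.classes_ = None
print(hasattr(est, "classes_"))  # True: a None-valued attribute still exists

delattr(est, "classes_")
print(hasattr(est, "classes_"))  # False: the estimator now looks unfitted
```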

sklearn/model_selection/tests/test_successive_halving.py (+1, -1)

@@ -665,7 +665,7 @@ def test_groups_support(Est):
     ]
     error_msg = "The 'groups' parameter should not be None."
     for cv in group_cvs:
-        gs = Est(clf, grid, cv=cv)
+        gs = Est(clf, grid, cv=cv, random_state=0)
         with pytest.raises(ValueError, match=error_msg):
             gs.fit(X, y)
         gs.fit(X, y, groups=groups)

sklearn/naive_bayes.py (+1, -1)

@@ -241,7 +241,7 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Returns the instance itself.
         """
-        X, y = self._validate_data(X, y)
+        y = self._validate_data(y=y)
         return self._partial_fit(
             X, y, np.unique(y), _refit=True, sample_weight=sample_weight
         )
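
Only `y` is validated here because `_partial_fit` performs the full validation of `X` itself; validating `X` twice would redundantly re-run the feature-name bookkeeping. The public behavior is unchanged, e.g.:

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[1.0], [2.0], [3.0], [4.0]])
y = np.array([0, 0, 1, 1])

clf = GaussianNB().fit(X, y)  # y is checked up front; X is checked in _partial_fit
print(clf.classes_)           # [0 1]
```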

sklearn/tests/test_base.py (+6, -1)

@@ -638,6 +638,11 @@ def transform(self, X):
     trans = NoOpTransformer().fit(df)
     assert_array_equal(trans.feature_names_in_, df.columns)
 
+    # fit again but on ndarray does not keep the previous feature names (see #21383)
+    trans.fit(X_np)
+    assert not hasattr(trans, "feature_names_in_")
+
+    trans.fit(df)
     msg = "The feature names should match those that were passed"
     df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
     with pytest.warns(FutureWarning, match=msg):

@@ -665,7 +670,7 @@ def transform(self, X):
     assert not record
 
     # fit on dataframe with no feature names or all integer feature names
-    # -> do not warn on trainsform
+    # -> do not warn on transform
     Xs = [X_np, df_int_names]
     for X in Xs:
         with pytest.warns(None) as record:
