diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index a2389fdba1fb3..c2b453af42b7a 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -72,6 +72,13 @@ Changelog when the global configuration sets `transform_output="pandas"`. :pr:`25500` by :user:`Guillaume Lemaitre <glemaitre>`. +:mod:`sklearn.preprocessing` +............................ + +- |Fix| :class:`preprocessing.OrdinalEncoder` now correctly supports + `encoded_missing_value` or `unknown_value` set to a category's cardinality + when there are missing values in the training data. :pr:`25704` by `Thomas Fan`_. + :mod:`sklearn.utils` .................... diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b8665f8be7b59..ec1bbeea62448 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1300,15 +1300,7 @@ def fit(self, X, y=None): # `_fit` will only raise an error when `self.handle_unknown="error"` self._fit(X, handle_unknown=self.handle_unknown, force_all_finite="allow-nan") - if self.handle_unknown == "use_encoded_value": - for feature_cats in self.categories_: - if 0 <= self.unknown_value < len(feature_cats): - raise ValueError( - "The used value for unknown_value " - f"{self.unknown_value} is one of the " - "values already used for encoding the " - "seen categories." 
- ) + cardinalities = [len(categories) for categories in self.categories_] # stores the missing indices per category self._missing_indices = {} @@ -1316,8 +1308,22 @@ def fit(self, X, y=None): for i, cat in enumerate(categories_for_idx): if is_scalar_nan(cat): self._missing_indices[cat_idx] = i + + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value + cardinalities[cat_idx] -= 1 continue + if self.handle_unknown == "use_encoded_value": + for cardinality in cardinalities: + if 0 <= self.unknown_value < cardinality: + raise ValueError( + "The used value for unknown_value " + f"{self.unknown_value} is one of the " + "values already used for encoding the " + "seen categories." + ) + if self._missing_indices: if np.dtype(self.dtype).kind != "f" and is_scalar_nan( self.encoded_missing_value @@ -1336,9 +1342,9 @@ def fit(self, X, y=None): # known category invalid_features = [ cat_idx - for cat_idx, categories_for_idx in enumerate(self.categories_) + for cat_idx, cardinality in enumerate(cardinalities) if cat_idx in self._missing_indices - and 0 <= self.encoded_missing_value < len(categories_for_idx) + and 0 <= self.encoded_missing_value < cardinality ] if invalid_features: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 632a486b6e4b5..9927e7e365865 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2003,3 +2003,15 @@ def test_predefined_categories_dtype(): for n, cat in enumerate(enc.categories_): assert cat.dtype == object assert_array_equal(categories[n], cat) + + +def test_ordinal_encoder_missing_unknown_encoding_max(): + """Check missing value or unknown encoding can equal the cardinality.""" + X = np.array([["dog"], ["cat"], [np.nan]], dtype=object) + X_trans = OrdinalEncoder(encoded_missing_value=2).fit_transform(X) + assert_allclose(X_trans, [[1], [0], [2]]) + + enc = 
OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2).fit(X) + X_test = np.array([["snake"]]) + X_trans = enc.transform(X_test) + assert_allclose(X_trans, [[2]])