diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 3508d85fdcbff..f5241157a8c21 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -167,6 +167,10 @@ Changelog :mod:`sklearn.preprocessing` ............................ +- |Efficiency| :class:`preprocessing.OrdinalEncoder` avoids calculating + missing indices twice to improve efficiency. + :pr:`27017` by :user:`Xuefeng Xu <xuefeng-xu>`. + - |Fix| :class:`preprocessing.OneHotEncoder` shows a more informative error message when `sparse_output=True` and the output is configured to be pandas. :pr:`26931` by `Thomas Fan`_. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5dc9929ded704..2c4ea4af450f2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1508,15 +1508,11 @@ def fit(self, X, y=None): if infrequent is not None: cardinalities[feature_idx] -= len(infrequent) - # stores the missing indices per category - self._missing_indices = {} + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value for cat_idx, categories_for_idx in enumerate(self.categories_): - for i, cat in enumerate(categories_for_idx): + for cat in categories_for_idx: if is_scalar_nan(cat): - self._missing_indices[cat_idx] = i - - # missing values are not considered part of the cardinality - # when considering unknown categories or encoded_missing_value cardinalities[cat_idx] -= 1 continue