diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 997bccf66782d..beb91d8780de8 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -537,8 +537,8 @@ scikit-learn estimators, as these expect continuous input, and would interpret the categories as being ordered, which is often not desired (i.e. the set of browsers was ordered arbitrarily). -:class:`OrdinalEncoder` will also passthrough missing values that are -indicated by `np.nan`. +By default, :class:`OrdinalEncoder` will also pass through missing values that +are indicated by `np.nan`. >>> enc = preprocessing.OrdinalEncoder() >>> X = [['male'], ['female'], [np.nan], ['female']] @@ -548,6 +548,32 @@ indicated by `np.nan`. [nan], [ 0.]]) +:class:`OrdinalEncoder` provides a parameter `encoded_missing_value` to encode +the missing values without the need to create a pipeline using +:class:`~sklearn.impute.SimpleImputer`. + + >>> enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1) + >>> X = [['male'], ['female'], [np.nan], ['female']] + >>> enc.fit_transform(X) + array([[ 1.], + [ 0.], + [-1.], + [ 0.]]) + +The above processing is equivalent to the following pipeline:: + + >>> from sklearn.pipeline import Pipeline + >>> from sklearn.impute import SimpleImputer + >>> enc = Pipeline(steps=[ + ... ("encoder", preprocessing.OrdinalEncoder()), + ... ("imputer", SimpleImputer(strategy="constant", fill_value=-1)), + ... ]) + >>> enc.fit_transform(X) + array([[ 1.], + [ 0.], + [-1.], + [ 0.]]) + Another possibility to convert categorical features to features that can be used with scikit-learn estimators is to use a one-of-K, also known as one-hot or dummy encoding. diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index b62ad01cdacc4..b432673704d71 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -803,6 +803,9 @@ Changelog the model. The option is only available when `strategy` is set to `quantile`. 
:pr:`21445` by :user:`Felipe Bidu ` and :user:`Amanda Dsouza `. +- |Enhancement| Adds `encoded_missing_value` to :class:`preprocessing.OrdinalEncoder` + to configure the encoded value for missing data. :pr:`21988` by `Thomas Fan`_. + - |Enhancement| Added the `get_feature_names_out` method and a new parameter `feature_names_out` to :class:`preprocessing.FunctionTransformer`. You can set `feature_names_out` to 'one-to-one' to use the input features names as the diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index ada054811f41c..d4cc642a18562 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1160,6 +1160,12 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder): .. versionadded:: 0.24 + encoded_missing_value : int or np.nan, default=np.nan + Encoded value of missing categories. If set to `np.nan`, then the `dtype` + parameter must be a float dtype. + + .. versionadded:: 1.1 + Attributes ---------- categories_ : list of arrays @@ -1203,6 +1209,23 @@ class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder): >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) + + By default, :class:`OrdinalEncoder` is lenient towards missing values by + propagating them. + + >>> import numpy as np + >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]] + >>> enc.fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., nan]]) + + You can use the parameter `encoded_missing_value` to encode missing values. 
+ + >>> enc.set_params(encoded_missing_value=-1).fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., -1.]]) """ def __init__( @@ -1212,11 +1235,13 @@ def __init__( dtype=np.float64, handle_unknown="error", unknown_value=None, + encoded_missing_value=np.nan, ): self.categories = categories self.dtype = dtype self.handle_unknown = handle_unknown self.unknown_value = unknown_value + self.encoded_missing_value = encoded_missing_value def fit(self, X, y=None): """ @@ -1286,13 +1311,38 @@ def fit(self, X, y=None): self._missing_indices[cat_idx] = i continue - if np.dtype(self.dtype).kind != "f" and self._missing_indices: - raise ValueError( - "There are missing values in features " - f"{list(self._missing_indices)}. For OrdinalEncoder to " - "passthrough missing values, the dtype parameter must be a " - "float" - ) + if self._missing_indices: + if np.dtype(self.dtype).kind != "f" and is_scalar_nan( + self.encoded_missing_value + ): + raise ValueError( + "There are missing values in features " + f"{list(self._missing_indices)}. 
For OrdinalEncoder to " + f"encode missing values with dtype: {self.dtype}, set " + "encoded_missing_value to a non-nan value, or " + "set dtype to a float" + ) + + if not is_scalar_nan(self.encoded_missing_value): + # Features are invalid when they contain a missing category + # and encoded_missing_value was already used to encode a + # known category + invalid_features = [ + cat_idx + for cat_idx, categories_for_idx in enumerate(self.categories_) + if cat_idx in self._missing_indices + and 0 <= self.encoded_missing_value < len(categories_for_idx) + ] + + if invalid_features: + # Use feature names if they are available + if hasattr(self, "feature_names_in_"): + invalid_features = self.feature_names_in_[invalid_features] + raise ValueError( + f"encoded_missing_value ({self.encoded_missing_value}) " + "is already used to encode a known category in features: " + f"{invalid_features}" + ) return self @@ -1317,7 +1367,7 @@ def transform(self, X): for cat_idx, missing_idx in self._missing_indices.items(): X_missing_mask = X_int[:, cat_idx] == missing_idx - X_trans[X_missing_mask, cat_idx] = np.nan + X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value # create separate category for unknown values if self.handle_unknown == "use_encoded_value": @@ -1362,7 +1412,7 @@ def inverse_transform(self, X): # replace values of X[:, i] that were nan with actual indices if i in self._missing_indices: - X_i_mask = _get_mask(X[:, i], np.nan) + X_i_mask = _get_mask(X[:, i], self.encoded_missing_value) labels[X_i_mask] = self._missing_indices[i] if self.handle_unknown == "use_encoded_value": diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index a96786419816d..ea32de22cd2f0 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1664,31 +1664,35 @@ def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): msg = ( r"There are missing values in features 
\[0\]. For OrdinalEncoder " - "to passthrough missing values, the dtype parameter must be a " - "float" + f"to encode missing values with dtype: {np.int32}" ) with pytest.raises(ValueError, match=msg): oe.fit(X) -def test_ordinal_encoder_passthrough_missing_values_float(): +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value): """Test ordinal encoder with nan on float dtypes.""" X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T - oe = OrdinalEncoder().fit(X) + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X) assert len(oe.categories_) == 1 + assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan]) X_trans = oe.transform(X) - assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]]) + assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]]) X_inverse = oe.inverse_transform(X_trans) assert_allclose(X_inverse, X) @pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"]) -def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type): +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_missing_value_support_pandas_categorical( + pd_nan_type, encoded_missing_value +): """Check ordinal encoder is compatible with pandas.""" # checks pandas dataframe with categorical features pd = pytest.importorskip("pandas") @@ -1701,14 +1705,14 @@ def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type): } ) - oe = OrdinalEncoder().fit(df) + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df) assert len(oe.categories_) == 1 assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"]) assert np.isnan(oe.categories_[0][-1]) df_trans = oe.transform(df) - assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]]) + assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]]) X_inverse = oe.inverse_transform(df_trans) assert 
X_inverse.shape == (5, 1) @@ -1902,3 +1906,50 @@ def test_ordinal_encoder_features_names_out_pandas(): feature_names_out = enc.get_feature_names_out() assert_array_equal(names, feature_names_out) + + +def test_ordinal_encoder_unknown_missing_interaction(): + """Check interactions between handle_unknown and missing value encoding.""" + + X = np.array([["a"], ["b"], [np.nan]], dtype=object) + + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=-3, + ).fit(X) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[0], [1], [-3]]) + + # "c" is unknown and is mapped to np.nan + # np.nan is a missing value and is set to -3 + X_test = np.array([["c"], [np.nan]], dtype=object) + X_test_trans = oe.transform(X_test) + assert_allclose(X_test_trans, [[np.nan], [-3]]) + + +@pytest.mark.parametrize("with_pandas", [True, False]) +def test_ordinal_encoder_encoded_missing_value_error(with_pandas): + """Check OrdinalEncoder errors when encoded_missing_value is used by + a known category.""" + X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object) + + # The 0-th feature has no missing values so it is not included in the list of + # features + error_msg = ( + r"encoded_missing_value \(1\) is already used to encode a known category " + r"in features: " + ) + + if with_pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X, columns=["letter", "pet"]) + error_msg = error_msg + r"\['pet'\]" + else: + error_msg = error_msg + r"\[1\]" + + oe = OrdinalEncoder(encoded_missing_value=1) + + with pytest.raises(ValueError, match=error_msg): + oe.fit(X)