ENH Adds nan passthrough in OrdinalEncoder (#19069) · scikit-learn/scikit-learn@638b768 · GitHub
[go: up one dir, main page]

Skip to content

Commit 638b768

Browse files
authored
ENH Adds nan passthrough in OrdinalEncoder (#19069)
1 parent 86445ab commit 638b768

File tree

4 files changed

+167
-22
lines changed

4 files changed

+167
-22
lines changed

doc/modules/preprocessing.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,17 @@ scikit-learn estimators, as these expect continuous input, and would interpret
482482
the categories as being ordered, which is often not desired (i.e. the set of
483483
browsers was ordered arbitrarily).
484484

485+
:class:`OrdinalEncoder` will also passthrough missing values that are
486+
indicated by `np.nan`.
487+
488+
>>> enc = preprocessing.OrdinalEncoder()
489+
>>> X = [['male'], ['female'], [np.nan], ['female']]
490+
>>> enc.fit_transform(X)
491+
array([[ 1.],
492+
[ 0.],
493+
[nan],
494+
[ 0.]])
495+
485496
Another possibility to convert categorical features to features that can be used
486497
with scikit-learn estimators is to use a one-of-K, also known as one-hot or
487498
dummy encoding.

doc/whats_new/v1.0.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ Changelog
123123
not corresponding to their objective. :pr:`19172` by
124124
:user:`Mathurin Massias <mathurinm>`
125125

126+
:mod:`sklearn.preprocessing`
127+
............................
128+
129+
- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through
130+
missing values by default. :pr:`19069` by `Thomas Fan`_.
131+
126132
- |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression`
127133
is deprecated and will be removed in 1.2.
128134
Motivation for this deprecation: ``normalize`` parameter did not take any

sklearn/preprocessing/_encoders.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from ..utils import check_array, is_scalar_nan
1111
from ..utils.validation import check_is_fitted
1212
from ..utils.validation import _deprecate_positional_args
13+
from ..utils._mask import _get_mask
1314

1415
from ..utils._encode import _encode, _check_unknown, _unique
1516

@@ -752,7 +753,7 @@ def fit(self, X, y=None):
752753
if np.dtype(self.dtype).kind != 'f':
753754
raise ValueError(
754755
f"When unknown_value is np.nan, the dtype "
755-
"parameter should be "
756+
f"parameter should be "
756757
f"a float dtype. Got {self.dtype}."
757758
)
758759
elif not isinstance(self.unknown_value, numbers.Integral):
@@ -765,7 +766,7 @@ def fit(self, X, y=None):
765766
f"handle_unknown is 'use_encoded_value', "
766767
f"got {self.unknown_value}.")
767768

768-
self._fit(X)
769+
self._fit(X, force_all_finite='allow-nan')
769770

770771
if self.handle_unknown == 'use_encoded_value':
771772
for feature_cats in self.categories_:
@@ -775,6 +776,21 @@ def fit(self, X, y=None):
775776
f"values already used for encoding the "
776777
f"seen categories.")
777778

779+
# stores the missing indices per category
780+
self._missing_indices = {}
781+
for cat_idx, categories_for_idx in enumerate(self.categories_):
782+
for i, cat in enumerate(categories_for_idx):
783+
if is_scalar_nan(cat):
784+
self._missing_indices[cat_idx] = i
785+
continue
786+
787+
if np.dtype(self.dtype).kind != 'f' and self._missing_indices:
788+
raise ValueError(
789+
"There are missing values in features "
790+
f"{list(self._missing_indices)}. For OrdinalEncoder to "
791+
"passthrough missing values, the dtype parameter must be a "
792+
"float")
793+
778794
return self
779795

780796
def transform(self, X):
@@ -791,9 +807,14 @@ def transform(self, X):
791807
X_out : sparse matrix or a 2-d array
792808
Transformed input.
793809
"""
794-
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
810+
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown,
811+
force_all_finite='allow-nan')
795812
X_trans = X_int.astype(self.dtype, copy=False)
796813

814+
for cat_idx, missing_idx in self._missing_indices.items():
815+
X_missing_mask = X_int[:, cat_idx] == missing_idx
816+
X_trans[X_missing_mask, cat_idx] = np.nan
817+
797818
# create separate category for unknown values
798819
if self.handle_unknown == 'use_encoded_value':
799820
X_trans[~X_mask] = self.unknown_value
@@ -814,7 +835,7 @@ def inverse_transform(self, X):
814835
Inverse transformed array.
815836
"""
816837
check_is_fitted(self)
817-
X = check_array(X, accept_sparse='csr')
838+
X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan')
818839

819840
n_samples, _ = X.shape
820841
n_features = len(self.categories_)
@@ -833,6 +854,12 @@ def inverse_transform(self, X):
833854

834855
for i in range(n_features):
835856
labels = X[:, i].astype('int64', copy=False)
857+
858+
# replace values of X[:, i] that were nan with actual indices
859+
if i in self._missing_indices:
860+
X_i_mask = _get_mask(X[:, i], np.nan)
861+
labels[X_i_mask] = self._missing_indices[i]
862+
836863
if self.handle_unknown == 'use_encoded_value':
837864
unknown_labels = labels == self.unknown_value
838865
X_tr[:, i] = self.categories_[i][np.where(

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 119 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -574,24 +574,6 @@ def test_ordinal_encoder_inverse():
574574
enc.inverse_transform(X_tr)
575575

576576

577-
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
578-
np.array([['a', np.nan]], dtype=object).T],
579-
ids=['numeric', 'object'])
580-
def test_ordinal_encoder_raise_missing(X):
581-
ohe = OrdinalEncoder()
582-
583-
with pytest.raises(ValueError, match="Input contains NaN"):
584-
ohe.fit(X)
585-
586-
with pytest.raises(ValueError, match="Input contains NaN"):
587-
ohe.fit_transform(X)
588-
589-
ohe.fit(X[:1, :])
590-
591-
with pytest.raises(ValueError, match="Input contains NaN"):
592-
ohe.transform(X)
593-
594-
595577
def test_ordinal_encoder_handle_unknowns_string():
596578
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2)
597579
X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object)
@@ -930,3 +912,122 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type):
930912
assert len(ohe.categories_) == 1
931913
assert_array_equal(ohe.categories_[0][:-1], ['a', 'b', 'c'])
932914
assert np.isnan(ohe.categories_[0][-1])
915+
916+
917+
def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
918+
"""Test ordinal encoder with nan passthrough fails when dtype=np.int32."""
919+
920+
X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T
921+
oe = OrdinalEncoder(dtype=np.int32)
922+
923+
msg = (r"There are missing values in features \[0\]. For OrdinalEncoder "
924+
"to passthrough missing values, the dtype parameter must be a "
925+
"float")
926+
with pytest.raises(ValueError, match=msg):
927+
oe.fit(X)
928+
929+
930+
def test_ordinal_encoder_passthrough_missing_values_float():
931+
"""Test ordinal encoder with nan on float dtypes."""
932+
933+
X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
934+
oe = OrdinalEncoder().fit(X)
935+
936+
assert len(oe.categories_) == 1
937+
assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])
938+
939+
X_trans = oe.transform(X)
940+
assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]])
941+
942+
X_inverse = oe.inverse_transform(X_trans)
943+
assert_allclose(X_inverse, X)
944+
945+
946+
@pytest.mark.parametrize('pd_nan_type', ['pd.NA', 'np.nan'])
947+
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
948+
"""Check ordinal encoder is compatible with pandas."""
949+
# checks pandas dataframe with categorical features
950+
if pd_nan_type == 'pd.NA':
951+
# pd.NA is in pandas 1.0
952+
pd = pytest.importorskip('pandas', minversion="1.0")
953+
pd_missing_value = pd.NA
954+
else: # np.nan
955+
pd = pytest.importorskip('pandas')
956+
pd_missing_value = np.nan
957+
958+
df = pd.DataFrame({
959+
'col1': pd.Series(['c', 'a', pd_missing_value, 'b', 'a'],
960+
dtype='category'),
961+
})
962+
963+
oe = OrdinalEncoder().fit(df)
964+
assert len(oe.categories_) == 1
965+
assert_array_equal(oe.categories_[0][:3], ['a', 'b', 'c'])
966+
assert np.isnan(oe.categories_[0][-1])
967+
968+
df_trans = oe.transform(df)
969+
970+
assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])
971+
972+
X_inverse = oe.inverse_transform(df_trans)
973+
assert X_inverse.shape == (5, 1)
974+
assert_array_equal(X_inverse[:2, 0], ['c', 'a'])
975+
assert_array_equal(X_inverse[3:, 0], ['b', 'a'])
976+
assert np.isnan(X_inverse[2, 0])
977+
978+
979+
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
980+
((np.array([['a', np.nan]], dtype=object).T,
981+
np.array([['a', 'b']], dtype=object).T,
982+
[np.array(['a', np.nan, 'd'], dtype=object)], np.object_)),
983+
((np.array([['a', np.nan]], dtype=object).T,
984+
np.array([['a', 'b']], dtype=object).T,
985+
[np.array(['a', np.nan, 'd'], dtype=object)], np.object_)),
986+
((np.array([[2.0, np.nan]], dtype=np.float64).T,
987+
np.array([[3.0]], dtype=np.float64).T,
988+
[np.array([2.0, 4.0, np.nan])], np.float64)),
989+
], ids=['object-None-missing-value', 'object-nan-missing_value',
990+
'numeric-missing-value'])
991+
def test_ordinal_encoder_specified_categories_missing_passthrough(
992+
X, X2, cats, cat_dtype):
993+
"""Test ordinal encoder for specified categories."""
994+
oe = OrdinalEncoder(categories=cats)
995+
exp = np.array([[0.], [np.nan]])
996+
assert_array_equal(oe.fit_transform(X), exp)
997+
# manually specified categories should have same dtype as
998+
# the data when coerced from lists
999+
assert oe.categories_[0].dtype == cat_dtype
1000+
1001+
# when specifying categories manually, unknown categories should already
1002+
# raise when fitting
1003+
oe = OrdinalEncoder(categories=cats)
1004+
with pytest.raises(ValueError, match="Found unknown categories"):
1005+
oe.fit(X2)
1006+
1007+
1008+
@pytest.mark.parametrize("X, expected_X_trans, X_test", [
1009+
(np.array([[1.0, np.nan, 3.0]]).T,
1010+
np.array([[0.0, np.nan, 1.0]]).T,
1011+
np.array([[4.0]])),
1012+
(np.array([[1.0, 4.0, 3.0]]).T,
1013+
np.array([[0.0, 2.0, 1.0]]).T,
1014+
np.array([[np.nan]])),
1015+
(np.array([['c', np.nan, 'b']], dtype=object).T,
1016+
np.array([[1.0, np.nan, 0.0]]).T,
1017+
np.array([['d']], dtype=object)),
1018+
(np.array([['c', 'a', 'b']], dtype=object).T,
1019+
np.array([[2.0, 0.0, 1.0]]).T,
1020+
np.array([[np.nan]], dtype=object)),
1021+
])
1022+
def test_ordinal_encoder_handle_missing_and_unknown(
1023+
X, expected_X_trans, X_test
1024+
):
1025+
"""Test the interaction between missing values and handle_unknown"""
1026+
1027+
oe = OrdinalEncoder(handle_unknown="use_encoded_value",
1028+
unknown_value=-1)
1029+
1030+
X_trans = oe.fit_transform(X)
1031+
assert_allclose(X_trans, expected_X_trans)
1032+
1033+
assert_allclose(oe.transform(X_test), [[-1.0]])

0 commit comments

Comments
 (0)
0