diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index dd1f798ccb3aa..98c2ec225d8b6 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -540,6 +540,18 @@ columns for this feature will be all zeros array([[1., 0., 0., 0., 0., 0.]]) +Missing categorical features in the training data can be handled by specifying what happens to them using the ``handle_missing`` parameter. Its value must be one of: + +- ``all-missing``: This will replace all missing rows with NaN. +- ``all-zero``: This will replace all missing rows with zeros. +- ``category``: This will represent all missing rows with a separate one-hot column. + +Note that, for scikit-learn to handle your missing values using OneHotEncoder, +you have to pass a placeholder of what should be recorded as a missing value. +This is the ``missing_values`` parameter; possible values are either +``NaN`` or a custom value of your choice. + + See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index bd6e10fb62810..1bbd20532493d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -20,10 +20,8 @@ from .base import _transform_selected from .label import _encode, _encode_check_unknown - range = six.moves.range - __all__ = [ 'OneHotEncoder', 'OrdinalEncoder' @@ -218,6 +216,21 @@ class OneHotEncoder(_BaseEncoder): The ``n_values_`` attribute was deprecated in version 0.20 and will be removed in 0.22. + handle_missing : 'all-missing', 'all-zero' or 'category' + What should be done to missing values. Should be one of: + + 'all-missing': + Replace with a row of NaNs + + 'all-zero': + Replace with a row of zeros + + 'category': + Represent with a separate one-hot column + + missing_values : NaN or None + The value that should be considered a missing value.
+ Examples -------- Given a dataset with two features, we let the encoder find the unique @@ -260,13 +273,15 @@ class OneHotEncoder(_BaseEncoder): def __init__(self, n_values=None, categorical_features=None, categories=None, sparse=True, dtype=np.float64, - handle_unknown='error'): + handle_unknown='error', missing_values="NaN", handle_missing="all-missing"): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown self.n_values = n_values self.categorical_features = categorical_features + self.missing_values = missing_values + self.handle_missing = handle_missing # Deprecated attributes @@ -567,12 +582,30 @@ def transform(self, X): X_out : sparse matrix if sparse=True else a 2-d array Transformed input. """ - if self._legacy_mode: - return _transform_selected(X, self._legacy_transform, self.dtype, - self._categorical_features, - copy=True) + + if not self.handle_missing or self.handle_missing not in ["all-missing", + "all-zero", "category"]: + raise ValueError("Wrong 'handle_missing' value specified. " + "'handle_missing' should be one of either " + "['all-missing', 'all-zero', 'category']. " + "Getting {0}".format(self.handle_missing)) + missing_indices = np.argwhere(np.isnan(X)) if self.missing_values == "NaN" else \ + np.argwhere(X == self.missing_values) + if self.handle_missing == "all-missing": + for i in missing_indices: + X[i] = np.nan + if self.handle_missing == "all-zero": + for i in missing_indices: + X[i] = 0 else: - return self._transform_new(X) + # TODO: replace with a separate one-hot column (not implemented) + pass + + if self._legacy_mode: + return _transform_selected(X, self._legacy_transform, + self.dtype, + self._categorical_features, copy=True) + return self._transform_new(X) def inverse_transform(self, X): """Convert the back data to the original representation.
@@ -659,7 +692,7 @@ def get_feature_names(self, input_features=None): cats = self.categories_ if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] - elif(len(input_features) != len(self.categories_)): + elif (len(input_features) != len(self.categories_)): raise ValueError( "input_features should have length equal to number of " "features ({}), got {}".format(len(self.categories_), diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 9ec16b85df60d..13c2722be53e2 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -576,3 +576,35 @@ def test_one_hot_encoder_warning(): def test_categorical_encoder_stub(): from sklearn.preprocessing import CategoricalEncoder assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal') + + +def test_one_hot_encoder_invalid_handle_missing(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + y = np.array([[4, 1, 1]]) + # Test that one hot encoder raises an error for an invalid + # 'handle_missing' value during transform. + oh = OneHotEncoder(handle_unknown='error', handle_missing='abcde') + oh.fit(X) + assert_raises(ValueError, oh.transform, y) + + +def test_one_hot_encoder_missing_values_none_handle_missing_passed(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + y = np.array([[4, 1, 1]]) + # Test that one hot encoder raises an error for an invalid + # 'handle_missing' value even when missing_values is None. + oh = OneHotEncoder(handle_unknown='error', missing_values=None,handle_missing='abcde') + oh.fit(X) + assert_raises(ValueError, oh.transform, y) + + +def test_one_hot_encoder_handle_missing_all_zeros(): + pass + + +def test_one_hot_encoder_handle_missing_all_missing(): + pass + + +def test_one_hot_encoder_handle_missing_category(): + pass