From 9b92970586a52414249428fa60ee412c4ef4f9d2 Mon Sep 17 00:00:00 2001 From: olamilekan Date: Thu, 6 Sep 2018 10:59:48 +0100 Subject: [PATCH 1/4] Docstring Update, Modules docs update and initial logic implementation --- doc/modules/preprocessing.rst | 9 +++++ sklearn/preprocessing/_encoders.py | 40 ++++++++++++++++++-- sklearn/preprocessing/tests/test_encoders.py | 32 ++++++++++++++++ 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index dd1f798ccb3aa..c023ff7b0c4ec 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -540,6 +540,15 @@ columns for this feature will be all zeros array([[1., 0., 0., 0., 0., 0.]]) +Missing categorical features in the training data can be handled by specifying what happens to them using the ``handle_missing`` parameter. The values for this can be one of : + +`all-missing`: This will replace all missing rows with NaN. +`all-zero` : This will replace all missing rows with zeros. +`categorical` : This will replace all missing rows as a representation of a separate one hot column. + +Note that, for scikit-learn to handle your missing values using OneHotEncoder, you have to pass a placeholder of what should be recorded as a missing value. This is the `missing_values` parameter and possible values can be either a `NaN` or a custom value of your choice. + + See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index bd6e10fb62810..6ff5ae31c071f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -218,6 +218,18 @@ class OneHotEncoder(_BaseEncoder): The ``n_values_`` attribute was deprecated in version 0.20 and will be removed in 0.22. + handle_missing : all-missing, all-zero or category + What should be done to missing values. 
Should be one of: + + all-missing: Replace with a row of NaNs as above + + all-zero: Replace with a row of zeros + + category: Represent with a separate one-hot column + + missing_values: NaN or None + What should be considered as a missing value? + Examples -------- Given a dataset with two features, we let the encoder find the unique @@ -260,13 +272,15 @@ class OneHotEncoder(_BaseEncoder): def __init__(self, n_values=None, categorical_features=None, categories=None, sparse=True, dtype=np.float64, - handle_unknown='error'): + handle_unknown='error', missing_values=None, handle_missing=None): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown self.n_values = n_values self.categorical_features = categorical_features + self.missing_values = missing_values + self.handle_missing = handle_missing # Deprecated attributes @@ -567,12 +581,30 @@ def transform(self, X): X_out : sparse matrix if sparse=True else a 2-d array Transformed input. """ - if self._legacy_mode: - return _transform_selected(X, self._legacy_transform, self.dtype, + if not self.missing_values: + if self._legacy_mode: + return _transform_selected(X, self._legacy_transform, self.dtype, self._categorical_features, copy=True) - else: return self._transform_new(X) + if self.missing_values and self.missing_values != "NaN": + raise ValueError("Wrong 'missing_missing' value specified. " + "'missing_values' should be one of either 'None' or 'NaN'") + if self.missing_values == "NaN": + if not self.handle_missing: + raise ValueError("'handle_missing' cannot be None when 'missing_values' is passed.") + if self.handle_missing not in ["all-missing", "all-zero", "category"]: + raise ValueError("Wrong 'handle_missing' value specified. 
" + "'handle_missing' should be one of either ['all-missing', 'all-zero', 'category']") + if self.handle_missing == "all-missing": + # Replace entire row with NaN + pass + if self.handle_missing == "all-zero": + # Replace with a row of zeros + pass + else: + # Replace with a seperate one-hot column + pass def inverse_transform(self, X): """Convert the back data to the original representation. diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 9ec16b85df60d..13c2722be53e2 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -576,3 +576,35 @@ def test_one_hot_encoder_warning(): def test_categorical_encoder_stub(): from sklearn.preprocessing import CategoricalEncoder assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal') + + +def test_one_hot_encoder_invalid_handle_missing(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + y = np.array([[4, 1, 1]]) + # Test that one hot encoder raises error for unknown features + # present during transform. + oh = OneHotEncoder(handle_unknown='error', handle_missing='abcde') + oh.fit(X) + assert_raises(ValueError, oh.transform, y) + + +def test_one_hot_encoder_missing_values_none_handle_missing_passed(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + y = np.array([[4, 1, 1]]) + # Test that one hot encoder raises error for unknown features + # present during transform. 
+ oh = OneHotEncoder(handle_unknown='error', missing_values=None,handle_missing='abcde') + oh.fit(X) + assert_raises(ValueError, oh.transform, y) + + +def test_one_hot_encoder_handle_missing_all_zeros(): + pass + + +def test_one_hot_encoder_handle_missing_all_missing(): + pass + + +def test_one_hot_encoder_handle_missing_category(): + pass From 91e0b43d44b2d0e4d995695c0a1b673c1b17ebbc Mon Sep 17 00:00:00 2001 From: olamilekan Date: Sat, 8 Sep 2018 10:45:58 +0100 Subject: [PATCH 2/4] Fixed line length and updated docstring --- doc/modules/preprocessing.rst | 5 ++++- sklearn/preprocessing/_encoders.py | 9 ++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index c023ff7b0c4ec..98c2ec225d8b6 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -546,7 +546,10 @@ Missing categorical features in the training data can be handled by specifying w `all-zero` : This will replace all missing rows with zeros. `categorical` : This will replace all missing rows as a representation of a separate one hot column. -Note that, for scikit-learn to handle your missing values using OneHotEncoder, you have to pass a placeholder of what should be recorded as a missing value. This is the `missing_values` parameter and possible values can be either a `NaN` or a custom value of your choice. +Note that, for scikit-learn to handle your missing values using OneHotEncoder, +you have to pass a placeholder of what should be recorded as a missing value. +This is the `missing_values` parameter and possible values can be either a +`NaN` or a custom value of your choice. 
See :ref:`dict_feature_extraction` for categorical features that are represented diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 6ff5ae31c071f..b3736721fa734 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -221,11 +221,14 @@ class OneHotEncoder(_BaseEncoder): handle_missing : all-missing, all-zero or category What should be done to missing values. Should be one of: - all-missing: Replace with a row of NaNs as above + all-missing: + Replace with a row of NaNs as above - all-zero: Replace with a row of zeros + all-zero: + Replace with a row of zeros - category: Represent with a separate one-hot column + category: + Represent with a separate one-hot column missing_values: NaN or None What should be considered as a missing value? From 3fa635ae86ff0a7e2bf26549ca44ea2c596935d5 Mon Sep 17 00:00:00 2001 From: olamilekan Date: Sat, 8 Sep 2018 10:47:32 +0100 Subject: [PATCH 3/4] Fixed line length and updated docstring --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b3736721fa734..eee1f14382e40 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -222,7 +222,7 @@ class OneHotEncoder(_BaseEncoder): What should be done to missing values. 
Should be one of: all-missing: - Replace with a row of NaNs as above + Replace with a row of NaNs all-zero: Replace with a row of zeros From 9ed9b7d27877669ae15c5b85933f74465f05727b Mon Sep 17 00:00:00 2001 From: olamilekan Date: Wed, 12 Sep 2018 12:51:22 +0100 Subject: [PATCH 4/4] Updated logic --- sklearn/preprocessing/_encoders.py | 60 +++++++++++++++--------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index eee1f14382e40..1bbd20532493d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -20,10 +20,8 @@ from .base import _transform_selected from .label import _encode, _encode_check_unknown - range = six.moves.range - __all__ = [ 'OneHotEncoder', 'OrdinalEncoder' @@ -221,13 +219,13 @@ class OneHotEncoder(_BaseEncoder): handle_missing : all-missing, all-zero or category What should be done to missing values. Should be one of: - all-missing: + 'all-missing': Replace with a row of NaNs - all-zero: + 'all-zero': Replace with a row of zeros - category: + 'category': Represent with a separate one-hot column missing_values: NaN or None What should be considered as a missing value? @@ -275,7 +273,7 @@ class OneHotEncoder(_BaseEncoder): def __init__(self, n_values=None, categorical_features=None, categories=None, sparse=True, dtype=np.float64, - handle_unknown='error', missing_values=None, handle_missing=None): + handle_unknown='error', missing_values="NaN", handle_missing="all-missing"): self.categories = categories self.sparse = sparse self.dtype = dtype @@ -584,30 +582,30 @@ def transform(self, X): X_out : sparse matrix if sparse=True else a 2-d array Transformed input.
""" - if not self.missing_values: - if self._legacy_mode: - return _transform_selected(X, self._legacy_transform, self.dtype, - self._categorical_features, - copy=True) - return self._transform_new(X) - if self.missing_values and self.missing_values != "NaN": - raise ValueError("Wrong 'missing_missing' value specified. " - "'missing_values' should be one of either 'None' or 'NaN'") - if self.missing_values == "NaN": - if not self.handle_missing: - raise ValueError("'handle_missing' cannot be None when 'missing_values' is passed.") - if self.handle_missing not in ["all-missing", "all-zero", "category"]: - raise ValueError("Wrong 'handle_missing' value specified. " - "'handle_missing' should be one of either ['all-missing', 'all-zero', 'category']") - if self.handle_missing == "all-missing": - # Replace entire row with NaN - pass - if self.handle_missing == "all-zero": - # Replace with a row of zeros - pass - else: - # Replace with a seperate one-hot column - pass + + if not self.handle_missing or self.handle_missing not in ["all-missing", + "all-zero", "category"]: + raise ValueError("Wrong 'handle_missing' value specified. " + "'handle_missing' should be one of either " + "['all-missing', 'all-zero', 'category']. " + "Getting {0}".format(self.handle_missing)) + missing_indices = np.argwhere(np.isnan(X)) if self.missing_values == "NaN" else \ + np.argwhere(X == self.missing_values) + if self.handle_missing == "all-missing": + for i in missing_indices: + X[i] = np.nan + if self.handle_missing == "all-zero": + for i in missing_indices: + X[i] = 0 + else: + # Replace with a seperate one-hot column + pass + + if self._legacy_mode: + return _transform_selected(X, self._legacy_transform, + self.dtype, + self._categorical_features, copy=True) + return self._transform_new(X) def inverse_transform(self, X): """Convert the back data to the original representation. 
@@ -694,7 +692,7 @@ def get_feature_names(self, input_features=None): cats = self.categories_ if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] - elif(len(input_features) != len(self.categories_)): + elif (len(input_features) != len(self.categories_)): raise ValueError( "input_features should have length equal to number of " "features ({}), got {}".format(len(self.categories_),