From b823d918c7725f4ef50660ada7c937eca6bd6031 Mon Sep 17 00:00:00 2001 From: J42994 Date: Mon, 1 Oct 2018 22:21:22 +0200 Subject: [PATCH 1/3] add utility function to group low-frequency values provide tests: - tests for different frequency values - otherwise tests similar to those of _encode --- sklearn/preprocessing/_encoders.py | 44 +++++++++++++++++++ sklearn/preprocessing/tests/test_encoders.py | 46 ++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c44607d10e6d0..58a543bd586dc 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -9,6 +9,7 @@ import numpy as np from scipy import sparse +from collections import Counter from .. import get_config as _get_config from ..base import BaseEstimator, TransformerMixin @@ -835,3 +836,46 @@ def inverse_transform(self, X): X_tr[:, i] = self.categories_[i][labels] return X_tr + + +def _group_values_python(values, min_freq=0, group=None): + if min_freq and group: + raise ValueError + if min_freq: + freqs = {key: counts/len(values) + for key, counts in Counter(values).items()} + low_freq_keys = (key for key, freq in freqs.items() if freq < min_freq) + # sorting ensures first element in group is always the same + group = np.array(sorted(set(low_freq_keys)), dtype=values.dtype) + if group is not None: + try: + values[np.isin(values, group)] = group[0] + except IndexError: + pass + return values, group + else: + return values, group + + +def _group_values_numpy(values, min_freq=0, group=None): + if min_freq and group: + raise ValueError + if min_freq: + uniques, counts = np.unique(values, return_counts=True) + mask = (counts/len(values) < min_freq) + group = uniques[mask] + if group is not None: + try: + values[np.isin(values, group)] = group[0] + except IndexError: + pass + return values, group + else: + return values, None + + +def _group_values(values, min_freq=0, group=None): + if values.dtype == 
object: + return _group_values_python(values, min_freq, group) + else: + return _group_values_numpy(values, min_freq, group) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 67169432defdc..26bf6527c314a 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -19,6 +19,7 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder +from sklearn.preprocessing._encoders import _group_values def toarray(a): @@ -613,3 +614,48 @@ def test_one_hot_encoder_warning(): def test_categorical_encoder_stub(): from sklearn.preprocessing import CategoricalEncoder assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal') + + +@pytest.mark.parametrize( + "values, min_freq, exp_values, exp_group", + [(np.array([1, 2, 3, 3, 3], dtype='int64'), + 0.3, + np.array([1, 1, 3, 3, 3], dtype='int64'), + np.array([1, 2])), + (np.array([1, 2, 3, 3, 3], dtype='int64'), + 0.7, + np.array([1, 1, 1, 1, 1], dtype='int64'), + np.array([1, 2, 3])), + (np.array([1, 2, 3, 3, 3], dtype='int64'), + 0.2, + np.array([1, 2, 3, 3, 3], dtype='int64'), + np.array([], dtype='int64')), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), + 0.3, + np.array(['a', 'a', 'c', 'c', 'c'], dtype=object), + np.array(['a', 'b'])), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), + 0.7, + np.array(['a', 'a', 'a', 'a', 'a'], dtype=object), + np.array(['a', 'b', 'c'])), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), + 0.2, + np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), + np.array([], dtype=object)), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str), + 0.3, + np.array(['a', 'a', 'c', 'c', 'c'], dtype=str), + np.array(['a', 'b'])), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str), + 0.7, + np.array(['a', 'a', 'a', 'a', 'a'], dtype=str), + np.array(['a', 'b', 'c'])), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str), + 0.2, + np.array(['a', 'b', 
'c', 'c', 'c'], dtype=str), + np.array([], dtype=str))], + ids=(['int64']*3 + ['object']*3 + ['str']*3)) +def test_group_values_freq(values, min_freq, exp_values, exp_group): + values, group = _group_values(values, min_freq) + assert_array_equal(values, exp_values) + assert_array_equal(group, exp_group) From 512a5671cba6d897dfd63d31606d74eeeb825e1d Mon Sep 17 00:00:00 2001 From: J42994 Date: Wed, 3 Oct 2018 21:00:25 +0200 Subject: [PATCH 2/3] add min_freq to ordinal and onehot encoder - adds min_freq keyword to ordinal and onehot encoder and adds the necessary calls to _BaseEncoder - improves tests on _group_values - adds tests that ensure that fit does not alter the input array. --- sklearn/preprocessing/_encoders.py | 14 +++++++-- sklearn/preprocessing/tests/test_encoders.py | 31 ++++++++++++++++++-- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 58a543bd586dc..14a0b84368e71 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -74,12 +74,13 @@ def _fit(self, X, handle_unknown='error'): if len(self._categories) != n_features: raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") - + self.groups_ = [] self.categories_ = [] for i in range(n_features): Xi = X[:, i] if self._categories == 'auto': + Xi, group = _group_values(Xi.copy(), min_freq=self.min_freq) cats = _encode(Xi) else: cats = np.array(self._categories[i], dtype=X.dtype) @@ -89,6 +90,7 @@ def _fit(self, X, handle_unknown='error'): msg = ("Found unknown categories {0} in column {1}" " during fit".format(diff, i)) raise ValueError(msg) + self.groups_.append(group) self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): @@ -100,6 +102,7 @@ def _transform(self, X, handle_unknown='error'): for i in range(n_features): Xi = X[:, i] + Xi, _ = _group_values(Xi, group=self.groups_[i]) diff, valid_mask = 
_encode_check_unknown(Xi, self.categories_[i], return_mask=True) @@ -199,6 +202,9 @@ class OneHotEncoder(_BaseEncoder): 0.20 and will be removed in 0.22. You can use the ``ColumnTransformer`` instead. + min_freq: float, default=0 + group low frequent categories together + Attributes ---------- categories_ : list of arrays @@ -273,13 +279,14 @@ class OneHotEncoder(_BaseEncoder): def __init__(self, n_values=None, categorical_features=None, categories=None, sparse=True, dtype=np.float64, - handle_unknown='error'): + min_freq=0, handle_unknown='error'): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown self.n_values = n_values self.categorical_features = categorical_features + self.min_freq = min_freq # Deprecated attributes @@ -760,9 +767,10 @@ class OrdinalEncoder(_BaseEncoder): between 0 and n_classes-1. """ - def __init__(self, categories='auto', dtype=np.float64): + def __init__(self, categories='auto', dtype=np.float64, min_freq=0): self.categories = categories self.dtype = dtype + self.min_freq = min_freq def fit(self, X, y=None): """Fit the OrdinalEncoder to X. 
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 26bf6527c314a..92398baef73e5 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -516,6 +516,13 @@ def test_one_hot_encoder_raise_missing(X, handle_unknown): with pytest.raises(ValueError, match="Input contains NaN"): ohe.transform(X) +def test_one_hot_encoder_min_freq_fit_not_inplace(): + arr = np.array([[1, 1], [2, 3], [1, 2], [3, 2], [1, 4]]) + expected = arr.copy() + enc = OneHotEncoder(min_freq=0.4, categories='auto') + enc.fit(arr) + assert_array_equal(arr, expected) + @pytest.mark.parametrize("X", [ [['abc', 2, 55], ['def', 1, 55]], @@ -616,6 +623,14 @@ def test_categorical_encoder_stub(): assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal') +def test_ordinal_encoder_min_freq_fit_not_inplace(): + arr = np.array([[1, 1], [2, 3], [1, 2], [3, 2], [1, 4]]) + expected = arr.copy() + enc = OrdinalEncoder(min_freq=0.4) + enc.fit(arr) + assert_array_equal(arr, expected) + + @pytest.mark.parametrize( "values, min_freq, exp_values, exp_group", [(np.array([1, 2, 3, 3, 3], dtype='int64'), @@ -630,6 +645,10 @@ def test_categorical_encoder_stub(): 0.2, np.array([1, 2, 3, 3, 3], dtype='int64'), np.array([], dtype='int64')), + (np.array([1, 2, 3, 3, 3], dtype='int64'), + 0, + np.array([1, 2, 3, 3, 3], dtype='int64'), + None), (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), 0.3, np.array(['a', 'a', 'c', 'c', 'c'], dtype=object), @@ -642,6 +661,10 @@ def test_categorical_encoder_stub(): 0.2, np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), np.array([], dtype=object)), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), + 0, + np.array(['a', 'b', 'c', 'c', 'c'], dtype=object), + None), (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str), 0.3, np.array(['a', 'a', 'c', 'c', 'c'], dtype=str), @@ -653,8 +676,12 @@ def test_categorical_encoder_stub(): (np.array(['a', 'b', 'c', 'c', 'c'], 
dtype=str), 0.2, np.array(['a', 'b', 'c', 'c', 'c'], dtype=str), - np.array([], dtype=str))], - ids=(['int64']*3 + ['object']*3 + ['str']*3)) + np.array([], dtype=str)), + (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str), + 0, + np.array(['a', 'b', 'c', 'c', 'c'], dtype=str), + None)], + ids=(['int64']*4 + ['object']*4 + ['str']*4)) def test_group_values_freq(values, min_freq, exp_values, exp_group): values, group = _group_values(values, min_freq) assert_array_equal(values, exp_values) From b73942e70d4892d8accc7ea64500ab45f2c2bb86 Mon Sep 17 00:00:00 2001 From: J42994 Date: Thu, 4 Oct 2018 10:53:48 +0200 Subject: [PATCH 3/3] fixes errors when accessing group --- sklearn/preprocessing/_encoders.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 14a0b84368e71..29f2625cbd00b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -82,6 +82,7 @@ def _fit(self, X, handle_unknown='error'): if self._categories == 'auto': Xi, group = _group_values(Xi.copy(), min_freq=self.min_freq) cats = _encode(Xi) + self.groups_.append(group) else: cats = np.array(self._categories[i], dtype=X.dtype) if self.handle_unknown == 'error': @@ -90,7 +91,6 @@ def _fit(self, X, handle_unknown='error'): msg = ("Found unknown categories {0} in column {1}" " during fit".format(diff, i)) raise ValueError(msg) - self.groups_.append(group) self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): @@ -102,7 +102,10 @@ def _transform(self, X, handle_unknown='error'): for i in range(n_features): Xi = X[:, i] - Xi, _ = _group_values(Xi, group=self.groups_[i]) + try: + Xi, _ = _group_values(Xi, group=self.groups_[i]) + except IndexError: + pass diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], return_mask=True)