From b823d918c7725f4ef50660ada7c937eca6bd6031 Mon Sep 17 00:00:00 2001
From: J42994 <jan-christopher.koch@eon.com>
Date: Mon, 1 Oct 2018 22:21:22 +0200
Subject: [PATCH 1/3] add utility function to group low-frequency values

Provide tests:
- tests for different frequency thresholds (min_freq)
- otherwise, tests similar to those for _encode
---
 sklearn/preprocessing/_encoders.py           | 44 +++++++++++++++++++
 sklearn/preprocessing/tests/test_encoders.py | 46 ++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index c44607d10e6d0..58a543bd586dc 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 from scipy import sparse
+from collections import Counter
 
 from .. import get_config as _get_config
 from ..base import BaseEstimator, TransformerMixin
@@ -835,3 +836,46 @@ def inverse_transform(self, X):
             X_tr[:, i] = self.categories_[i][labels]
 
         return X_tr
+
+
+def _group_values_python(values, min_freq=0, group=None):
+    if min_freq and group:
+        raise ValueError
+    if min_freq:
+        freqs = {key: counts/len(values)
+                 for key, counts in Counter(values).items()}
+        low_freq_keys = (key for key, freq in freqs.items() if freq < min_freq)
+        # sorting ensures first element in group is always the same
+        group = np.array(sorted(set(low_freq_keys)), dtype=values.dtype)
+    if group is not None:
+        try:
+            values[np.isin(values, group)] = group[0]
+        except IndexError:
+            pass
+        return values, group
+    else:
+        return values, group
+
+
+def _group_values_numpy(values, min_freq=0, group=None):
+    if min_freq and group:
+        raise ValueError
+    if min_freq:
+        uniques, counts = np.unique(values, return_counts=True)
+        mask = (counts/len(values) < min_freq)
+        group = uniques[mask]
+    if group is not None:
+        try:
+            values[np.isin(values, group)] = group[0]
+        except IndexError:
+            pass
+        return values, group
+    else:
+        return values, None
+
+
+def _group_values(values, min_freq=0, group=None):
+    if values.dtype == object:
+        return _group_values_python(values, min_freq, group)
+    else:
+        return _group_values_numpy(values, min_freq, group)
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 67169432defdc..26bf6527c314a 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -19,6 +19,7 @@
 
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing._encoders import _group_values
 
 
 def toarray(a):
@@ -613,3 +614,48 @@ def test_one_hot_encoder_warning():
 def test_categorical_encoder_stub():
     from sklearn.preprocessing import CategoricalEncoder
     assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal')
+
+
+@pytest.mark.parametrize(
+    "values, min_freq, exp_values, exp_group",
+    [(np.array([1, 2, 3, 3, 3], dtype='int64'),
+      0.3,
+      np.array([1, 1, 3, 3, 3], dtype='int64'),
+      np.array([1, 2])),
+     (np.array([1, 2, 3, 3, 3], dtype='int64'),
+      0.7,
+      np.array([1, 1, 1, 1, 1], dtype='int64'),
+      np.array([1, 2, 3])),
+     (np.array([1, 2, 3, 3, 3], dtype='int64'),
+      0.2,
+      np.array([1, 2, 3, 3, 3], dtype='int64'),
+      np.array([], dtype='int64')),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
+      0.3,
+      np.array(['a', 'a', 'c', 'c', 'c'], dtype=object),
+      np.array(['a', 'b'])),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
+      0.7,
+      np.array(['a', 'a', 'a', 'a', 'a'], dtype=object),
+      np.array(['a', 'b', 'c'])),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
+      0.2,
+      np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
+      np.array([], dtype=object)),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
+      0.3,
+      np.array(['a', 'a', 'c', 'c', 'c'], dtype=str),
+      np.array(['a', 'b'])),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
+      0.7,
+      np.array(['a', 'a', 'a', 'a', 'a'], dtype=str),
+      np.array(['a', 'b', 'c'])),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
+      0.2,
+      np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
+      np.array([], dtype=str))],
+    ids=(['int64']*3 + ['object']*3 + ['str']*3))
+def test_group_values_freq(values, min_freq, exp_values, exp_group):
+    values, group = _group_values(values, min_freq)
+    assert_array_equal(values, exp_values)
+    assert_array_equal(group, exp_group)

From 512a5671cba6d897dfd63d31606d74eeeb825e1d Mon Sep 17 00:00:00 2001
From: J42994 <jan-christopher.koch@eon.com>
Date: Wed, 3 Oct 2018 21:00:25 +0200
Subject: [PATCH 2/3] add min_freq to ordinal and onehot encoder

- adds the min_freq keyword to OrdinalEncoder and OneHotEncoder and adds
  the necessary calls in _BaseEncoder
- improves the tests for _group_values
- adds tests that ensure that fit does not alter the input array
---
 sklearn/preprocessing/_encoders.py           | 14 +++++++--
 sklearn/preprocessing/tests/test_encoders.py | 31 ++++++++++++++++++--
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 58a543bd586dc..14a0b84368e71 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -74,12 +74,13 @@ def _fit(self, X, handle_unknown='error'):
             if len(self._categories) != n_features:
                 raise ValueError("Shape mismatch: if n_values is an array,"
                                  " it has to be of shape (n_features,).")
-
+        self.groups_ = []
         self.categories_ = []
 
         for i in range(n_features):
             Xi = X[:, i]
             if self._categories == 'auto':
+                Xi, group = _group_values(Xi.copy(), min_freq=self.min_freq)
                 cats = _encode(Xi)
             else:
                 cats = np.array(self._categories[i], dtype=X.dtype)
@@ -89,6 +90,7 @@ def _fit(self, X, handle_unknown='error'):
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
+            self.groups_.append(group)
             self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error'):
@@ -100,6 +102,7 @@ def _transform(self, X, handle_unknown='error'):
 
         for i in range(n_features):
             Xi = X[:, i]
+            Xi, _ = _group_values(Xi, group=self.groups_[i])
             diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                      return_mask=True)
 
@@ -199,6 +202,9 @@ class OneHotEncoder(_BaseEncoder):
             0.20 and will be removed in 0.22.
             You can use the ``ColumnTransformer`` instead.
 
+    min_freq : float, default=0
+        Group categories whose relative frequency is below this threshold.
+
     Attributes
     ----------
     categories_ : list of arrays
@@ -273,13 +279,14 @@ class OneHotEncoder(_BaseEncoder):
 
     def __init__(self, n_values=None, categorical_features=None,
                  categories=None, sparse=True, dtype=np.float64,
-                 handle_unknown='error'):
+                 min_freq=0, handle_unknown='error'):
         self.categories = categories
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
         self.n_values = n_values
         self.categorical_features = categorical_features
+        self.min_freq = min_freq
 
     # Deprecated attributes
 
@@ -760,9 +767,10 @@ class OrdinalEncoder(_BaseEncoder):
       between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, min_freq=0):
         self.categories = categories
         self.dtype = dtype
+        self.min_freq = min_freq
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 26bf6527c314a..92398baef73e5 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -516,6 +516,13 @@ def test_one_hot_encoder_raise_missing(X, handle_unknown):
     with pytest.raises(ValueError, match="Input contains NaN"):
         ohe.transform(X)
 
+def test_one_hot_encoder_min_freq_fit_not_inplace():
+    arr = np.array([[1, 1], [2, 3], [1, 2], [3, 2], [1, 4]])
+    expected = arr.copy()
+    enc = OneHotEncoder(min_freq=0.4, categories='auto')
+    enc.fit(arr)
+    assert_array_equal(arr, expected)
+
 
 @pytest.mark.parametrize("X", [
     [['abc', 2, 55], ['def', 1, 55]],
@@ -616,6 +623,14 @@ def test_categorical_encoder_stub():
     assert_raises(RuntimeError, CategoricalEncoder, encoding='ordinal')
 
 
+def test_ordinal_encoder_min_freq_fit_not_inplace():
+    arr = np.array([[1, 1], [2, 3], [1, 2], [3, 2], [1, 4]])
+    expected = arr.copy()
+    enc = OrdinalEncoder(min_freq=0.4)
+    enc.fit(arr)
+    assert_array_equal(arr, expected)
+
+
 @pytest.mark.parametrize(
     "values, min_freq, exp_values, exp_group",
     [(np.array([1, 2, 3, 3, 3], dtype='int64'),
@@ -630,6 +645,10 @@ def test_categorical_encoder_stub():
       0.2,
       np.array([1, 2, 3, 3, 3], dtype='int64'),
       np.array([], dtype='int64')),
+     (np.array([1, 2, 3, 3, 3], dtype='int64'),
+      0,
+      np.array([1, 2, 3, 3, 3], dtype='int64'),
+      None),
      (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
       0.3,
       np.array(['a', 'a', 'c', 'c', 'c'], dtype=object),
@@ -642,6 +661,10 @@ def test_categorical_encoder_stub():
       0.2,
       np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
       np.array([], dtype=object)),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
+      0,
+      np.array(['a', 'b', 'c', 'c', 'c'], dtype=object),
+      None),
      (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
       0.3,
       np.array(['a', 'a', 'c', 'c', 'c'], dtype=str),
@@ -653,8 +676,12 @@ def test_categorical_encoder_stub():
      (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
       0.2,
       np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
-      np.array([], dtype=str))],
-    ids=(['int64']*3 + ['object']*3 + ['str']*3))
+      np.array([], dtype=str)),
+     (np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
+      0,
+      np.array(['a', 'b', 'c', 'c', 'c'], dtype=str),
+      None)],
+    ids=(['int64']*4 + ['object']*4 + ['str']*4))
 def test_group_values_freq(values, min_freq, exp_values, exp_group):
     values, group = _group_values(values, min_freq)
     assert_array_equal(values, exp_values)

From b73942e70d4892d8accc7ea64500ab45f2c2bb86 Mon Sep 17 00:00:00 2001
From: J42994 <jan-christopher.koch@eon.com>
Date: Thu, 4 Oct 2018 10:53:48 +0200
Subject: [PATCH 3/3] fix errors when accessing group

---
 sklearn/preprocessing/_encoders.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 14a0b84368e71..29f2625cbd00b 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -82,6 +82,7 @@ def _fit(self, X, handle_unknown='error'):
             if self._categories == 'auto':
                 Xi, group = _group_values(Xi.copy(), min_freq=self.min_freq)
                 cats = _encode(Xi)
+                self.groups_.append(group)
             else:
                 cats = np.array(self._categories[i], dtype=X.dtype)
                 if self.handle_unknown == 'error':
@@ -90,7 +91,6 @@ def _fit(self, X, handle_unknown='error'):
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
-            self.groups_.append(group)
             self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error'):
@@ -102,7 +102,10 @@ def _transform(self, X, handle_unknown='error'):
 
         for i in range(n_features):
             Xi = X[:, i]
-            Xi, _ = _group_values(Xi, group=self.groups_[i])
+            try:
+                Xi, _ = _group_values(Xi, group=self.groups_[i])
+            except IndexError:
+                pass
             diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                      return_mask=True)