From 9c5dec4ca68f5731423f43e716838717bc869c11 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 16 Dec 2019 15:27:40 -0500 Subject: [PATCH 01/92] ENH Completely adds infrequent categories --- doc/modules/preprocessing.rst | 37 ++ sklearn/metrics/_ranking.py | 7 +- sklearn/preprocessing/_encoders.py | 385 +++++++++++++++++-- sklearn/preprocessing/_label.py | 91 +++-- sklearn/preprocessing/tests/test_encoders.py | 347 ++++++++++++++++- sklearn/preprocessing/tests/test_label.py | 38 +- 6 files changed, 822 insertions(+), 83 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 3e41c592fbbdc..a97f8182bf3f9 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -574,6 +574,43 @@ When this paramenter is not None, ``handle_unknown`` must be set to See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. +.. _one_hot_encoder_infrequent_categories: + +Infrequent categories +--------------------- + +:class:`OneHotEncoder` supports creating a category for infrequent categories +in the training data. The parameters to enable the gathering of infrequent +categories are `min_frequency` and `max_levels`. + +1. `min_frequency` can be a integer greater or equal to one or a float in +`(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality +smaller than this value will be considered infrequent. If `min_frequency` is an +float, categories with a cardinality smaller than this fraction of the +total number of samples will be considered infrequent. + +2. `max_levels` can be `None` or any integer greater than one. This parameter +sets an upper limit of the number of categories including the infrequent +category. + +These parameters can be used together to filter out infrequent categories. In +the following example, the categories, `'dog', 'cat'`, are considered infrequent:: + + >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + + ... ['snake'] * 3]).T + >>> enc = preprocessing.OneHotEncoder(min_frequency=6, + ... handle_unknown='auto').fit(X) + >>> enc.transform([['dog']]).toarray() + array([[0., 0., 1.]]) + >>> enc.transform([['rabbit']]).toarray() + array([[0., 1., 0.]]) + +By setting handle_unknown to `'auto'`, unknown categories will be considered +infrequent:: + + >>> enc.transform([['dragon']]).toarray() + array([[0., 0., 1.]]) + .. 
_preprocessing_discretization: Discretization diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index e525539c0d706..3fcca5f119b12 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -457,7 +457,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, if labels is not None: labels = column_or_1d(labels) - classes = _encode(labels) + classes = _encode(labels)["uniques"] if len(classes) != len(labels): raise ValueError("Parameter 'labels' must be unique") if not np.array_equal(classes, labels): @@ -471,7 +471,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, raise ValueError( "'y_true' contains labels not in parameter 'labels'") else: - classes = _encode(y_true) + classes = _encode(y_true)["uniques"] if len(classes) != y_score.shape[1]: raise ValueError( "Number of classes in y_true not equal to the number of " @@ -482,7 +482,8 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, raise ValueError("sample_weight is not supported " "for multiclass one-vs-one ROC AUC, " "'sample_weight' must be None in this case.") - _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) + y_true_encoded = (_encode(y_true, uniques=classes, encode=True) + ["encoded"]) # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score(_binary_roc_auc_score, y_true_encoded, diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 36512e359c7ed..68c79087b10c7 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,6 +2,9 @@ # Joris Van den Bossche # License: BSD 3 clause +import numbers +import warnings + import numpy as np from scipy import sparse @@ -70,7 +73,7 @@ def _get_feature(self, X, feature_idx): # numpy arrays, sparse arrays return X[:, feature_idx] - def _fit(self, X, handle_unknown='error'): + def _fit(self, X, handle_unknown='error', process_counts=None): X_list, n_samples, n_features = self._check_X(X) if self.categories != 'auto': @@ -80,10 +83,16 @@ def _fit(self, X, handle_unknown='error'): self.categories_ = [] + return_counts = process_counts is not None + category_counts = [] if return_counts else None + for i in range(n_features): Xi = X_list[i] + + result = None if self.categories == 'auto': - cats = _encode(Xi) + result = _encode(Xi, return_counts=return_counts) + cats = result["uniques"] else: cats = np.array(self.categories[i], dtype=Xi.dtype) if Xi.dtype != object: @@ -98,7 +107,17 @@ def _fit(self, X, handle_unknown='error'): raise ValueError(msg) self.categories_.append(cats) - def _transform(self, X, handle_unknown='error'): + if return_counts: + if result is None: + result = _encode(Xi, cats, return_counts=True) + category_counts.append(result["counts"]) + + if return_counts: + process_counts(category_counts, n_samples) + + def _transform(self, X, handle_unknown='error', + process_valid_mask=None, + get_default_invalid_category=None): X_list, n_samples, n_features = self._check_X(X) X_int = np.zeros((n_samples, n_features), dtype=np.int) @@ -116,7 +135,6 @@ def _transform(self, X, handle_unknown='error'): Xi = X_list[i] diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], return_mask=True) - if not np.all(valid_mask): if handle_unknown == 'error': msg = ("Found unknown categories {0} in column {1}" @@ -126,7 +144,6 @@ def _transform(self, X, handle_unknown='error'): # Set the problematic rows to an acceptable value and # continue `The rows are marked `X_mask` and will be # removed later. 
- X_mask[:, i] = valid_mask # cast Xi into the largest string type necessary # to handle different lengths of numpy strings if (self.categories_[i].dtype.kind in ('U', 'S') @@ -135,11 +152,21 @@ def _transform(self, X, handle_unknown='error'): else: Xi = Xi.copy() - Xi[~valid_mask] = self.categories_[i][0] + if get_default_invalid_category is not None: + invalid_index = get_default_invalid_category(i) + else: + invalid_index = 0 + + Xi[~valid_mask] = self.categories_[i][invalid_index] + + if process_valid_mask is not None: + valid_mask = process_valid_mask(valid_mask, i) + X_mask[:, i] = valid_mask + # We use check_unknown=False, since _encode_check_unknown was # already called above. - _, encoded = _encode(Xi, self.categories_[i], encode=True, - check_unknown=False) + encoded = _encode(Xi, self.categories_[i], encode=True, + check_unknown=False)["encoded"] X_int[:, i] = encoded return X_int, X_mask @@ -204,7 +231,7 @@ class OneHotEncoder(_BaseEncoder): dtype : number type, default=np.float Desired dtype of output. - handle_unknown : {'error', 'ignore'}, default='error' + handle_unknown : {'error', 'ignore', 'auto'}, default='error' Whether to raise an error or ignore if an unknown categorical feature is present during transform (default is to raise). When this parameter is set to 'ignore' and an unknown category is encountered during @@ -212,6 +239,40 @@ class OneHotEncoder(_BaseEncoder): will be all zeros. In the inverse transform, an unknown category will be denoted as None. + When this parameter is set to 'auto' and an unknown category is + encountered during transform + + 1. If there was no infrequent category during training, the resulting + one-hot encoded columns for this feature will be be all zeros. In + the inverse transform, an unknown category will be denoted as None. + + 2. If there is an infrequent category during training, the unknown + category will be considered infrequent. In the inverse transform, + an unknown category will be the most frequent infrequent category. + + .. versionadded:: 0.23 + 'auto' was added + + .. deprecated:: 0.23 + 'ignore' is deprecated in favor of 'auto' + + min_frequency : int or float, default=1 + Specifics the categories to be considered infrequent. + + - If int, categories with a cardinality smaller will be considered + infrequent. + - If float, categories with a cardinality smaller than this fraction + of the total number of samples will be considered infrequent. + + .. versionadded:: 0.23 + + max_levels : int, default=None + Specifies the categories to be considered infrequent. Sets an upper + limit to the number of categories including the infrequent category. + If `None` there is no limit to the number of categories. + + .. 
versionadded:: 0.23 + Attributes ---------- categories_ : list of arrays @@ -269,16 +330,21 @@ class OneHotEncoder(_BaseEncoder): """ def __init__(self, categories='auto', drop=None, sparse=True, - dtype=np.float64, handle_unknown='error'): + dtype=np.float64, handle_unknown='error', + min_frequency=1, max_levels=None): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown self.drop = drop + self.min_frequency = min_frequency + self.max_levels = max_levels def _validate_keywords(self): - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " + + + if self.handle_unknown not in ('error', 'ignore', 'auto'): + msg = ("handle_unknown should be either 'error', 'ignore', 'auto'" "got {0}.".format(self.handle_unknown)) raise ValueError(msg) # If we have both dropped columns and ignored unknown @@ -290,6 +356,34 @@ def _validate_keywords(self): "specified, as both would create categories that are all " "zero.") + # validates infrequent category features + if self.drop is not None and self._infrequent_enabled: + raise ValueError("infrequent categories are not supported when " + "drop is specified") + + # TODO: Remove when handle_unknown='ignore' is deprecated + if self.handle_unknown == 'ignore': + warnings.warn("handle_unknown='ignore' is deprecated in favor " + "of 'auto' in version 0.23 and will be removed in " + "version 0.25", FutureWarning) + if self._infrequent_enabled: + raise ValueError("infrequent categories are only supported " + "when handle_unknown is 'error' or 'auto'") + + if self.max_levels is not None and self.max_levels <= 1: + raise ValueError("max_levels must be greater than 1") + + if isinstance(self.min_frequency, numbers.Integral): + if not self.min_frequency >= 1: + raise ValueError("min_frequency must be an integer at least " + "1 or a float in (0.0, 1.0); got the " + "integer {}".format(self.min_frequency)) + else: # float + if not 0.0 < self.min_frequency < 1.0: + raise ValueError("min_frequency must be an integer at least " + "1 or a float in (0.0, 1.0); got the " + "float {}".format(self.min_frequency)) + def _compute_drop_idx(self): if self.drop is None: return None @@ -326,6 +420,215 @@ def _compute_drop_idx(self): "'first', None or array of objects, got {}") raise ValueError(msg.format(type(self.drop))) + @property + def _infrequent_enabled(self): + """Infrequent category is enabled.""" + return (self.max_levels is not None and self.max_levels > 1 or + (isinstance(self.min_frequency, numbers.Integral) + and self.min_frequency > 1) or + (isinstance(self.min_frequency, numbers.Real) + and 0.0 < self.min_frequency < 1.0)) + + def _compute_infrequent_indicies(self, category_count, n_samples, col_idx): + """Compute the infrequent indicies based on max_levels and + min_frequency. + + Parameters + ---------- + category_count : ndarray of shape (n_cardinality,) + category counts + + n_samples : int + number of samples + + col_idx : int + index of current category only used for the error message + + Returns + ------- + output : ndarray of shape (n_infrequent_categories,) or None + If there are infrequent categories, indicies of infrequent + categories. Otherwise None. 
+ """ + infrequent_mask = np.zeros_like(category_count, dtype=bool) + + if isinstance(self.min_frequency, numbers.Integral): + if self.min_frequency > 1: + category_mask = category_count < self.min_frequency + infrequent_mask |= category_mask + else: # float + if 0.0 < self.min_frequency < 1.0: + min_frequency_abs = n_samples * self.min_frequency + category_mask = category_count < min_frequency_abs + infrequent_mask |= category_mask + + if (self.max_levels is not None and self.max_levels > 1 + and self.max_levels < category_count.size): + + # stable sort to preserve original count order + smallest_levels = np.argsort(category_count, kind='mergesort' + )[:-self.max_levels + 1] + infrequent_mask[smallest_levels] = True + + output = np.flatnonzero(infrequent_mask) + + if output.size == category_count.size: + raise ValueError("All categories in column {} are infrequent" + .format(col_idx)) + return output if output.size > 0 else None + + def _compute_infrequent_categories(self, category_counts, n_samples): + """Compute infrequent categories. + + Parameters + ---------- + category_counts : list of ndarrays + list of category counts + + n_samples : int + number of samples + """ + self.infrequent_indices_ = [ + self._compute_infrequent_indicies(category_count, n_samples, + col_idx) + for col_idx, category_count in enumerate(category_counts)] + + # compute mapping from default mapping to infrequent mapping + default_to_infrequent_mappings = [] + largest_infreq_idxs = [] + + for category_count, infreq_idx in zip(category_counts, + self.infrequent_indices_): + # no infrequent categories + if infreq_idx is None: + default_to_infrequent_mappings.append(None) + largest_infreq_idxs.append(None) + continue + + # infrequent indicies exist + mapping = np.empty_like(category_count, dtype=np.int) + n_cats = mapping.size + n_infrequent_cats = infreq_idx.size + + n_frequent_cats = n_cats - n_infrequent_cats + mapping[infreq_idx] = n_frequent_cats + + frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx) + mapping[frequent_indices] = np.arange(n_frequent_cats) + + default_to_infrequent_mappings.append(mapping) + + # compute infrequent category with the largest cardinality + largest_infreq_idx = np.argmax(category_count[infreq_idx]) + largest_infreq_idxs.append(infreq_idx[largest_infreq_idx]) + + self._default_to_infrequent_mappings = default_to_infrequent_mappings + self._largest_infreq_indices = largest_infreq_idxs + + def _map_to_infrequent_categories(self, X_int): + """Map categories to infrequent categories. + + Note this will replace the encoding in X_int + + Parameters + ---------- + X_int: ndarray of shape (n_samples, n_features) + integer encoded categories + """ + if not self._infrequent_enabled: + return + + for col_idx, mapping in enumerate( + self._default_to_infrequent_mappings): + + if mapping is None: + continue + X_int[:, col_idx] = np.take(mapping, X_int[:, col_idx]) + + def _get_default_invalid_category(self, col_idx): + """Get default invalid category for column index during `_transform`. + + This function is pasesd to `_transform` to set the invalid categories. + """ + infrequent_idx = self.infrequent_indices_[col_idx] + return 0 if infrequent_idx is None else infrequent_idx[0] + + def _process_valid_mask(self, valid_mask, col_idx): + """Process the valid mask during `_transform` + + This function is passed to `_transform` to adjust the mask depending + on if the infrequent column exist or not. 
+ """ + if self.handle_unknown != 'auto': + return valid_mask + + # handle_unknown == 'auto' + infrequent_idx = self.infrequent_indices_[col_idx] + + # infrequent column does not exist + # returning the original mask to allow the column to be ignored + if infrequent_idx is None: + return valid_mask + + # infrequent column exist + # the unknown categories will be mapped to the infrequent category + return np.ones_like(valid_mask, dtype=bool) + + def _compute_transformed_category(self, i): + """Compute the transformed category used for column `i`. + + 1. Dropped columns are removed. + 2. If there are infrequent categories, the infrequent category with + the largest cardinality is placed at the end. + """ + cats = self.categories_[i] + + if self.drop is not None: + # early exit because infrequent categories and drop is forbidden + return np.delete(cats, self.drop_idx_[i]) + + # drop is None + if not self._infrequent_enabled: + return cats + + # infrequent is enabled + infreq_idx = self.infrequent_indices_[i] + if infreq_idx is None: + return cats + + largest_infreq_idx = self._largest_infreq_indices[i] + largest_infreq_cat = cats[largest_infreq_idx] + frequent_indices = np.setdiff1d(np.arange(len(cats)), infreq_idx) + + return np.r_[cats[frequent_indices], [largest_infreq_cat]] + + @property + def _n_transformed_features(self): + """Number of transformed features.""" + if self.drop is not None: + # early exit because drop and infreqeunt are forbidden + return [len(cats) - 1 for cats in self.categories_] + + # drop is None + output = [len(cats) for cats in self.categories_] + + if not self._infrequent_enabled: + return output + + # infrequent is enabled + for col_idx, infreq_idx in enumerate(self.infrequent_indices_): + if infreq_idx is None: + continue + output[col_idx] = output[col_idx] - infreq_idx.size + 1 + + return output + + @property + def _transformed_categories(self): + """Transformed categories.""" + return [self._compute_transformed_category(i) + for i in range(len(self.categories_))] + def fit(self, X, y=None): """ Fit OneHotEncoder to X. 
@@ -344,7 +647,11 @@ def fit(self, X, y=None): self """ self._validate_keywords() - self._fit(X, handle_unknown=self.handle_unknown) + + process_counts = (self._compute_infrequent_categories + if self._infrequent_enabled else None) + self._fit(X, handle_unknown=self.handle_unknown, + process_counts=process_counts) self.drop_idx_ = self._compute_drop_idx() return self @@ -387,7 +694,16 @@ def transform(self, X): """ check_is_fitted(self) # validation of X happens in _check_X called by _transform - X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + transform_kws = {"handle_unknown": self.handle_unknown} + if self._infrequent_enabled: + transform_kws.update({ + "process_valid_mask": self._process_valid_mask, + "get_default_invalid_category": + self._get_default_invalid_category + }) + + X_int, X_mask = self._transform(X, **transform_kws) + self._map_to_infrequent_categories(X_int) n_samples, n_features = X_int.shape @@ -400,9 +716,8 @@ def transform(self, X): keep_cells = X_int != to_drop X_mask &= keep_cells X_int[X_int > to_drop] -= 1 - n_values = [len(cats) - 1 for cats in self.categories_] - else: - n_values = [len(cats) for cats in self.categories_] + + n_values = self._n_transformed_features mask = X_mask.ravel() feature_indices = np.cumsum([0] + n_values) @@ -444,12 +759,7 @@ def inverse_transform(self, X): n_samples, _ = X.shape n_features = len(self.categories_) - if self.drop is None: - n_transformed_features = sum(len(cats) - for cats in self.categories_) - else: - n_transformed_features = sum(len(cats) - 1 - for cats in self.categories_) + n_transformed_features = sum(self._n_transformed_features) # validate shape of passed X msg = ("Shape of the passed X data is not correct. Expected {0} " @@ -464,12 +774,14 @@ def inverse_transform(self, X): j = 0 found_unknown = {} + if self._infrequent_enabled: + infrequent_indices = self.infrequent_indices_ + else: + infrequent_indices = [None] * n_features + for i in range(n_features): - if self.drop is None: - cats = self.categories_[i] - else: - cats = np.delete(self.categories_[i], self.drop_idx_[i]) - n_categories = len(cats) + n_categories = self._n_transformed_features[i] + cats = self._transformed_categories[i] # Only happens if there was a column with a unique # category. In this case we just fill the column with this @@ -482,7 +794,10 @@ def inverse_transform(self, X): # for sparse X argmax returns 2D matrix, ensure 1D array labels = np.asarray(_argmax(sub, axis=1)).flatten() X_tr[:, i] = cats[labels] - if self.handle_unknown == 'ignore': + + if (self.handle_unknown == 'ignore' or + (self.handle_unknown == 'auto' and + infrequent_indices[i] is None)): unknown = np.asarray(sub.sum(axis=1) == 0).flatten() # ignored unknown categories: we have a row of all zero if unknown.any(): @@ -524,21 +839,17 @@ def get_feature_names(self, input_features=None): Array of feature names. 
""" check_is_fitted(self) - cats = self.categories_ + cats = self._transformed_categories if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] - elif len(input_features) != len(self.categories_): + elif len(input_features) != len(cats): raise ValueError( "input_features should have length equal to number of " - "features ({}), got {}".format(len(self.categories_), - len(input_features))) + "features ({}), got {}".format(len(cats), len(input_features))) feature_names = [] for i in range(len(cats)): - names = [ - input_features[i] + '_' + str(t) for t in cats[i]] - if self.drop is not None: - names.pop(self.drop_idx_[i]) + names = [input_features[i] + '_' + str(t) for t in cats[i]] feature_names.extend(names) return np.array(feature_names, dtype=object) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index c644aa919f5cf..d7789a00ca741 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -7,6 +7,7 @@ # License: BSD 3 clause from collections import defaultdict +from collections import Counter import itertools import array import warnings @@ -33,32 +34,55 @@ ] -def _encode_numpy(values, uniques=None, encode=False, check_unknown=True): +def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, + return_counts=False): # only used in _encode below, see docstring there for details if uniques is None: - if encode: - uniques, encoded = np.unique(values, return_inverse=True) - return uniques, encoded + unique_result = np.unique(values, return_inverse=encode, + return_counts=return_counts) + if encode and return_counts: + return {'uniques': unique_result[0], + 'encoded': unique_result[1], + 'counts': unique_result[2]} + elif encode: + return {'uniques': unique_result[0], + 'encoded': unique_result[1]} + elif return_counts: + return {'uniques': unique_result[0], + 'counts': unique_result[1]} else: - # unique sorts - return np.unique(values) + return {'uniques': unique_result} + + output = {'uniques': uniques} if encode: if check_unknown: diff = _encode_check_unknown(values, uniques) if diff: raise ValueError("y contains previously unseen labels: %s" % str(diff)) - encoded = np.searchsorted(uniques, values) - return uniques, encoded - else: - return uniques + output['encoded'] = np.searchsorted(uniques, values) + + if return_counts: + _, counts = np.unique(values, return_counts=True) + output['counts'] = counts + return output -def _encode_python(values, uniques=None, encode=False): + +def _encode_python(values, uniques=None, encode=False, return_counts=False): # only used in _encode below, see docstring there for details + output = {} if uniques is None: uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) + + if return_counts: + uniques_dict = Counter(values) + counts = np.array([uniques_dict[item] for item in uniques], + dtype=np.int) + output['counts'] = counts + + output['uniques'] = uniques if encode: table = {val: i for i, val in enumerate(uniques)} try: @@ -66,12 +90,12 @@ def _encode_python(values, uniques=None, encode=False): except KeyError as e: raise ValueError("y contains previously unseen labels: %s" % str(e)) - return uniques, encoded - else: - return uniques + output['encoded'] = encoded + return output -def _encode(values, uniques=None, encode=False, check_unknown=True): +def _encode(values, uniques=None, encode=False, check_unknown=True, + return_counts=False): """Helper function to factorize (find uniques) and encode values. 
Uses pure python method for object dtype, and numpy method for @@ -97,25 +121,38 @@ def _encode(values, uniques=None, encode=False, check_unknown=True): True in this case. This parameter is useful for _BaseEncoder._transform() to avoid calling _encode_check_unknown() twice. + return_counts: bool, default=False + Returns the counts of the unique items in values. If uniques of object + dtype is passed in, the order of the counts will match the + order of the uniques. All other dtypes will return counts that assume + that uniques is ordered. Returns ------- - uniques - If ``encode=False``. The unique values are sorted if the `uniques` - parameter was None (and thus inferred from the data). - (uniques, encoded) - If ``encode=True``. + output : + Dictionary with attributes: + + uniques : + If ``encode=False``. The unique values are sorted if the `uniques` + parameter was None (and thus inferred from the data). + + encoded : + If ``encode=True``. + counts : + If ``return_counts``. """ if values.dtype == object: try: - res = _encode_python(values, uniques, encode) + res = _encode_python(values, uniques, encode, + return_counts=return_counts) except TypeError: raise TypeError("argument must be a string or number") return res else: return _encode_numpy(values, uniques, encode, - check_unknown=check_unknown) + check_unknown=check_unknown, + return_counts=return_counts) def _encode_check_unknown(values, uniques, return_mask=False): @@ -233,7 +270,7 @@ def fit(self, y): self : returns an instance of self. """ y = column_or_1d(y, warn=True) - self.classes_ = _encode(y) + self.classes_ = _encode(y)["uniques"] return self def fit_transform(self, y): @@ -249,8 +286,9 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ y = column_or_1d(y, warn=True) - self.classes_, y = _encode(y, encode=True) - return y + result = _encode(y, encode=True) + self.classes_ = result["uniques"] + return result["encoded"] def transform(self, y): """Transform labels to normalized encoding. @@ -270,8 +308,7 @@ def transform(self, y): if _num_samples(y) == 0: return np.array([]) - _, y = _encode(y, uniques=self.classes_, encode=True) - return y + return _encode(y, uniques=self.classes_, encode=True)["encoded"] def inverse_transform(self, y): """Transform labels back to original encoding. 
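The `_encode` refactor above changes the helper's return value from a tuple to a dict keyed by `'uniques'`, `'encoded'` and `'counts'`. As a rough standalone illustration of that contract (a sketch of the intended behaviour, not the private scikit-learn helper itself; `encode_sketch` is a hypothetical name), a NumPy-only equivalent could look like:

    import numpy as np

    def encode_sketch(values, uniques=None, encode=False,
                      return_counts=False):
        """Illustrative only: always return 'uniques', optionally
        'encoded' and 'counts', mirroring the dict contract above."""
        output = {}
        if uniques is None:
            uniques, counts = np.unique(values, return_counts=True)
        else:
            # counts are reported in the order of the provided uniques
            seen, seen_counts = np.unique(values, return_counts=True)
            lookup = dict(zip(seen.tolist(), seen_counts.tolist()))
            counts = np.array([lookup.get(u, 0) for u in uniques.tolist()])
        output['uniques'] = uniques
        if encode:
            table = {val: i for i, val in enumerate(uniques)}
            output['encoded'] = np.array([table[v] for v in values])
        if return_counts:
            output['counts'] = counts
        return output

    values = np.array(['b', 'a', 'c', 'a', 'c'], dtype=object)
    result = encode_sketch(values, encode=True, return_counts=True)
    print(result['uniques'])  # ['a' 'b' 'c']
    print(result['encoded'])  # [1 0 2 0 2]
    print(result['counts'])   # [2 1 2]

This mirrors the expectations exercised in `test_encode_util` below, including counts reported in the order of user-provided uniques.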
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 78590f40ffba5..2e81b9e6559e4 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -9,6 +9,7 @@ from sklearn.exceptions import NotFittedError from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import ignore_warnings from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder @@ -53,7 +54,10 @@ def test_one_hot_encoder_diff_n_features(): enc.transform(X2) -def test_one_hot_encoder_handle_unknown(): +# TODO: Remove when 'ignore' is deprecated in 0.25 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") +@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) +def test_one_hot_encoder_handle_unknown(handle_unknown): X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) X2 = np.array([[4, 1, 1]]) @@ -65,7 +69,7 @@ def test_one_hot_encoder_handle_unknown(): oh.transform(X2) # Test the ignore option, ignores unknown features (giving all 0's) - oh = OneHotEncoder(handle_unknown='ignore') + oh = OneHotEncoder(handle_unknown=handle_unknown) oh.fit(X) X2_passed = X2.copy() assert_array_equal( @@ -90,14 +94,17 @@ def test_one_hot_encoder_not_fitted(): enc.transform(X) -def test_one_hot_encoder_handle_unknown_strings(): +# TODO: Remove when 'ignore' is deprecated in 0.25 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") +@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) +def test_one_hot_encoder_handle_unknown_strings(handle_unknown): X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1)) X2 = np.array(['55555', '22']).reshape((-1, 1)) # Non Regression test for the issue #12470 # Test the ignore option, when categories are numpy string dtype # particularly when the known category strings are larger # than the unknown category strings - oh = OneHotEncoder(handle_unknown='ignore') + oh = OneHotEncoder(handle_unknown=handle_unknown) oh.fit(X) X2_passed = X2.copy() assert_array_equal( @@ -220,9 +227,12 @@ def test_one_hot_encoder(X): assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) +# TODO: Remove when 'ignore' is deprecated in 0.25 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") +@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) @pytest.mark.parametrize('sparse_', [False, True]) @pytest.mark.parametrize('drop', [None, 'first']) -def test_one_hot_encoder_inverse(sparse_, drop): +def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop): X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]] enc = OneHotEncoder(sparse=sparse_, drop=drop) X_tr = enc.fit_transform(X) @@ -240,7 +250,7 @@ def test_one_hot_encoder_inverse(sparse_, drop): # with unknown categories # drop is incompatible with handle_unknown=ignore X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]] - enc = OneHotEncoder(sparse=sparse_, handle_unknown='ignore', + enc = OneHotEncoder(sparse=sparse_, handle_unknown=handle_unknown, categories=[['abc', 'def'], [1, 2], [54, 55, 56]]) X_tr = enc.fit_transform(X) @@ -251,7 +261,7 @@ def test_one_hot_encoder_inverse(sparse_, drop): # with an otherwise numerical output, still object if unknown X = [[2, 55], [1, 55], [3, 55]] enc = OneHotEncoder(sparse=sparse_, categories=[[1, 2], [54, 56]], - handle_unknown='ignore') + handle_unknown=handle_unknown) X_tr = enc.fit_transform(X) exp = 
np.array(X, dtype=object) exp[2, 0] = None @@ -309,6 +319,9 @@ def test_one_hot_encoder_categories(X, cat_exp, cat_dtype): assert np.issubdtype(res.dtype, cat_dtype) +# TODO: Remove when 'ignore' is deprecated in 0.25 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") +@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) @pytest.mark.parametrize("X, X2, cats, cat_dtype", [ (np.array([['a', 'b']], dtype=object).T, np.array([['a', 'd']], dtype=object).T, @@ -320,7 +333,8 @@ def test_one_hot_encoder_categories(X, cat_exp, cat_dtype): np.array([['a', 'd']], dtype=object).T, [np.array(['a', 'b', 'c'])], np.object_), ], ids=['object', 'numeric', 'object-string-cat']) -def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype): +def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, + handle_unknown): enc = OneHotEncoder(categories=cats) exp = np.array([[1., 0., 0.], [0., 1., 0.]]) @@ -336,7 +350,7 @@ def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype): enc = OneHotEncoder(categories=cats) with pytest.raises(ValueError, match="Found unknown categories"): enc.fit(X2) - enc = OneHotEncoder(categories=cats, handle_unknown='ignore') + enc = OneHotEncoder(categories=cats, handle_unknown=handle_unknown) exp = np.array([[1., 0., 0.], [0., 0., 0.]]) assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp) @@ -398,12 +412,14 @@ def test_one_hot_encoder_feature_names_drop(drop, expected_names): assert_array_equal(expected_names, feature_names) +# TODO: Remove when 'ignore' is deprecated in 0.25 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, np.array([['a', np.nan]], dtype=object).T], ids=['numeric', 'object']) @pytest.mark.parametrize("as_data_frame", [False, True], ids=['array', 'dataframe']) -@pytest.mark.parametrize("handle_unknown", ['error', 'ignore']) +@pytest.mark.parametrize("handle_unknown", ['error', 'auto', 'ignore']) def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): if as_data_frame: pd = pytest.importorskip('pandas') @@ -637,3 +653,314 @@ def test_categories(density, drop): @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) def test_encoders_has_categorical_tags(Encoder): assert 'categorical' in Encoder()._get_tags()['X_types'] + + +@pytest.mark.parametrize("kwargs", [ + {'max_levels': 2}, + {'min_frequency': 11}, + {'min_frequency': 0.29}, + {'max_levels': 2, 'min_frequency': 6}, + {'max_levels': 4, 'min_frequency': 12}, +]) +@pytest.mark.parametrize("categories", + ["auto", [['a', 'b', 'c', 'd']]]) +def test_ohe_infrequent_two_levels(kwargs, categories): + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T + ohe = OneHotEncoder(categories=categories, + handle_unknown='auto', sparse=False, + **kwargs).fit(X_train) + assert_array_equal(ohe.infrequent_indices_, [[0, 2, 3]]) + + X_test = [['b'], ['a'], ['c'], ['d'], ['e']] + expected = np.array([ + [1, 0], + [0, 1], + [0, 1], + [0, 1], + [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [['b'], ['c'], ['c'], ['c'], ['c']] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + # The most frequent infrequent category becomes the feature name + feature_names = ohe.get_feature_names() + assert_array_equal(['x0_b', 'x0_c'], feature_names) + + +@pytest.mark.parametrize("kwargs", [ + {'max_levels': 3}, + {'min_frequency': 6}, + 
{'min_frequency': 9}, + {'min_frequency': 0.24}, + {'min_frequency': 0.16}, + {'max_levels': 3, 'min_frequency': 8}, + {'max_levels': 4, 'min_frequency': 6}, +]) +def test_ohe_infrequent_three_levels(kwargs): + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T + ohe = OneHotEncoder(handle_unknown='auto', sparse=False, + **kwargs).fit(X_train) + assert_array_equal(ohe.infrequent_indices_, [[0, 3]]) + + X_test = [['b'], ['a'], ['c'], ['d'], ['e']] + expected = np.array([ + [1, 0, 0], + [0, 0, 1], + [0, 1, 0], + [0, 0, 1], + [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [['b'], ['a'], ['c'], ['a'], ['a']] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + # The most frequent infrequent category becomes the feature name + feature_names = ohe.get_feature_names() + assert_array_equal(['x0_b', 'x0_c', 'x0_a'], feature_names) + + +def test_ohe_infrequent_two_levels_user_cats(): + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], + dtype=object).T + ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], + sparse=False, handle_unknown='auto', + max_levels=2).fit(X_train) + + assert_array_equal(ohe.infrequent_indices_, [[0, 1, 2]]) + + X_test = [['b'], ['a'], ['c'], ['d'], ['e']] + expected = np.array([ + [1, 0], + [0, 1], + [0, 1], + [0, 1], + [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [['b'], ['c'], ['c'], ['c'], ['c']] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_three_levels_user_cats(): + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], + dtype=object).T + ohe = OneHotEncoder(categories=[['c', 'd', 'b', 'a']], + sparse=False, handle_unknown='auto', + max_levels=3).fit(X_train) + + assert_array_equal(ohe.infrequent_indices_, [[1, 3]]) + + X_test = [['b'], ['a'], ['c'], ['d'], ['e']] + expected = np.array([ + [0, 1, 0], + [0, 0, 1], + [1, 0, 0], + [0, 0, 1], + [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [['b'], ['a'], ['c'], ['a'], ['a']] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_multiple_categories(): + X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], + [0, 0, 5, 1, 1, 10, 5, 5, 0], + [1, 0, 1, 0, 1, 0, 1, 0, 1]] + + ohe = OneHotEncoder(categories='auto', max_levels=3, + handle_unknown='auto') + # X[:, 0] 1 and 2 is infrequent + # X[:, 1] 1 and 10 are infrequent + # X[:, 2] nothing is infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_array_equal(ohe.infrequent_indices_[0], [1, 2]) + assert_array_equal(ohe.infrequent_indices_[1], [1, 3]) + assert_array_equal(ohe.infrequent_indices_[2], None) + + # The most frequent infrequent category becomes the feature name + # For the first column, 1 and 2 have the same frequency. 
In this case,
+    # 1 will be chosen to be the feature name because it is smaller
+    # lexicographically
+    feature_names = ohe.get_feature_names()
+    assert_array_equal(['x0_0', 'x0_3', 'x0_1',
+                        'x1_0', 'x1_5', 'x1_1',
+                        'x2_0', 'x2_1'], feature_names)
+
+    expected = [[1, 0, 0, 1, 0, 0, 0, 1],
+                [0, 0, 1, 1, 0, 0, 1, 0],
+                [0, 1, 0, 0, 1, 0, 0, 1],
+                [0, 1, 0, 0, 0, 1, 1, 0],
+                [0, 1, 0, 0, 0, 1, 0, 1],
+                [0, 1, 0, 0, 0, 1, 1, 0],
+                [0, 0, 1, 0, 1, 0, 0, 1],
+                [1, 0, 0, 0, 1, 0, 1, 0],
+                [0, 1, 0, 1, 0, 0, 0, 1]]
+
+    assert_allclose(expected, X_trans)
+
+    X_test = [[3, 1, 2],
+              [4, 0, 3]]
+
+    X_test_trans = ohe.transform(X_test)
+
+    # X[:, 2] does not have an infrequent category, thus it is encoded as all
+    # zeros
+    expected = [[0, 1, 0, 0, 0, 1, 0, 0],
+                [0, 0, 1, 1, 0, 0, 0, 0]]
+    assert_allclose(expected, X_test_trans.toarray())
+
+    X_inv = ohe.inverse_transform(X_test_trans)
+    expected_inv = np.array([[3, 1, None],
+                             [1, 0, None]], dtype=object)
+    assert_array_equal(expected_inv, X_inv)
+
+    # error for unknown categories
+    ohe = OneHotEncoder(categories='auto', max_levels=3,
+                        handle_unknown='error').fit(X)
+    with pytest.raises(ValueError, match="Found unknown categories"):
+        ohe.transform(X_test)
+
+    # only infrequent or known categories
+    X_test = [[1, 1, 1],
+              [3, 10, 0]]
+    X_test_trans = ohe.transform(X_test)
+
+    expected = [[0, 0, 1, 0, 0, 1, 0, 1],
+                [0, 1, 0, 0, 0, 1, 1, 0]]
+    assert_allclose(expected, X_test_trans.toarray())
+
+    X_inv = ohe.inverse_transform(X_test_trans)
+
+    expected_inv = [[1, 1, 1],
+                    [3, 1, 0]]
+    assert_allclose(expected_inv, X_inv)
+
+
+def test_ohe_infrequent_multiple_categories_dtypes():
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame(
+        {'str': ['a', 'f', 'c', 'f', 'f', 'a', 'c', 'b', 'b'],
+         'int': [5, 3, 0, 10, 10, 12, 0, 3, 5]},
+        columns=['str', 'int'])
+
+    ohe = OneHotEncoder(categories='auto', max_levels=3,
+                        handle_unknown='auto')
+    # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be
+    # considered infrequent because the stable sort drops the earlier tied
+    # categories first
+
+    # X[:, 1] 0, 3, 5, 10 have frequency 2 and 12 has frequency 1.
+ # 0, 3, 12 will be considered infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_allclose(ohe.infrequent_indices_[0], [0, 1]) + assert_allclose(ohe.infrequent_indices_[1], [0, 1, 4]) + + expected = [[0, 0, 1, 1, 0, 0], + [0, 1, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 0, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 0]] + + assert_allclose(expected, X_trans) + + X_test = pd.DataFrame( + {'str': ['b', 'f'], + 'int': [14, 12]}, + columns=['str', 'int']) + + expected = [[0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 0, 1]] + X_test_trans = ohe.transform(X_test) + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array([['a', 0], ['f', 0]], dtype=object) + assert_array_equal(expected_inv, X_inv) + + # error for unknown categories + ohe = OneHotEncoder(categories='auto', max_levels=3, + handle_unknown='error').fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + ohe.transform(X_test) + + # only infrequent or known categories + X_test = pd.DataFrame( + {'str': ['c', 'b'], + 'int': [12, 5]}, + columns=['str', 'int']) + X_test_trans = ohe.transform(X_test).toarray() + expected = [[1, 0, 0, 0, 0, 1], + [0, 0, 1, 1, 0, 0]] + assert_allclose(expected, X_test_trans) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array([['c', 0], ['a', 5]], dtype=object) + assert_array_equal(expected_inv, X_inv) + + +@pytest.mark.parametrize("min_frequency", [21]) +def test_ohe_infrequent_one_level_errors(min_frequency): + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T + + ohe = OneHotEncoder(handle_unknown='auto', sparse=False, + min_frequency=min_frequency) + + msg = "All categories in column 0 are infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + +# TODO: Remove when 'ignore' is deprecated in 0.25 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") +@pytest.mark.parametrize("kwargs, error_msg", [ + ({'max_levels': 1}, 'max_levels must be greater than 1'), + ({'max_levels': -2}, 'max_levels must be greater than 1'), + ({'min_frequency': -1}, 'min_frequency must be an integer at least'), + ({'min_frequency': 1.1}, 'min_frequency must be an integer at least'), + ({'max_levels': 2, 'drop': 'first', 'handle_unknown': 'error'}, + "infrequent categories are not supported when drop is specified"), + ({'handle_unknown': 'ignore', 'max_levels': 2}, + "infrequent categories are only supported when handle_unknown is " + "'error' or 'auto'") +]) +def test_ohe_infrequent_invalid_parameters_error(kwargs, error_msg): + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T + + default_kwargs = {**{'handle_unknown': 'auto'}, **kwargs} + ohe = OneHotEncoder(**default_kwargs) + + with pytest.raises(ValueError, match=error_msg): + ohe.fit(X_train) + + +# TODO: Remove in 0.25 when 'ignore' is deprecated +def test_ohe_ignore_deprecated(): + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T + ohe = OneHotEncoder(handle_unknown='ignore') + + msg = (r"handle_unknown='ignore' is deprecated in favor of 'auto' in " + r"version 0\.23 and will be removed in version 0\.25") + with pytest.warns(FutureWarning, match=msg): + ohe.fit(X_train) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 6cdb198182a20..408489a27954c 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ 
b/sklearn/preprocessing/tests/test_label.py @@ -626,13 +626,39 @@ def test_inverse_binarize_multiclass(): np.array(['a', 'b', 'c']))], ids=['int64', 'object', 'str']) def test_encode_util(values, expected): - uniques = _encode(values) + uniques = _encode(values)['uniques'] assert_array_equal(uniques, expected) - uniques, encoded = _encode(values, encode=True) - assert_array_equal(uniques, expected) - assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) - _, encoded = _encode(values, uniques, encode=True) - assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) + + result = _encode(values, encode=True) + assert_array_equal(result['uniques'], expected) + assert_array_equal(result['encoded'], np.array([1, 0, 2, 0, 2])) + + result = _encode(values, uniques, encode=True) + assert_array_equal(result['uniques'], expected) + assert_array_equal(result['encoded'], np.array([1, 0, 2, 0, 2])) + + result = _encode(values, return_counts=True) + assert_array_equal(result['uniques'], expected) + assert_array_equal(result['counts'], np.array([2, 1, 2])) + + result = _encode(values, encode=True, return_counts=True) + assert_array_equal(result['uniques'], expected) + assert_array_equal(result['counts'], np.array([2, 1, 2])) + assert_array_equal(result['encoded'], np.array([1, 0, 2, 0, 2])) + + result = _encode(values, uniques, return_counts=True) + assert_array_equal(result['uniques'], expected) + assert_array_equal(result['counts'], np.array([2, 1, 2])) + + +def test_encode_util_uniques_unordered(): + # The return counts are ordered based on the order of uniques + + values = np.array(['b'] * 21 + ['c'] * 5 + ['a'] * 11, dtype=object) + result = _encode(values, np.array(['a', 'c', 'b']), return_counts=True) + + assert_array_equal(result['uniques'], np.array(['a', 'c', 'b'])) + assert_array_equal(result['counts'], [11, 5, 21]) def test_encode_check_unknown(): From 66136459ea93524100b91df6897dd7edae98e732 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 3 Jan 2020 13:15:18 -0500 Subject: [PATCH 02/92] STY Linting --- sklearn/preprocessing/tests/test_encoders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 2e81b9e6559e4..33203d84f3137 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -9,7 +9,6 @@ from sklearn.exceptions import NotFittedError from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import ignore_warnings from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder From 741bd10f287d13732e8814f0590c622fad3d6aec Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 3 Jan 2020 13:46:48 -0500 Subject: [PATCH 03/92] STY Linting --- sklearn/preprocessing/_encoders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 68c79087b10c7..9cc34295a9dd0 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -342,7 +342,6 @@ def __init__(self, categories='auto', drop=None, sparse=True, def _validate_keywords(self): - if self.handle_unknown not in ('error', 'ignore', 'auto'): msg = ("handle_unknown should be either 'error', 'ignore', 'auto'" "got {0}.".format(self.handle_unknown)) From f1ba19130090a76c5fa97b7770488c0102ea9419 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 3 Jan 2020 14:29:13 -0500 Subject: [PATCH 
04/92] DOC Improves wording --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 9cc34295a9dd0..d4a2665f9d34d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -251,7 +251,7 @@ class OneHotEncoder(_BaseEncoder): an unknown category will be the most frequent infrequent category. .. versionadded:: 0.23 - 'auto' was added + 'auto' was added to automatically handle unknown categories .. deprecated:: 0.23 'ignore' is deprecated in favor of 'auto' From ae3f8731de5c3c48d4070209534bc9fa5e30b7f5 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 4 Jan 2020 00:39:50 -0500 Subject: [PATCH 05/92] DOC Lint --- sklearn/preprocessing/_encoders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d4a2665f9d34d..c97ed10e91aad 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -261,6 +261,7 @@ class OneHotEncoder(_BaseEncoder): - If int, categories with a cardinality smaller will be considered infrequent. + - If float, categories with a cardinality smaller than this fraction of the total number of samples will be considered infrequent. From dc4249b7ec38268ff81adc46bee1d8c8d56a6c14 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 6 Jan 2020 14:52:34 -0500 Subject: [PATCH 06/92] BUG Fixes --- sklearn/preprocessing/_encoders.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c97ed10e91aad..f3462ca0ef9ef 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -240,15 +240,14 @@ class OneHotEncoder(_BaseEncoder): will be denoted as None. When this parameter is set to 'auto' and an unknown category is - encountered during transform + encountered in transform: 1. If there was no infrequent category during training, the resulting one-hot encoded columns for this feature will be be all zeros. In the inverse transform, an unknown category will be denoted as None. - 2. If there is an infrequent category during training, the unknown category will be considered infrequent. In the inverse transform, - an unknown category will be the most frequent infrequent category. + an unknown category will be the most frequent infrequent category .. versionadded:: 0.23 'auto' was added to automatically handle unknown categories @@ -259,10 +258,9 @@ class OneHotEncoder(_BaseEncoder): min_frequency : int or float, default=1 Specifics the categories to be considered infrequent. - - If int, categories with a cardinality smaller will be considered + 1. If int, categories with a cardinality smaller will be considered infrequent. - - - If float, categories with a cardinality smaller than this fraction + 2. If float, categories with a cardinality smaller than this fraction of the total number of samples will be considered infrequent. .. versionadded:: 0.23 @@ -307,10 +305,10 @@ class OneHotEncoder(_BaseEncoder): values per feature and transform the data to a binary one-hot encoding. 
>>> from sklearn.preprocessing import OneHotEncoder - >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> enc = OneHotEncoder(handle_unknown='auto') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - OneHotEncoder(handle_unknown='ignore') + OneHotEncoder(handle_unknown='auto') >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() From c070f16662d6701c91c9f569d9d5ae3054318471 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Jan 2020 10:45:33 -0500 Subject: [PATCH 07/92] CLN Address comments --- doc/modules/preprocessing.rst | 49 ++++++++++++------ sklearn/preprocessing/_encoders.py | 43 ++++++++++------ sklearn/preprocessing/tests/test_encoders.py | 52 +++++++++++++++++++- 3 files changed, 112 insertions(+), 32 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a97f8182bf3f9..afd5e0e84d3ae 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -579,38 +579,57 @@ as a dict, not as scalars. Infrequent categories --------------------- -:class:`OneHotEncoder` supports creating a category for infrequent categories -in the training data. The parameters to enable the gathering of infrequent -categories are `min_frequency` and `max_levels`. +:class:`OneHotEncoder` supports outputing a feature that combines infrequent +categories in the training data. The parameters to enable the gathering of +infrequent categories are `min_frequency` and `max_levels`. -1. `min_frequency` can be a integer greater or equal to one or a float in +1. `min_frequency` can be a integer greater or equal to 1, or a float in `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality -smaller than this value will be considered infrequent. If `min_frequency` is an +smaller than this value will be considered infrequent. If `min_frequency` is a float, categories with a cardinality smaller than this fraction of the total number of samples will be considered infrequent. -2. `max_levels` can be `None` or any integer greater than one. This parameter -sets an upper limit of the number of categories including the infrequent -category. +2. `max_levels` can be `None` or any integer greater than 1. This parameter +sets an upper limit to the number of output features for each input feature. +`max_levels` includes the feature that combines infrequent categories. -These parameters can be used together to filter out infrequent categories. In -the following example, the categories, `'dog', 'cat'`, are considered infrequent:: +In the following example, the categories, `'dog', 'snake'`, are considered +infrequent:: >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + ... ['snake'] * 3]).T >>> enc = preprocessing.OneHotEncoder(min_frequency=6, ... 
handle_unknown='auto').fit(X) - >>> enc.transform([['dog']]).toarray() - array([[0., 0., 1.]]) - >>> enc.transform([['rabbit']]).toarray() - array([[0., 1., 0.]]) - + >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]).toarray() + array([[0., 0., 1.], + [1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + By setting handle_unknown to `'auto'`, unknown categories will be considered infrequent:: >>> enc.transform([['dragon']]).toarray() array([[0., 0., 1.]]) +:meth:`OneHotEncoder.get_feature_names` uses the most frequent infrequent +category as the feature name for the infrequent feature name:: + + >>> enc.get_feature_names() + array(['x0_cat', 'x0_rabbit', 'x0_dog'], dtype=object) + +`min_frequency` and `max_levels` can be used together to filter out infrequent +categories. In the following example, setting `max_levels` to 2 limits the +number of output features:: + + >>> enc = preprocessing.OneHotEncoder(min_frequency=6, max_levels=2, + ... handle_unknown='auto').fit(X) + >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]).toarray() + array([[0., 1.], + [1., 0.], + [0., 1.], + [0., 1.]]) + .. _preprocessing_discretization: Discretization diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f3462ca0ef9ef..5da552397fbc2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -242,12 +242,14 @@ class OneHotEncoder(_BaseEncoder): When this parameter is set to 'auto' and an unknown category is encountered in transform: - 1. If there was no infrequent category during training, the resulting - one-hot encoded columns for this feature will be be all zeros. In - the inverse transform, an unknown category will be denoted as None. - 2. If there is an infrequent category during training, the unknown - category will be considered infrequent. In the inverse transform, - an unknown category will be the most frequent infrequent category + 1. If there was no infrequent category during training, the + resulting one-hot encoded columns for this feature will be be all + zeros. In the inverse transform, an unknown category will be + denoted as None. + + 2. If there is an infrequent category during training, the unknown + category will be considered infrequent. In the inverse transform, + an unknown category will be the most frequent infrequent category .. versionadded:: 0.23 'auto' was added to automatically handle unknown categories @@ -256,19 +258,22 @@ class OneHotEncoder(_BaseEncoder): 'ignore' is deprecated in favor of 'auto' min_frequency : int or float, default=1 - Specifics the categories to be considered infrequent. + Specifies the categories to be considered infrequent. + + 1. If int, categories with a cardinality smaller will be considered + infrequent. - 1. If int, categories with a cardinality smaller will be considered - infrequent. - 2. If float, categories with a cardinality smaller than this fraction - of the total number of samples will be considered infrequent. + 2. If float, categories with a cardinality smaller than this + fraction of the total number of samples will be considered + infrequent. .. versionadded:: 0.23 max_levels : int, default=None - Specifies the categories to be considered infrequent. Sets an upper - limit to the number of categories including the infrequent category. - If `None` there is no limit to the number of categories. + Specifies an upper limit to the number of output features for each + input feature when considering infrequent categories. 
`max_levels` + includes the feature that combines infrequent categories. If `None` + there is no limit to the number of output features. .. versionadded:: 0.23 @@ -448,7 +453,8 @@ def _compute_infrequent_indicies(self, category_count, n_samples, col_idx): If there are infrequent categories, indicies of infrequent categories. Otherwise None. """ - infrequent_mask = np.zeros_like(category_count, dtype=bool) + # categories with no count are infrequent + infrequent_mask = category_count == 0 if isinstance(self.min_frequency, numbers.Integral): if self.min_frequency > 1: @@ -462,7 +468,6 @@ def _compute_infrequent_indicies(self, category_count, n_samples, col_idx): if (self.max_levels is not None and self.max_levels > 1 and self.max_levels < category_count.size): - # stable sort to preserve original count order smallest_levels = np.argsort(category_count, kind='mergesort' )[:-self.max_levels + 1] @@ -742,6 +747,9 @@ def inverse_transform(self, X): In case unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. + For a given input feature, if there is an infrequent category, the most + frequent infrequent category will be used to represent this category. + Parameters ---------- X : array-like or sparse matrix, shape [n_samples, n_encoded_features] @@ -825,6 +833,9 @@ def get_feature_names(self, input_features=None): """ Return feature names for output features. + For a given input feature, if there is an infrequent category, the most + frequent infrequent category will be used as a feature name. + Parameters ---------- input_features : list of str of shape (n_features,) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 33203d84f3137..28680f07be022 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -664,6 +664,8 @@ def test_encoders_has_categorical_tags(Encoder): @pytest.mark.parametrize("categories", ["auto", [['a', 'b', 'c', 'd']]]) def test_ohe_infrequent_two_levels(kwargs, categories): + # Test that different parameters for combine 'a', 'c', and 'd' into + # the infrequent category works as expected X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(categories=categories, @@ -701,6 +703,8 @@ def test_ohe_infrequent_two_levels(kwargs, categories): {'max_levels': 4, 'min_frequency': 6}, ]) def test_ohe_infrequent_three_levels(kwargs): + # Test that different parameters for combine 'a', and 'd' into + # the infrequent category works as expected X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(handle_unknown='auto', sparse=False, @@ -727,8 +731,32 @@ def test_ohe_infrequent_three_levels(kwargs): assert_array_equal(['x0_b', 'x0_c', 'x0_a'], feature_names) -def test_ohe_infrequent_two_levels_user_cats(): +@pytest.mark.parametrize("kwargs", [{'max_levels': 3}, + {'min_frequency': 4}]) +def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): + # 'a' is the only frequent category, all other categories are infrequent + + X_train = np.array([['a'] * 5 + ['e'] * 30], dtype=object).T + ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], + sparse=False, handle_unknown='auto', + **kwargs).fit(X_train) + + X_test = [['a'], ['b'], ['c'], ['d'], ['e']] + expected = np.array([ + [1, 0], + [0, 1], + [0, 1], + [0, 1], + [0, 1]]) + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + +def 
test_ohe_infrequent_two_levels_user_cats(): + # Test that the order of the categories provided by a user is respected. + # Specifically, the infrequent_indicies_ correspond to the user provided + # categories. X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], @@ -748,12 +776,16 @@ def test_ohe_infrequent_two_levels_user_cats(): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) + # The most frequent infrquent category is used for the inverse transform expected_inv = [['b'], ['c'], ['c'], ['c'], ['c']] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) def test_ohe_infrequent_three_levels_user_cats(): + # Test that the order of the categories provided by a user is respected. + # In this case 'c' is encoded as the first category and 'b' is encoded + # as the second one X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], dtype=object).T @@ -774,12 +806,15 @@ def test_ohe_infrequent_three_levels_user_cats(): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) + # The most frequent infrquent category is used for the inverse transform expected_inv = [['b'], ['a'], ['c'], ['a'], ['a']] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) def test_ohe_infrequent_multiple_categories(): + # Test infrequent categories with feature matrix with 3 features + X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 5, 1, 1, 10, 5, 5, 0], [1, 0, 1, 0, 1, 0, 1, 0, 1]] @@ -854,6 +889,8 @@ def test_ohe_infrequent_multiple_categories(): def test_ohe_infrequent_multiple_categories_dtypes(): + # Test infrequent categories with a pandas dataframe with multiple dtypes + pd = pytest.importorskip("pandas") X = pd.DataFrame( {'str': ['a', 'f', 'c', 'f', 'f', 'a', 'c', 'b', 'b'], @@ -931,6 +968,19 @@ def test_ohe_infrequent_one_level_errors(min_frequency): ohe.fit(X_train) +@pytest.mark.parametrize("kwargs", [{'min_frequency': 2, 'max_levels': 3}]) +def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): + # All user provided categories are infrequent + + X_train = np.array([['e'] * 3], dtype=object).T + ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], + sparse=False, handle_unknown='auto', **kwargs) + + msg = "All categories in column 0 are infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + # TODO: Remove when 'ignore' is deprecated in 0.25 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("kwargs, error_msg", [ From 3400e070e05bdc183151949dd10732c6f75471f0 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Jan 2020 12:15:00 -0500 Subject: [PATCH 08/92] CLN Address comments --- doc/modules/preprocessing.rst | 5 +- sklearn/preprocessing/_encoders.py | 60 ++++++++++++++++------- sklearn/preprocessing/tests/test_label.py | 2 +- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index afd5e0e84d3ae..2c5e6820690d1 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -580,8 +580,9 @@ Infrequent categories --------------------- :class:`OneHotEncoder` supports outputing a feature that combines infrequent -categories in the training data. The parameters to enable the gathering of -infrequent categories are `min_frequency` and `max_levels`. +categories in the training data. 
For each input feature that has a infrequent +category a new column is formed to represent it. The parameters to enable the +gathering of infrequent categories are `min_frequency` and `max_levels`. 1. `min_frequency` can be a integer greater or equal to 1, or a float in `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5da552397fbc2..09063478dda8a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -161,6 +161,7 @@ def _transform(self, X, handle_unknown='error', if process_valid_mask is not None: valid_mask = process_valid_mask(valid_mask, i) + X_mask[:, i] = valid_mask # We use check_unknown=False, since _encode_check_unknown was @@ -243,7 +244,7 @@ class OneHotEncoder(_BaseEncoder): encountered in transform: 1. If there was no infrequent category during training, the - resulting one-hot encoded columns for this feature will be be all + resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category will be denoted as None. @@ -263,7 +264,7 @@ class OneHotEncoder(_BaseEncoder): 1. If int, categories with a cardinality smaller will be considered infrequent. - 2. If float, categories with a cardinality smaller than this + 2. If float, categories with a cardinality smaller than the fraction of the total number of samples will be considered infrequent. @@ -290,6 +291,11 @@ class OneHotEncoder(_BaseEncoder): be dropped for each feature. None if all the transformed features will be retained. + infrequent_indices_ : list of shape (n_features,) + `infrequent_indices_[i]` is an array of indices corresponding to + `categories_[i]` of the infrequent categories. `infrequent_indices_[i]` + is None if the ith input feature has no infrequent categories. + See Also -------- sklearn.preprocessing.OrdinalEncoder : Performs an ordinal (integer) @@ -432,7 +438,7 @@ def _infrequent_enabled(self): (isinstance(self.min_frequency, numbers.Real) and 0.0 < self.min_frequency < 1.0)) - def _compute_infrequent_indicies(self, category_count, n_samples, col_idx): + def _identify_infrequent(self, category_count, n_samples, col_idx): """Compute the infrequent indicies based on max_levels and min_frequency. @@ -480,8 +486,15 @@ def _compute_infrequent_indicies(self, category_count, n_samples, col_idx): .format(col_idx)) return output if output.size > 0 else None - def _compute_infrequent_categories(self, category_counts, n_samples): - """Compute infrequent categories. + def _fit_infrequent_category_mapping(self, category_counts, n_samples): + """Fit infrequent categories. + + Defines: + 1. infrequent_indices_ to be the categories that are infrequent. + 2. _default_to_infrequent_mappings to be the mapping from the + default mapping provided by _encode to the infrequent categories + 3. 
_largest_infreq_indices to be the indices of the most frequent + infrequent category Parameters ---------- @@ -492,8 +505,7 @@ def _compute_infrequent_categories(self, category_counts, n_samples): number of samples """ self.infrequent_indices_ = [ - self._compute_infrequent_indicies(category_count, n_samples, - col_idx) + self._identify_infrequent(category_count, n_samples, col_idx) for col_idx, category_count in enumerate(category_counts)] # compute mapping from default mapping to infrequent mapping @@ -530,8 +542,7 @@ def _compute_infrequent_categories(self, category_counts, n_samples): def _map_to_infrequent_categories(self, X_int): """Map categories to infrequent categories. - - Note this will replace the encoding in X_int + This modifies X_int in-place. Parameters ---------- @@ -541,12 +552,10 @@ def _map_to_infrequent_categories(self, X_int): if not self._infrequent_enabled: return - for col_idx, mapping in enumerate( - self._default_to_infrequent_mappings): - + for i, mapping in enumerate(self._default_to_infrequent_mappings): if mapping is None: continue - X_int[:, col_idx] = np.take(mapping, X_int[:, col_idx]) + X_int[:, i] = np.take(mapping, X_int[:, i]) def _get_default_invalid_category(self, col_idx): """Get default invalid category for column index during `_transform`. @@ -560,7 +569,20 @@ def _process_valid_mask(self, valid_mask, col_idx): """Process the valid mask during `_transform` This function is passed to `_transform` to adjust the mask depending - on if the infrequent column exist or not. + on if the infrequent column exists or not. + + Parameters + ---------- + valid_mask : array of shape (n_samples, ) + boolean mask representing if a sample was seen during training + + col_idx : int + column index + + Returns + ------- + valid_mask : array of shape (n_samples,) or None + boolean mask to use for constructing X_mask in `_transform`. """ if self.handle_unknown != 'auto': return valid_mask @@ -568,17 +590,17 @@ def _process_valid_mask(self, valid_mask, col_idx): # handle_unknown == 'auto' infrequent_idx = self.infrequent_indices_[col_idx] - # infrequent column does not exist + # infrequent column does not exists # returning the original mask to allow the column to be ignored if infrequent_idx is None: return valid_mask - # infrequent column exist + # infrequent column exists # the unknown categories will be mapped to the infrequent category return np.ones_like(valid_mask, dtype=bool) - def _compute_transformed_category(self, i): - """Compute the transformed category used for column `i`. + def _compute_transformed_categories(self, i): + """Compute the transformed categories used for column `i`. 1. Dropped columns are removed. 2. 
If there are infrequent categories, the infrequent category with @@ -651,7 +673,7 @@ def fit(self, X, y=None): """ self._validate_keywords() - process_counts = (self._compute_infrequent_categories + process_counts = (self._fit_infrequent_category_mapping if self._infrequent_enabled else None) self._fit(X, handle_unknown=self.handle_unknown, process_counts=process_counts) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 026acff1bdbe3..53db2af607fd9 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -652,7 +652,7 @@ def test_encode_util(values, expected): def test_encode_util_uniques_unordered(): - # The return counts are ordered based on the order of uniques + # Make sure the returned counts are ordered based on the order of uniques values = np.array(['b'] * 21 + ['c'] * 5 + ['a'] * 11, dtype=object) result = _encode(values, np.array(['a', 'c', 'b']), return_counts=True) From 5defa0bb71b47d313f9f5750a82b966a521cb83e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Jan 2020 12:22:00 -0500 Subject: [PATCH 09/92] DOC Uses math to description float min_frequency --- doc/modules/preprocessing.rst | 6 +++--- sklearn/preprocessing/_encoders.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 2c5e6820690d1..aa367eaf6570d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -586,9 +586,9 @@ gathering of infrequent categories are `min_frequency` and `max_levels`. 1. `min_frequency` can be a integer greater or equal to 1, or a float in `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality -smaller than this value will be considered infrequent. If `min_frequency` is a -float, categories with a cardinality smaller than this fraction of the -total number of samples will be considered infrequent. +smaller than `min_frequency * n_samples` will be considered infrequent. +If `min_frequency` is a float, categories with a cardinality smaller than this +fraction of the total number of samples will be considered infrequent. 2. `max_levels` can be `None` or any integer greater than 1. This parameter sets an upper limit to the number of output features for each input feature. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 09063478dda8a..7424440230247 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -264,9 +264,8 @@ class OneHotEncoder(_BaseEncoder): 1. If int, categories with a cardinality smaller will be considered infrequent. - 2. If float, categories with a cardinality smaller than the - fraction of the total number of samples will be considered - infrequent. + 2. If float, categories with a cardinality smaller than + `min_frequency * n_samples` will be considered infrequent. .. 
versionadded:: 0.23 From 35d24704146b6f4358462015ef779cd9926b6298 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Jan 2020 12:31:15 -0500 Subject: [PATCH 10/92] DOC Adds comment regarding drop --- sklearn/preprocessing/_encoders.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 7424440230247..b47b98768a17a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -218,7 +218,9 @@ class OneHotEncoder(_BaseEncoder): Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data - into a neural network or an unregularized regression. + into a neural network or an unregularized regression. Drop is not + support when `min_frequency` or `max_levels` is set to combine + infrequent categories. - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one From aec14305c6d5a61906b91e5e812057b1dcf2ca5e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 29 Jan 2020 10:52:46 -0500 Subject: [PATCH 11/92] BUG Fixes method name --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b47b98768a17a..e63473b507870 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -652,7 +652,7 @@ def _n_transformed_features(self): @property def _transformed_categories(self): """Transformed categories.""" - return [self._compute_transformed_category(i) + return [self._compute_transformed_categories(i) for i in range(len(self.categories_))] def fit(self, X, y=None): From a64ffdd83ecd5112b512e27199dd8f082b5b301a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 29 Jan 2020 11:00:10 -0500 Subject: [PATCH 12/92] DOC Clearer docstring --- sklearn/preprocessing/_encoders.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e63473b507870..db83a45eb22d2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -293,7 +293,8 @@ class OneHotEncoder(_BaseEncoder): be retained. infrequent_indices_ : list of shape (n_features,) - `infrequent_indices_[i]` is an array of indices corresponding to + Defined when `min_frequency` or `max_levels` is set to a non-default + value. `infrequent_indices_[i]` is an array of indices corresponding to `categories_[i]` of the infrequent categories. `infrequent_indices_[i]` is None if the ith input feature has no infrequent categories. 
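
As an illustration of the `infrequent_indices_` attribute documented in the
hunk above (a sketch assuming the API proposed in this patch series; at this
point in the series the parameter is still named `max_levels`, which a later
patch renames to `max_categories`)::

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # 'a' appears 5 times, 'b' 20 times, 'c' 10 times and 'd' 3 times, so
    # with max_levels=3 the two smallest categories 'a' and 'd' are grouped
    # into the infrequent category.
    X = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T
    enc = OneHotEncoder(handle_unknown='auto', sparse=False,
                        max_levels=3).fit(X)

    print(enc.categories_[0])          # categories in sorted order: a, b, c, d
    print(enc.infrequent_indices_[0])  # expected: [0 3], i.e. 'a' and 'd'
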
@@ -642,10 +643,10 @@ def _n_transformed_features(self): return output # infrequent is enabled - for col_idx, infreq_idx in enumerate(self.infrequent_indices_): + for i, infreq_idx in enumerate(self.infrequent_indices_): if infreq_idx is None: continue - output[col_idx] = output[col_idx] - infreq_idx.size + 1 + output[i] = output[i] - infreq_idx.size + 1 return output From f445018813784470a20b502577c1d08b053b4833 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 29 Jan 2020 11:08:16 -0500 Subject: [PATCH 13/92] TST Adds more tests --- sklearn/preprocessing/tests/test_encoders.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 28680f07be022..a0da16586c14c 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -956,6 +956,24 @@ def test_ohe_infrequent_multiple_categories_dtypes(): assert_array_equal(expected_inv, X_inv) +def test_ohe_infrequent_user_cats_with_many_zero_counts(): + # Only category 'd' is a frequent category. This should result in + # two columns. + + X_train = np.array([['e'] * 3 + ['d']], dtype=object).T + ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b', 'f', 'g']], + max_levels=3, sparse=False, + handle_unknown='auto').fit(X_train) + + X_trans = ohe.transform([['c'], ['d'], ['a'], ['b'], ['e']]) + expected = [[0, 1], + [1, 0], + [0, 1], + [0, 1], + [0, 1]] + assert_array_equal(expected, X_trans) + + @pytest.mark.parametrize("min_frequency", [21]) def test_ohe_infrequent_one_level_errors(min_frequency): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T From 462b46c0d82b16bb4a13930d82cb71db62780a42 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 11 Feb 2020 14:43:56 -0500 Subject: [PATCH 14/92] FIX Fixes mege --- sklearn/preprocessing/_encoders.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 2da74ae25175e..0d8169e63bb34 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -641,7 +641,8 @@ def _compute_transformed_categories(self, i): cats = self.categories_[i] if self.drop is not None: - # early exit because infrequent categories and drop is forbidden + if self.drop_idx_[i] == -1: + return cats return np.delete(cats, self.drop_idx_[i]) # drop is None @@ -663,12 +664,13 @@ def _compute_transformed_categories(self, i): def _n_transformed_features(self): """Number of transformed features.""" if self.drop is not None: - if self.drop == 'first': - return [len(cats) - 1 for cats in self.categories_] - - # drop == 'if_binary - return [1 if len(cats) == 2 else len(cats) - for cats in self.categories_] + output = [] + for i, cats in enumerate(self.categories_): + if self.drop_idx_[i] == -1: + output.append(len(cats)) + else: + output.append(len(cats) - 1) + return output # drop is None output = [len(cats) for cats in self.categories_] From a920d37b70a8a03149a4ce98c62f810d61900060 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 11 Feb 2020 14:47:05 -0500 Subject: [PATCH 15/92] CLN More pythonic --- sklearn/preprocessing/_encoders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0d8169e63bb34..88bc52dff734e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -665,8 +665,8 @@ 
def _n_transformed_features(self): """Number of transformed features.""" if self.drop is not None: output = [] - for i, cats in enumerate(self.categories_): - if self.drop_idx_[i] == -1: + for drop_idx, cats in zip(self.drop_idx_, self.categories_): + if drop_idx == -1: output.append(len(cats)) else: output.append(len(cats) - 1) From 9398229122c3ebafbf6723aa4a2754fab23b1dbe Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 11 Feb 2020 16:13:15 -0500 Subject: [PATCH 16/92] CLN Address comments --- doc/modules/preprocessing.rst | 30 +++--- sklearn/preprocessing/_encoders.py | 67 ++++++------ sklearn/preprocessing/tests/test_encoders.py | 101 ++++++++++++------- 3 files changed, 115 insertions(+), 83 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 3fb904cc7604e..00a9b56a94e3f 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -538,17 +538,17 @@ dataset:: array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]]) If there is a possibility that the training data might have missing categorical -features, it can often be better to specify ``handle_unknown='ignore'`` instead +features, it can often be better to specify ``handle_unknown='auto'`` instead of setting the ``categories`` manually as above. When -``handle_unknown='ignore'`` is specified and unknown categories are encountered +``handle_unknown='auto'`` is specified and unknown categories are encountered during transform, no error will be raised but the resulting one-hot encoded columns for this feature will be all zeros -(``handle_unknown='ignore'`` is only supported for one-hot encoding):: +(``handle_unknown='auto'`` is only supported for one-hot encoding):: - >>> enc = preprocessing.OneHotEncoder(handle_unknown='ignore') + >>> enc = preprocessing.OneHotEncoder(handle_unknown='auto') >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] >>> enc.fit(X) - OneHotEncoder(handle_unknown='ignore') + OneHotEncoder(handle_unknown='auto') >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray() array([[1., 0., 0., 0., 0., 0.]]) @@ -601,7 +601,7 @@ Infrequent categories :class:`OneHotEncoder` supports outputing a feature that combines infrequent categories in the training data. For each input feature that has a infrequent category a new column is formed to represent it. The parameters to enable the -gathering of infrequent categories are `min_frequency` and `max_levels`. +gathering of infrequent categories are `min_frequency` and `max_categories`. 1. `min_frequency` can be a integer greater or equal to 1, or a float in `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality @@ -609,9 +609,9 @@ smaller than `min_frequency * n_samples` will be considered infrequent. If `min_frequency` is a float, categories with a cardinality smaller than this fraction of the total number of samples will be considered infrequent. -2. `max_levels` can be `None` or any integer greater than 1. This parameter +2. `max_categories` can be `None` or any integer greater than 1. This parameter sets an upper limit to the number of output features for each input feature. -`max_levels` includes the feature that combines infrequent categories. +`max_categories` includes the feature that combines infrequent categories. 
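
The float form of `min_frequency` described above translates into a count
threshold of `min_frequency * n_samples`, as stated in the `_encoders.py`
docstring earlier in this series. A small sketch of that arithmetic for the
dataset used in the example below (the value `0.2` is only an assumption for
illustration, not taken from the patch)::

    # 38 training samples: 5 'dog', 20 'cat', 10 'rabbit' and 3 'snake'
    n_samples = 5 + 20 + 10 + 3
    min_frequency = 0.2                     # assumed value for illustration
    threshold = min_frequency * n_samples   # 7.6
    # 'dog' (5) and 'snake' (3) fall below the threshold and are infrequent;
    # 'cat' (20) and 'rabbit' (10) remain regular categories.
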
In the following example, the categories, `'dog', 'snake'`, are considered infrequent:: @@ -632,17 +632,17 @@ infrequent:: >>> enc.transform([['dragon']]).toarray() array([[0., 0., 1.]]) -:meth:`OneHotEncoder.get_feature_names` uses the most frequent infrequent -category as the feature name for the infrequent feature name:: +:meth:`OneHotEncoder.get_feature_names` uses 'infrequent' as the infrequent +feature name:: >>> enc.get_feature_names() - array(['x0_cat', 'x0_rabbit', 'x0_dog'], dtype=object) + array(['x0_cat', 'x0_rabbit', 'x0_infrequent'], dtype=object) -`min_frequency` and `max_levels` can be used together to filter out infrequent -categories. In the following example, setting `max_levels` to 2 limits the -number of output features:: +`min_frequency` and `max_categories` can be used together to filter out +infrequent categories. In the following example, setting `max_categories` to 2 +limits the number of output features:: - >>> enc = preprocessing.OneHotEncoder(min_frequency=6, max_levels=2, + >>> enc = preprocessing.OneHotEncoder(min_frequency=6, max_categories=2, ... handle_unknown='auto').fit(X) >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]).toarray() array([[0., 1.], diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 88bc52dff734e..a3e0e2930eded 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -220,7 +220,7 @@ class OneHotEncoder(_BaseEncoder): feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression. Drop is not - support when `min_frequency` or `max_levels` is set to combine + support when `min_frequency` or `max_categories` is set to combine infrequent categories. - None : retain all features (the default). @@ -275,9 +275,9 @@ class OneHotEncoder(_BaseEncoder): .. versionadded:: 0.23 - max_levels : int, default=None + max_categories : int, default=None Specifies an upper limit to the number of output features for each - input feature when considering infrequent categories. `max_levels` + input feature when considering infrequent categories. `max_categories` includes the feature that combines infrequent categories. If `None` there is no limit to the number of output features. @@ -301,10 +301,10 @@ class OneHotEncoder(_BaseEncoder): ``drop_idx_ = None`` if all the transformed features will be retained. infrequent_indices_ : list of shape (n_features,) - Defined when `min_frequency` or `max_levels` is set to a non-default - value. `infrequent_indices_[i]` is an array of indices corresponding to - `categories_[i]` of the infrequent categories. `infrequent_indices_[i]` - is None if the ith input feature has no infrequent categories. + Defined when `min_frequency` or `max_categories` is set to a + non-default value. `infrequent_indices_[i]` is an array of indices corresponding to `categories_[i]` of the infrequent categories. + `infrequent_indices_[i]` is None if the ith input feature has no + infrequent categories. 
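
A usage note on the `drop` restriction mentioned in the docstring hunk above:
the combination is rejected at fit time, as exercised by the
parameter-validation tests later in this series (a sketch assuming the
proposed API)::

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T
    enc = OneHotEncoder(drop='first', max_categories=2,
                        handle_unknown='error')
    try:
        enc.fit(X)
    except ValueError as exc:
        # expected message, taken from the tests:
        # "infrequent categories are not supported when drop is specified"
        print(exc)
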
See Also -------- @@ -364,14 +364,14 @@ class OneHotEncoder(_BaseEncoder): def __init__(self, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error', - min_frequency=1, max_levels=None): + min_frequency=1, max_categories=None): self.categories = categories self.sparse = sparse self.dtype = dtype self.handle_unknown = handle_unknown self.drop = drop self.min_frequency = min_frequency - self.max_levels = max_levels + self.max_categories = max_categories def _validate_keywords(self): @@ -402,8 +402,8 @@ def _validate_keywords(self): raise ValueError("infrequent categories are only supported " "when handle_unknown is 'error' or 'auto'") - if self.max_levels is not None and self.max_levels <= 1: - raise ValueError("max_levels must be greater than 1") + if self.max_categories is not None and self.max_categories <= 1: + raise ValueError("max_categories must be greater than 1") if isinstance(self.min_frequency, numbers.Integral): if not self.min_frequency >= 1: @@ -464,14 +464,14 @@ def _compute_drop_idx(self): @property def _infrequent_enabled(self): """Infrequent category is enabled.""" - return (self.max_levels is not None and self.max_levels > 1 or + return (self.max_categories is not None and self.max_categories > 1 or (isinstance(self.min_frequency, numbers.Integral) and self.min_frequency > 1) or (isinstance(self.min_frequency, numbers.Real) and 0.0 < self.min_frequency < 1.0)) def _identify_infrequent(self, category_count, n_samples, col_idx): - """Compute the infrequent indicies based on max_levels and + """Compute the infrequent indicies based on max_categories and min_frequency. Parameters @@ -504,11 +504,11 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): category_mask = category_count < min_frequency_abs infrequent_mask |= category_mask - if (self.max_levels is not None and self.max_levels > 1 - and self.max_levels < category_count.size): + if (self.max_categories is not None and self.max_categories > 1 + and self.max_categories < category_count.size): # stable sort to preserve original count order smallest_levels = np.argsort(category_count, kind='mergesort' - )[:-self.max_levels + 1] + )[:-self.max_categories + 1] infrequent_mask[smallest_levels] = True output = np.flatnonzero(infrequent_mask) @@ -525,8 +525,6 @@ def _fit_infrequent_category_mapping(self, category_counts, n_samples): 1. infrequent_indices_ to be the categories that are infrequent. 2. _default_to_infrequent_mappings to be the mapping from the default mapping provided by _encode to the infrequent categories - 3. 
_largest_infreq_indices to be the indices of the most frequent - infrequent category Parameters ---------- @@ -542,14 +540,12 @@ def _fit_infrequent_category_mapping(self, category_counts, n_samples): # compute mapping from default mapping to infrequent mapping default_to_infrequent_mappings = [] - largest_infreq_idxs = [] for category_count, infreq_idx in zip(category_counts, self.infrequent_indices_): # no infrequent categories if infreq_idx is None: default_to_infrequent_mappings.append(None) - largest_infreq_idxs.append(None) continue # infrequent indicies exist @@ -565,12 +561,8 @@ def _fit_infrequent_category_mapping(self, category_counts, n_samples): default_to_infrequent_mappings.append(mapping) - # compute infrequent category with the largest cardinality - largest_infreq_idx = np.argmax(category_count[infreq_idx]) - largest_infreq_idxs.append(infreq_idx[largest_infreq_idx]) - self._default_to_infrequent_mappings = default_to_infrequent_mappings - self._largest_infreq_indices = largest_infreq_idxs + # self._largest_infreq_indices = largest_infreq_idxs def _map_to_infrequent_categories(self, X_int): """Map categories to infrequent categories. @@ -635,8 +627,9 @@ def _compute_transformed_categories(self, i): """Compute the transformed categories used for column `i`. 1. Dropped columns are removed. - 2. If there are infrequent categories, the infrequent category with - the largest cardinality is placed at the end. + 2. If there are infrequent categories, the category is named + 'infrequent'. If 'infrequent' is already a category, then then new + category is called 'infrequent_sklearn'. """ cats = self.categories_[i] @@ -654,11 +647,14 @@ def _compute_transformed_categories(self, i): if infreq_idx is None: return cats - largest_infreq_idx = self._largest_infreq_indices[i] - largest_infreq_cat = cats[largest_infreq_idx] frequent_indices = np.setdiff1d(np.arange(len(cats)), infreq_idx) - return np.r_[cats[frequent_indices], [largest_infreq_cat]] + if cats.dtype.kind in 'US' and 'infrequent' in cats: + infrequent_cat = 'infrequent_sklearn' + else: + infrequent_cat = 'infrequent' + return np.r_[cats[frequent_indices], + np.array([infrequent_cat], dtype=object)] @property def _n_transformed_features(self): @@ -812,8 +808,9 @@ def inverse_transform(self, X): In case unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. - For a given input feature, if there is an infrequent category, the most - frequent infrequent category will be used to represent this category. + For a given input feature, if there is an infrequent category, + 'infrequent' will be used to represent the category. If 'infrequent' + is already a category, 'infrequent_sklearn' will be used instead. Parameters ---------- @@ -839,7 +836,8 @@ def inverse_transform(self, X): raise ValueError(msg.format(n_transformed_features, X.shape[1])) # create resulting array of appropriate dtype - dt = np.find_common_type([cat.dtype for cat in self.categories_], []) + dt = np.find_common_type([cat.dtype + for cat in self._transformed_categories], []) X_tr = np.empty((n_samples, n_features), dtype=dt) j = 0 @@ -899,7 +897,8 @@ def get_feature_names(self, input_features=None): Return feature names for output features. For a given input feature, if there is an infrequent category, the most - frequent infrequent category will be used as a feature name. + 'infrequent' will be used as a feature name. If 'infrequent' is already + a category, 'infrequent_sklearn' will be used instead. 
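
The naming behaviour described above can be seen with the same toy data as in
the documentation example (a sketch assuming the proposed API)::

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 +
                  ['snake'] * 3]).T
    enc = OneHotEncoder(min_frequency=6, handle_unknown='auto').fit(X)

    # The grouped column is reported as 'infrequent'; it would instead be
    # 'infrequent_sklearn' if 'infrequent' were itself a training category.
    print(enc.get_feature_names())
    # expected (per the documentation example):
    # ['x0_cat' 'x0_rabbit' 'x0_infrequent']
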
Parameters ---------- diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index f143fdcdf6dc2..66a79681442a2 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -693,12 +693,41 @@ def test_encoders_has_categorical_tags(Encoder): assert 'categorical' in Encoder()._get_tags()['X_types'] +def test_ohe_infrequent_infrequent_is_a_cat(): + # category with 'infrequent' is a frequent category, ohe will name mangle + # this into 'infrequent_sklearn' + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['infrequent'] * 10 + + ['d'] * 3]).T + ohe = OneHotEncoder(handle_unknown='auto', sparse=False, + max_categories=3).fit(X_train) + ohe.fit(X_train) + + X_test = [['b'], ['a'], ['infrequent'], ['d']] + expected = np.array([ + [1, 0, 0], + [0, 0, 1], + [0, 1, 0], + [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [['b'], ['infrequent_sklearn'], + ['infrequent'], ['infrequent_sklearn']] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + feature_names = ohe.get_feature_names() + assert_array_equal(feature_names, + ['x0_b', 'x0_infrequent', 'x0_infrequent_sklearn']) + + @pytest.mark.parametrize("kwargs", [ - {'max_levels': 2}, + {'max_categories': 2}, {'min_frequency': 11}, {'min_frequency': 0.29}, - {'max_levels': 2, 'min_frequency': 6}, - {'max_levels': 4, 'min_frequency': 12}, + {'max_categories': 2, 'min_frequency': 6}, + {'max_categories': 4, 'min_frequency': 12}, ]) @pytest.mark.parametrize("categories", ["auto", [['a', 'b', 'c', 'd']]]) @@ -723,23 +752,23 @@ def test_ohe_infrequent_two_levels(kwargs, categories): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - expected_inv = [['b'], ['c'], ['c'], ['c'], ['c']] + expected_inv = [[col] for col in ['b'] + ['infrequent'] * 4] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) # The most frequent infrequent category becomes the feature name feature_names = ohe.get_feature_names() - assert_array_equal(['x0_b', 'x0_c'], feature_names) + assert_array_equal(['x0_b', 'x0_infrequent'], feature_names) @pytest.mark.parametrize("kwargs", [ - {'max_levels': 3}, + {'max_categories': 3}, {'min_frequency': 6}, {'min_frequency': 9}, {'min_frequency': 0.24}, {'min_frequency': 0.16}, - {'max_levels': 3, 'min_frequency': 8}, - {'max_levels': 4, 'min_frequency': 6}, + {'max_categories': 3, 'min_frequency': 8}, + {'max_categories': 4, 'min_frequency': 6}, ]) def test_ohe_infrequent_three_levels(kwargs): # Test that different parameters for combine 'a', and 'd' into @@ -761,16 +790,17 @@ def test_ohe_infrequent_three_levels(kwargs): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - expected_inv = [['b'], ['a'], ['c'], ['a'], ['a']] + expected_inv = [['b'], ['infrequent'], ['c'], ['infrequent'], + ['infrequent']] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) # The most frequent infrequent category becomes the feature name feature_names = ohe.get_feature_names() - assert_array_equal(['x0_b', 'x0_c', 'x0_a'], feature_names) + assert_array_equal(['x0_b', 'x0_c', 'x0_infrequent'], feature_names) -@pytest.mark.parametrize("kwargs", [{'max_levels': 3}, +@pytest.mark.parametrize("kwargs", [{'max_categories': 3}, {'min_frequency': 4}]) def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): # 'a' is the only frequent category, all other categories are infrequent @@ -800,7 
+830,7 @@ def test_ohe_infrequent_two_levels_user_cats(): dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], sparse=False, handle_unknown='auto', - max_levels=2).fit(X_train) + max_categories=2).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[0, 1, 2]]) @@ -816,7 +846,7 @@ def test_ohe_infrequent_two_levels_user_cats(): assert_allclose(expected, X_trans) # The most frequent infrquent category is used for the inverse transform - expected_inv = [['b'], ['c'], ['c'], ['c'], ['c']] + expected_inv = [[col] for col in ['b'] + ['infrequent'] * 4] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) @@ -830,7 +860,7 @@ def test_ohe_infrequent_three_levels_user_cats(): dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'b', 'a']], sparse=False, handle_unknown='auto', - max_levels=3).fit(X_train) + max_categories=3).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[1, 3]]) @@ -846,7 +876,8 @@ def test_ohe_infrequent_three_levels_user_cats(): assert_allclose(expected, X_trans) # The most frequent infrquent category is used for the inverse transform - expected_inv = [['b'], ['a'], ['c'], ['a'], ['a']] + expected_inv = [['b'], ['infrequent'], ['c'], ['infrequent'], + ['infrequent']] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) @@ -858,7 +889,7 @@ def test_ohe_infrequent_multiple_categories(): [0, 0, 5, 1, 1, 10, 5, 5, 0], [1, 0, 1, 0, 1, 0, 1, 0, 1]] - ohe = OneHotEncoder(categories='auto', max_levels=3, + ohe = OneHotEncoder(categories='auto', max_categories=3, handle_unknown='auto') # X[:, 0] 1 and 2 is infrequent # X[:, 1] 1 and 10 are infrequent @@ -873,8 +904,8 @@ def test_ohe_infrequent_multiple_categories(): # For the first column, 1 and 2 have the same frequency. In this case, # 1 will be choosen to be the feature name because is smaller lexiconically feature_names = ohe.get_feature_names() - assert_array_equal(['x0_0', 'x0_3', 'x0_1', - 'x1_0', 'x1_5', 'x1_1', + assert_array_equal(['x0_0', 'x0_3', 'x0_infrequent', + 'x1_0', 'x1_5', 'x1_infrequent', 'x2_0', 'x2_1'], feature_names) expected = [[1, 0, 0, 1, 0, 0, 0, 1], @@ -901,12 +932,12 @@ def test_ohe_infrequent_multiple_categories(): assert_allclose(expected, X_test_trans.toarray()) X_inv = ohe.inverse_transform(X_test_trans) - expected_inv = np.array([[3, 1, None], - [1, 0, None]], dtype=object) + expected_inv = np.array([[3, 'infrequent', None], + ['infrequent', 0, None]], dtype=object) assert_array_equal(expected_inv, X_inv) # error for unknown categories - ohe = OneHotEncoder(categories='auto', max_levels=3, + ohe = OneHotEncoder(categories='auto', max_categories=3, handle_unknown='error').fit(X) with pytest.raises(ValueError, match="Found unknown categories"): ohe.transform(X_test) @@ -922,9 +953,9 @@ def test_ohe_infrequent_multiple_categories(): X_inv = ohe.inverse_transform(X_test_trans) - expected_inv = [[1, 1, 1], - [3, 1, 0]] - assert_allclose(expected_inv, X_inv) + expected_inv = np.array([['infrequent', 'infrequent', 1], + [3, 'infrequent', 0]], dtype=object) + assert_array_equal(expected_inv, X_inv) def test_ohe_infrequent_multiple_categories_dtypes(): @@ -936,7 +967,7 @@ def test_ohe_infrequent_multiple_categories_dtypes(): 'int': [5, 3, 0, 10, 10, 12, 0, 3, 5]}, columns=['str', 'int']) - ohe = OneHotEncoder(categories='auto', max_levels=3, + ohe = OneHotEncoder(categories='auto', max_categories=3, handle_unknown='auto') # X[:, 0] 'a', 'b', 'c' have the same frequency. 
'a' and 'b' will be # considered infrequent because they are greater @@ -971,11 +1002,12 @@ def test_ohe_infrequent_multiple_categories_dtypes(): assert_allclose(expected, X_test_trans.toarray()) X_inv = ohe.inverse_transform(X_test_trans) - expected_inv = np.array([['a', 0], ['f', 0]], dtype=object) + expected_inv = np.array([['infrequent', 'infrequent'], + ['f', 'infrequent']], dtype=object) assert_array_equal(expected_inv, X_inv) # error for unknown categories - ohe = OneHotEncoder(categories='auto', max_levels=3, + ohe = OneHotEncoder(categories='auto', max_categories=3, handle_unknown='error').fit(X) with pytest.raises(ValueError, match="Found unknown categories"): ohe.transform(X_test) @@ -991,7 +1023,8 @@ def test_ohe_infrequent_multiple_categories_dtypes(): assert_allclose(expected, X_test_trans) X_inv = ohe.inverse_transform(X_test_trans) - expected_inv = np.array([['c', 0], ['a', 5]], dtype=object) + expected_inv = np.array([['c', 'infrequent'], + ['infrequent', 5]], dtype=object) assert_array_equal(expected_inv, X_inv) @@ -1001,7 +1034,7 @@ def test_ohe_infrequent_user_cats_with_many_zero_counts(): X_train = np.array([['e'] * 3 + ['d']], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b', 'f', 'g']], - max_levels=3, sparse=False, + max_categories=3, sparse=False, handle_unknown='auto').fit(X_train) X_trans = ohe.transform([['c'], ['d'], ['a'], ['b'], ['e']]) @@ -1025,7 +1058,7 @@ def test_ohe_infrequent_one_level_errors(min_frequency): ohe.fit(X_train) -@pytest.mark.parametrize("kwargs", [{'min_frequency': 2, 'max_levels': 3}]) +@pytest.mark.parametrize("kwargs", [{'min_frequency': 2, 'max_categories': 3}]) def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): # All user provided categories are infrequent @@ -1041,13 +1074,13 @@ def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): # TODO: Remove when 'ignore' is deprecated in 0.25 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("kwargs, error_msg", [ - ({'max_levels': 1}, 'max_levels must be greater than 1'), - ({'max_levels': -2}, 'max_levels must be greater than 1'), + ({'max_categories': 1}, 'max_categories must be greater than 1'), + ({'max_categories': -2}, 'max_categories must be greater than 1'), ({'min_frequency': -1}, 'min_frequency must be an integer at least'), ({'min_frequency': 1.1}, 'min_frequency must be an integer at least'), - ({'max_levels': 2, 'drop': 'first', 'handle_unknown': 'error'}, + ({'max_categories': 2, 'drop': 'first', 'handle_unknown': 'error'}, "infrequent categories are not supported when drop is specified"), - ({'handle_unknown': 'ignore', 'max_levels': 2}, + ({'handle_unknown': 'ignore', 'max_categories': 2}, "infrequent categories are only supported when handle_unknown is " "'error' or 'auto'") ]) From 3a3eb5d1fe07bc64e0089b879d8eea23b1ca0b1a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 11 Feb 2020 16:28:48 -0500 Subject: [PATCH 17/92] STY Flake8 --- sklearn/preprocessing/_encoders.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a3e0e2930eded..792dd03d4cc27 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -302,7 +302,8 @@ class OneHotEncoder(_BaseEncoder): infrequent_indices_ : list of shape (n_features,) Defined when `min_frequency` or `max_categories` is set to a - non-default value. 
`infrequent_indices_[i]` is an array of indices corresponding to `categories_[i]` of the infrequent categories. + non-default value. `infrequent_indices_[i]` is an array of indices + corresponding to `categories_[i]` of the infrequent categories. `infrequent_indices_[i]` is None if the ith input feature has no infrequent categories. From e5c4eef817ea6297fc0d672f739c5d9b0ca95384 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 21 Feb 2020 13:07:20 -0500 Subject: [PATCH 18/92] CLN Address comments --- doc/modules/preprocessing.rst | 27 ++++++++++++++------------- sklearn/preprocessing/_encoders.py | 25 +++++++++++++++---------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 00a9b56a94e3f..bcbed37eaa229 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -599,21 +599,22 @@ Infrequent categories --------------------- :class:`OneHotEncoder` supports outputing a feature that combines infrequent -categories in the training data. For each input feature that has a infrequent +categories in the training data. For each input feature that has an infrequent category a new column is formed to represent it. The parameters to enable the gathering of infrequent categories are `min_frequency` and `max_categories`. -1. `min_frequency` can be a integer greater or equal to 1, or a float in -`(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality -smaller than `min_frequency * n_samples` will be considered infrequent. -If `min_frequency` is a float, categories with a cardinality smaller than this -fraction of the total number of samples will be considered infrequent. +1. `min_frequency` is either an integer greater or equal to 1, or a float in +the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a +cardinality smaller than `min_frequency * n_samples` will be considered +infrequent. If `min_frequency` is a float, categories with a cardinality smaller +than this fraction of the total number of samples will be considered infrequent. -2. `max_categories` can be `None` or any integer greater than 1. This parameter -sets an upper limit to the number of output features for each input feature. -`max_categories` includes the feature that combines infrequent categories. +2. `max_categories` is either `None` or any integer greater than 1. This +parameter sets an upper limit to the number of output features for each input +feature. `max_categories` includes the feature that combines infrequent +categories. -In the following example, the categories, `'dog', 'snake'`, are considered +In the following example, the categories, `'dog', 'snake'` are considered infrequent:: >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + @@ -638,9 +639,9 @@ feature name:: >>> enc.get_feature_names() array(['x0_cat', 'x0_rabbit', 'x0_infrequent'], dtype=object) -`min_frequency` and `max_categories` can be used together to filter out -infrequent categories. In the following example, setting `max_categories` to 2 -limits the number of output features:: +Infrequent categories can be filtered out using `min_frequency` and +`max_categories`. In the following example, we set `max_categories=2` to +limit the number of features in the output:: >>> enc = preprocessing.OneHotEncoder(min_frequency=6, max_categories=2, ... 
handle_unknown='auto').fit(X) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 792dd03d4cc27..f45488d53c8ef 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -252,25 +252,29 @@ class OneHotEncoder(_BaseEncoder): 1. If there was no infrequent category during training, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category will be - denoted as None. + denoted as `None`. 2. If there is an infrequent category during training, the unknown category will be considered infrequent. In the inverse transform, - an unknown category will be the most frequent infrequent category + 'infrequent' will be used to represent the infrequent category. If + 'infrequent' is already a category, 'infrequent_sklearn' will be + used instead. .. versionadded:: 0.23 - 'auto' was added to automatically handle unknown categories + `'auto'` was added to automatically handle unknown categories + and infrequent categories. .. deprecated:: 0.23 - 'ignore' is deprecated in favor of 'auto' + `'ignore'` is deprecated in favor of `'auto'`. This option will be + removed in 0.25. min_frequency : int or float, default=1 Specifies the categories to be considered infrequent. - 1. If int, categories with a cardinality smaller will be considered + 1. If int, categories with a smaller cardinality will be considered infrequent. - 2. If float, categories with a cardinality smaller than + 2. If float, categories with a smaller cardinality than `min_frequency * n_samples` will be considered infrequent. .. versionadded:: 0.23 @@ -377,8 +381,8 @@ def __init__(self, categories='auto', drop=None, sparse=True, def _validate_keywords(self): if self.handle_unknown not in ('error', 'ignore', 'auto'): - msg = ("handle_unknown should be either 'error', 'ignore', 'auto'" - "got {0}.".format(self.handle_unknown)) + msg = (f"handle_unknown should be either 'error', 'ignore', 'auto'" + f"got {self.handle_unknown}.") raise ValueError(msg) # If we have both dropped columns and ignored unknown # values, there will be ambiguous cells. This creates difficulties @@ -810,8 +814,9 @@ def inverse_transform(self, X): one-hot encoding), ``None`` is used to represent this category. For a given input feature, if there is an infrequent category, - 'infrequent' will be used to represent the category. If 'infrequent' - is already a category, 'infrequent_sklearn' will be used instead. + 'infrequent' will be used to represent the infrequent category. If + 'infrequent' is already a category, 'infrequent_sklearn' will be used + instead. Parameters ---------- From 78fa49585bbbf12657487ee1ae44f553532c4f2e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 10 Mar 2020 17:18:35 -0400 Subject: [PATCH 19/92] DOC Fix --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f45488d53c8ef..b9f4217f39af4 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -305,7 +305,7 @@ class OneHotEncoder(_BaseEncoder): ``drop_idx_ = None`` if all the transformed features will be retained. infrequent_indices_ : list of shape (n_features,) - Defined when `min_frequency` or `max_categories` is set to a + Defined only when `min_frequency` or `max_categories` is set to a non-default value. 
`infrequent_indices_[i]` is an array of indices corresponding to `categories_[i]` of the infrequent categories. `infrequent_indices_[i]` is None if the ith input feature has no From 0c431edd0e20a49c0994903898333c22c35b3a14 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 13 Apr 2020 15:32:37 -0400 Subject: [PATCH 20/92] MRG --- COPYING | 2 +- Makefile | 2 +- README.rst | 29 +- azure-pipelines.yml | 23 +- ...bench_hist_gradient_boosting_higgsboson.py | 77 +- build_tools/azure/install.sh | 4 + build_tools/azure/posix.yml | 2 +- build_tools/circle/build_doc.sh | 2 +- build_tools/circle/linting.sh | 2 + conftest.py | 5 + doc/about.rst | 4 +- doc/conf.py | 3 +- doc/developers/contributing.rst | 45 +- doc/developers/maintainer.rst | 8 + doc/faq.rst | 8 +- doc/glossary.rst | 79 ++- doc/inspection.rst | 4 + doc/install.rst | 3 +- doc/modules/classes.rst | 3 + doc/modules/compose.rst | 5 - doc/modules/ensemble.rst | 47 +- doc/modules/feature_selection.rst | 3 +- doc/modules/linear_model.rst | 3 +- doc/modules/naive_bayes.rst | 2 +- doc/modules/sgd.rst | 272 ++++--- doc/modules/svm.rst | 459 +++++++----- doc/modules/tree.rst | 20 +- .../scikit-learn-modern/static/css/theme.css | 38 + doc/whats_new/v0.23.rst | 62 +- ...sing.py => plot_outlier_detection_wine.py} | 99 +-- .../plot_gradient_boosting_quantile.py | 4 +- .../plot_gradient_boosting_regression.py | 146 ++-- .../ensemble/plot_monotonic_constraints.py | 70 ++ .../plot_feature_selection.py | 9 +- ...t_iterative_imputer_variants_comparison.py | 2 +- ...linear_model_coefficient_interpretation.py | 671 ++++++++++++++++++ examples/neighbors/plot_nca_illustration.py | 2 +- pyproject.toml | 7 +- sklearn/__init__.py | 4 +- sklearn/_build_utils/deprecated_modules.py | 3 +- sklearn/calibration.py | 2 +- sklearn/cluster/_bicluster.py | 3 +- sklearn/cluster/_dbscan.py | 43 +- sklearn/cluster/_k_means_elkan.pyx | 17 +- sklearn/cluster/_k_means_lloyd.pyx | 13 + sklearn/cluster/_kmeans.py | 65 +- sklearn/cluster/_optics.py | 1 - sklearn/cluster/_spectral.py | 2 +- sklearn/cluster/tests/test_bicluster.py | 1 - sklearn/compose/_column_transformer.py | 9 +- sklearn/compose/_target.py | 4 +- sklearn/covariance/_graph_lasso.py | 5 +- sklearn/covariance/_shrunk_covariance.py | 21 + sklearn/cross_decomposition/_cca.py | 4 +- sklearn/cross_decomposition/_pls.py | 16 +- sklearn/cross_decomposition/tests/test_pls.py | 1 - sklearn/datasets/_base.py | 1 - sklearn/datasets/_lfw.py | 2 +- sklearn/datasets/descr/breast_cancer.rst | 8 +- sklearn/datasets/tests/test_base.py | 1 - sklearn/decomposition/_dict_learning.py | 11 +- sklearn/decomposition/_factor_analysis.py | 6 +- sklearn/decomposition/_fastica.py | 4 +- sklearn/decomposition/_incremental_pca.py | 5 +- sklearn/decomposition/_kernel_pca.py | 13 +- sklearn/decomposition/_lda.py | 11 +- sklearn/decomposition/_nmf.py | 11 +- sklearn/decomposition/_pca.py | 75 +- sklearn/decomposition/_sparse_pca.py | 7 +- sklearn/decomposition/_truncated_svd.py | 9 +- .../decomposition/tests/test_kernel_pca.py | 13 + sklearn/decomposition/tests/test_nmf.py | 11 +- sklearn/decomposition/tests/test_pca.py | 81 ++- sklearn/discriminant_analysis.py | 2 +- sklearn/dummy.py | 6 +- sklearn/ensemble/__init__.py | 7 + sklearn/ensemble/_bagging.py | 13 +- sklearn/ensemble/_base.py | 13 +- sklearn/ensemble/_forest.py | 148 ++-- sklearn/ensemble/_gb.py | 18 +- sklearn/ensemble/_gb_losses.py | 3 +- .../_hist_gradient_boosting/binning.py | 2 +- .../_hist_gradient_boosting/common.pxd | 6 + .../gradient_boosting.py | 80 ++- 
.../_hist_gradient_boosting/grower.py | 118 ++- .../ensemble/_hist_gradient_boosting/loss.py | 6 +- .../_hist_gradient_boosting/splitting.pyx | 253 +++++-- .../tests/test_gradient_boosting.py | 36 + .../tests/test_grower.py | 6 +- .../tests/test_monotonic_contraints.py | 341 +++++++++ .../tests/test_splitting.py | 64 +- sklearn/ensemble/_iforest.py | 7 +- sklearn/ensemble/_stacking.py | 11 +- sklearn/ensemble/_voting.py | 14 +- sklearn/ensemble/_weight_boosting.py | 11 +- sklearn/ensemble/tests/test_forest.py | 10 +- .../ensemble/tests/test_gradient_boosting.py | 1 - .../enable_hist_gradient_boosting.py | 8 +- .../experimental/enable_iterative_imputer.py | 3 +- sklearn/externals/_arff.py | 4 +- sklearn/externals/joblib/__init__.py | 15 - sklearn/externals/joblib/numpy_pickle.py | 3 - sklearn/externals/setup.py | 9 - sklearn/feature_extraction/text.py | 6 +- sklearn/feature_selection/_from_model.py | 9 +- sklearn/feature_selection/_rfe.py | 11 +- .../_univariate_selection.py | 38 +- .../feature_selection/_variance_threshold.py | 1 - .../tests/test_feature_select.py | 2 +- sklearn/gaussian_process/_gpc.py | 2 +- sklearn/gaussian_process/_gpr.py | 4 +- sklearn/impute/__init__.py | 6 + sklearn/impute/_base.py | 1 - sklearn/impute/_iterative.py | 19 +- sklearn/inspection/_permutation_importance.py | 15 + .../inspection/_plot/partial_dependence.py | 2 +- sklearn/isotonic.py | 10 +- sklearn/kernel_approximation.py | 25 +- sklearn/kernel_ridge.py | 1 - sklearn/linear_model/_base.py | 2 +- sklearn/linear_model/_bayes.py | 1 - sklearn/linear_model/_coordinate_descent.py | 5 +- sklearn/linear_model/_huber.py | 1 - sklearn/linear_model/_least_angle.py | 3 +- sklearn/linear_model/_logistic.py | 17 +- sklearn/linear_model/_omp.py | 2 +- sklearn/linear_model/_ridge.py | 1 - sklearn/linear_model/_stochastic_gradient.py | 152 ++-- sklearn/linear_model/_theil_sen.py | 1 - sklearn/linear_model/tests/test_base.py | 20 +- sklearn/linear_model/tests/test_huber.py | 1 - sklearn/linear_model/tests/test_perceptron.py | 1 - sklearn/linear_model/tests/test_ransac.py | 1 - sklearn/linear_model/tests/test_sag.py | 2 +- sklearn/manifold/_isomap.py | 7 +- sklearn/manifold/_t_sne.py | 4 +- sklearn/manifold/tests/test_t_sne.py | 3 +- sklearn/metrics/_plot/confusion_matrix.py | 3 +- .../metrics/_plot/precision_recall_curve.py | 3 +- sklearn/metrics/_plot/roc_curve.py | 3 +- sklearn/metrics/cluster/_supervised.py | 3 +- sklearn/metrics/cluster/_unsupervised.py | 1 - .../cluster/tests/test_unsupervised.py | 1 - sklearn/metrics/tests/test_ranking.py | 3 +- sklearn/mixture/_base.py | 2 +- sklearn/mixture/_bayesian_mixture.py | 1 - sklearn/mixture/_gaussian_mixture.py | 1 - sklearn/model_selection/_search.py | 9 +- sklearn/model_selection/_split.py | 5 +- sklearn/model_selection/tests/test_search.py | 14 +- sklearn/model_selection/tests/test_split.py | 3 +- sklearn/multioutput.py | 11 + sklearn/naive_bayes.py | 2 +- sklearn/neighbors/_kde.py | 4 +- sklearn/neighbors/_nca.py | 3 +- sklearn/neighbors/_nearest_centroid.py | 2 +- sklearn/neighbors/tests/test_kde.py | 13 + .../neural_network/_multilayer_perceptron.py | 2 +- sklearn/pipeline.py | 106 ++- sklearn/preprocessing/_data.py | 16 +- sklearn/preprocessing/_encoders.py | 49 +- .../preprocessing/_function_transformer.py | 1 - sklearn/preprocessing/_label.py | 5 +- sklearn/preprocessing/tests/test_data.py | 23 +- sklearn/preprocessing/tests/test_encoders.py | 34 +- sklearn/semi_supervised/_label_propagation.py | 2 +- sklearn/setup.py | 2 +- sklearn/svm/_base.py | 20 +- 
sklearn/svm/_classes.py | 119 ++-- sklearn/svm/setup.py | 35 +- sklearn/svm/src/liblinear/liblinear_helper.c | 2 +- sklearn/svm/src/liblinear/linear.cpp | 76 +- sklearn/svm/src/liblinear/linear.h | 2 + sklearn/svm/src/libsvm/LIBSVM_CHANGES | 2 + sklearn/svm/src/libsvm/svm.cpp | 20 +- sklearn/svm/src/newrand/newrand.h | 68 ++ sklearn/svm/tests/test_svm.py | 23 +- sklearn/tests/test_common.py | 21 +- sklearn/tests/test_discriminant_analysis.py | 4 +- sklearn/tests/test_docstring_parameters.py | 6 +- sklearn/tests/test_multiclass.py | 2 - sklearn/tests/test_random_projection.py | 9 +- sklearn/tests/test_site_joblib.py | 16 - sklearn/tree/_classes.py | 15 + sklearn/tree/tests/test_tree.py | 1 - sklearn/utils/__init__.py | 1 + sklearn/utils/_pprint.py | 3 +- sklearn/utils/_testing.py | 2 +- sklearn/utils/deprecation.py | 1 - sklearn/utils/estimator_checks.py | 11 +- sklearn/utils/fixes.py | 130 +--- sklearn/utils/metaestimators.py | 3 + sklearn/utils/tests/test_estimator_checks.py | 2 +- sklearn/utils/tests/test_fixes.py | 12 - sklearn/utils/tests/test_random.py | 2 +- sklearn/utils/tests/test_validation.py | 19 + sklearn/utils/validation.py | 11 +- 197 files changed, 3694 insertions(+), 1574 deletions(-) rename examples/applications/{plot_outlier_detection_housing.py => plot_outlier_detection_wine.py} (60%) create mode 100644 examples/ensemble/plot_monotonic_constraints.py create mode 100644 examples/inspection/plot_linear_model_coefficient_interpretation.py create mode 100644 sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py delete mode 100644 sklearn/externals/joblib/__init__.py delete mode 100644 sklearn/externals/joblib/numpy_pickle.py delete mode 100644 sklearn/externals/setup.py create mode 100644 sklearn/svm/src/newrand/newrand.h delete mode 100644 sklearn/tests/test_site_joblib.py diff --git a/COPYING b/COPYING index 0f665f8400d08..b98af18710185 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ New BSD License -Copyright (c) 2007–2019 The scikit-learn developers. +Copyright (c) 2007–2020 The scikit-learn developers. All rights reserved. diff --git a/Makefile b/Makefile index 43fc5afe63361..b2171d06b6747 100644 --- a/Makefile +++ b/Makefile @@ -67,4 +67,4 @@ code-analysis: pylint -E -i y sklearn/ -d E1103,E0611,E1101 flake8-diff: - ./build_tools/circle/linting.sh + git diff upstream/master -u -- "*.py" | flake8 --diff diff --git a/README.rst b/README.rst index fa0b665bbc8dd..0fac479bba81a 100644 --- a/README.rst +++ b/README.rst @@ -31,12 +31,12 @@ SciPy and is distributed under the 3-Clause BSD license. The project was started in 2007 by David Cournapeau as a Google Summer of Code project, and since then many volunteers have contributed. See -the `About us `__ page +the `About us `__ page for a list of core contributors. It is currently maintained by a team of volunteers. -Website: http://scikit-learn.org +Website: https://scikit-learn.org Installation @@ -58,7 +58,8 @@ scikit-learn 0.23 and later require Python 3.6 or newer. Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and classes end with "Display") require Matplotlib (>= 2.1.1). For running the examples Matplotlib >= 2.1.1 is required. A few examples require -scikit-image >= 0.13, a few examples require pandas >= 0.18.0. +scikit-image >= 0.13, a few examples require pandas >= 0.18.0, some examples +require seaborn >= 0.9.0. 
User installation ~~~~~~~~~~~~~~~~~ @@ -72,13 +73,13 @@ or ``conda``:: conda install scikit-learn -The documentation includes more detailed `installation instructions `_. +The documentation includes more detailed `installation instructions `_. Changelog --------- -See the `changelog `__ +See the `changelog `__ for a history of notable changes to scikit-learn. Development @@ -86,7 +87,7 @@ Development We welcome new contributors of all experience levels. The scikit-learn community goals are to be helpful, welcoming, and effective. The -`Development Guide `_ +`Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. @@ -119,7 +120,7 @@ source directory (you will need to have ``pytest`` >= 3.3.0 installed):: pytest sklearn -See the web page http://scikit-learn.org/dev/developers/advanced_installation.html#testing +See the web page https://scikit-learn.org/dev/developers/advanced_installation.html#testing for more information. Random number generation can be controlled during testing by setting @@ -130,7 +131,7 @@ Submitting a Pull Request Before opening a Pull Request, have a look at the full Contributing page to make sure your code complies -with our guidelines: http://scikit-learn.org/stable/developers/index.html +with our guidelines: https://scikit-learn.org/stable/developers/index.html Project History @@ -138,7 +139,7 @@ Project History The project was started in 2007 by David Cournapeau as a Google Summer of Code project, and since then many volunteers have contributed. See -the `About us `__ page +the `About us `__ page for a list of core contributors. The project is currently maintained by a team of volunteers. @@ -152,9 +153,9 @@ Help and Support Documentation ~~~~~~~~~~~~~ -- HTML documentation (stable release): http://scikit-learn.org -- HTML documentation (development version): http://scikit-learn.org/dev/ -- FAQ: http://scikit-learn.org/stable/faq.html +- HTML documentation (stable release): https://scikit-learn.org +- HTML documentation (development version): https://scikit-learn.org/dev/ +- FAQ: https://scikit-learn.org/stable/faq.html Communication ~~~~~~~~~~~~~ @@ -162,9 +163,9 @@ Communication - Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn - IRC channel: ``#scikit-learn`` at ``webchat.freenode.net`` - Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn -- Website: http://scikit-learn.org +- Website: https://scikit-learn.org Citation ~~~~~~~~ -If you use scikit-learn in a scientific publication, we would appreciate citations: http://scikit-learn.org/stable/about.html#citing-scikit-learn +If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn diff --git a/azure-pipelines.yml b/azure-pipelines.yml index df504a4ab3bf7..196d4ca34f434 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,7 +17,10 @@ jobs: displayName: Add conda to PATH - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation - - bash: conda create --name flake8_env --yes flake8 + - bash: | + conda create --name flake8_env --yes python=3.8 + conda activate flake8_env + pip install flake8 mypy==0.770 displayName: Install flake8 - bash: | if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then @@ -25,10 +28,20 @@ jobs: echo "Skipping linting" exit 0 else - source activate flake8_env + conda activate flake8_env ./build_tools/circle/linting.sh 
fi displayName: Run linting + - bash: | + if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then + # skip linting + echo "Skipping linting" + exit 0 + else + conda activate flake8_env + mypy sklearn/ --ignore-missing-imports + fi + displayName: Run mypy - bash: | if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[scipy-dev\] ]] || \ [[ $BUILD_REASON == "Schedule" ]]; then @@ -80,6 +93,7 @@ jobs: name: Linux vmImage: ubuntu-18.04 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: # Linux environment to test that scikit-learn can be built against # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04 @@ -123,6 +137,7 @@ jobs: name: Linux32 vmImage: ubuntu-18.04 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: py36_ubuntu_atlas_32bit: DISTRIB: 'ubuntu-32' @@ -133,8 +148,9 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: macOS - vmImage: xcode9-macos10.13 + vmImage: macOS-10.14 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: pylatest_conda_mkl: DISTRIB: 'conda' @@ -168,6 +184,7 @@ jobs: name: Windows vmImage: vs2017-win2016 dependsOn: [linting] + condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) matrix: py37_conda_mkl: PYTHON_VERSION: '3.7' diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index ec75760cd39f7..2c74bb8818343 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -25,12 +25,14 @@ parser.add_argument('--learning-rate', type=float, default=1.) parser.add_argument('--subsample', type=int, default=None) parser.add_argument('--max-bins', type=int, default=255) +parser.add_argument('--no-predict', action="store_true", default=False) +parser.add_argument('--cache-loc', type=str, default='/tmp') args = parser.parse_args() HERE = os.path.dirname(__file__) URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/" "HIGGS.csv.gz") -m = Memory(location='/tmp', mmap_mode='r') +m = Memory(location=args.cache_loc, mmap_mode='r') n_leaf_nodes = args.n_leaf_nodes n_trees = args.n_trees @@ -56,6 +58,27 @@ def load_data(): return df +def fit(est, data_train, target_train, libname): + print(f"Fitting a {libname} model...") + tic = time() + est.fit(data_train, target_train) + toc = time() + print(f"fitted in {toc - tic:.3f}s") + + +def predict(est, data_test, target_test): + if args.no_predict: + return + tic = time() + predicted_test = est.predict(data_test) + predicted_proba_test = est.predict_proba(data_test) + toc = time() + roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) + acc = accuracy_score(target_test, predicted_test) + print(f"predicted in {toc - tic:.3f}s, " + f"ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + + df = load_data() target = df.values[:, 0] data = np.ascontiguousarray(df.values[:, 1:]) @@ -68,56 +91,28 @@ def load_data(): n_samples, n_features = data_train.shape print(f"Training set with {n_samples} records with {n_features} features.") -print("Fitting a sklearn model...") -tic = time() est = HistGradientBoostingClassifier(loss='binary_crossentropy', learning_rate=lr, max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, - n_iter_no_change=None, + early_stopping=False, random_state=0, verbose=1) -est.fit(data_train, target_train) -toc = time() 
-predicted_test = est.predict(data_test) -predicted_proba_test = est.predict_proba(data_test) -roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) -acc = accuracy_score(target_test, predicted_test) -print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") +fit(est, data_train, target_train, 'sklearn') +predict(est, data_test, target_test) if args.lightgbm: - print("Fitting a LightGBM model...") - tic = time() - lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') - lightgbm_est.fit(data_train, target_train) - toc = time() - predicted_test = lightgbm_est.predict(data_test) - predicted_proba_test = lightgbm_est.predict_proba(data_test) - roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) - acc = accuracy_score(target_test, predicted_test) - print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + est = get_equivalent_estimator(est, lib='lightgbm') + fit(est, data_train, target_train, 'lightgbm') + predict(est, data_test, target_test) if args.xgboost: - print("Fitting an XGBoost model...") - tic = time() - xgboost_est = get_equivalent_estimator(est, lib='xgboost') - xgboost_est.fit(data_train, target_train) - toc = time() - predicted_test = xgboost_est.predict(data_test) - predicted_proba_test = xgboost_est.predict_proba(data_test) - roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) - acc = accuracy_score(target_test, predicted_test) - print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + est = get_equivalent_estimator(est, lib='xgboost') + fit(est, data_train, target_train, 'xgboost') + predict(est, data_test, target_test) if args.catboost: - print("Fitting a Catboost model...") - tic = time() - catboost_est = get_equivalent_estimator(est, lib='catboost') - catboost_est.fit(data_train, target_train) - toc = time() - predicted_test = catboost_est.predict(data_test) - predicted_proba_test = catboost_est.predict_proba(data_test) - roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) - acc = accuracy_score(target_test, predicted_test) - print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + est = get_equivalent_estimator(est, lib='catboost') + fit(est, data_train, target_train, 'catboost') + predict(est, data_test, target_test) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 0a3ca4e034efd..d1849a940d96c 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -97,6 +97,10 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist + + # TODO: Remove pin when https://github.com/python-pillow/Pillow/issues/4518 gets fixed + python -m pip install "pillow>=4.3.0,!=7.1.0,!=7.1.1" + python -m pip install pandas matplotlib pyamg scikit-image # do not install dependencies for lightgbm since it requires scikit-learn python -m pip install lightgbm --no-deps diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 9efb0418278d2..c48e3644680bd 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -3,7 +3,7 @@ parameters: vmImage: '' matrix: [] dependsOn: [] - condition: ne(variables['Build.Reason'], 'Schedule') + condition: '' jobs: - job: ${{ parameters.name }} diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index abc823facee15..b0429e41762b1 100755 --- a/build_tools/circle/build_doc.sh +++ 
b/build_tools/circle/build_doc.sh @@ -169,7 +169,7 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python="${PYTHON_VERSION:-*}" \ cython="${CYTHON_VERSION:-*}" pytest coverage \ matplotlib="${MATPLOTLIB_VERSION:-*}" sphinx=2.1.2 pillow \ scikit-image="${SCIKIT_IMAGE_VERSION:-*}" pandas="${PANDAS_VERSION:-*}" \ - joblib memory_profiler packaging + joblib memory_profiler packaging seaborn source activate testenv pip install sphinx-gallery diff --git a/build_tools/circle/linting.sh b/build_tools/circle/linting.sh index 2b408031c2eb6..dad7ad95ce7c1 100755 --- a/build_tools/circle/linting.sh +++ b/build_tools/circle/linting.sh @@ -141,6 +141,8 @@ else check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)" check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" \ --config ./examples/.flake8 + # check code for unused imports + flake8 --exclude=sklearn/externals/ --select=F401 sklearn/ examples/ fi echo -e "No problem detected by flake8\n" diff --git a/conftest.py b/conftest.py index 17c3f4b144346..2b9e87bf9f292 100644 --- a/conftest.py +++ b/conftest.py @@ -87,6 +87,11 @@ def pytest_collection_modifyitems(config, items): def pytest_configure(config): import sys sys._is_pytest_session = True + # declare our custom markers to avoid PytestUnknownMarkWarning + config.addinivalue_line( + "markers", + "network: mark a test for execution if network available." + ) def pytest_unconfigure(config): diff --git a/doc/about.rst b/doc/about.rst index 9926f62dcc824..a6cdd54eb9201 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -13,7 +13,7 @@ this project as part of his thesis. In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent Michel of INRIA took leadership of the project and made the first public release, February the 1st 2010. Since then, several releases have appeared -following a ~3 month cycle, and a thriving international community has +following a ~ 3-month cycle, and a thriving international community has been leading the development. Governance @@ -520,7 +520,7 @@ budget of the project [#f1]_. .. rubric:: Notes -.. [#f1] Regarding the organization budget in particular, we might use some of +.. [#f1] Regarding the organization budget, in particular, we might use some of the donated funds to pay for other project expenses such as DNS, hosting or continuous integration services. diff --git a/doc/conf.py b/doc/conf.py index b7eb635b15f40..c3ab17d3e73af 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -250,6 +250,7 @@ 'matplotlib': ('https://matplotlib.org/', None), 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 'joblib': ('https://joblib.readthedocs.io/en/latest/', None), + 'seaborn': ('https://seaborn.pydata.org/', None), } v = parse(release) @@ -297,7 +298,7 @@ def __call__(self, directory): sphinx_gallery_conf = { 'doc_module': 'sklearn', 'backreferences_dir': os.path.join('modules', 'generated'), - 'show_memory': True, + 'show_memory': False, 'reference_url': { 'sklearn': None}, 'examples_dirs': ['../examples'], diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 3b2f7317ee41b..99c59ec3392c6 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -181,12 +181,12 @@ Contributing code If in doubt about duplicated work, or if you want to work on a non-trivial feature, it's recommended to first open an issue in the `issue tracker `_ - to get some feedbacks from core developers. - - One easy way to find an issue to work on is by applying the "help wanted" - label in your search. 
This lists all the issues that have been unclaimed - so far. In order to claim an issue for yourself, please comment exactly - ``take`` on it for the CI to automatically assign the issue to you. + to get some feedbacks from core developers. + + One easy way to find an issue to work on is by applying the "help wanted" + label in your search. This lists all the issues that have been unclaimed + so far. In order to claim an issue for yourself, please comment exactly + ``take`` on it for the CI to automatically assign the issue to you. How to contribute ----------------- @@ -215,7 +215,7 @@ how to set up your git repository: 4. Install the development dependencies:: - $ pip install cython pytest pytest-cov flake8 + $ pip install cython pytest pytest-cov flake8 mypy 5. Install scikit-learn in editable mode:: @@ -224,6 +224,8 @@ how to set up your git repository: for more details about advanced installation, see the :ref:`install_bleeding_edge` section. +.. _upstream: + 6. Add the ``upstream`` remote. This saves a reference to the main scikit-learn repository, which you can use to keep your repository synchronized with the latest changes:: @@ -356,13 +358,17 @@ complies with the following rules before marking a PR as ``[MRG]``. The non-regression tests should fail for the code base in the master branch and pass for the PR code. -5. **Make sure that your PR does not add PEP8 violations**. On a Unix-like - system, you can run `make flake8-diff`. `flake8 path_to_file`, would work - for any system, but please avoid reformatting parts of the file that your - pull request doesn't change, as it distracts from code review. +5. **Make sure that your PR does not add PEP8 violations**. To check the + code that you changed, you can run the following command (see + :ref:`above ` to set up the upstream remote):: + + git diff upstream/master -u -- "*.py" | flake8 --diff + + or `make flake8-diff` which should work on unix-like system. 6. Follow the :ref:`coding-guidelines`. + 7. When applicable, use the validation tools and scripts in the ``sklearn.utils`` submodule. A list of utility routines available for developers can be found in the :ref:`developers-utils` page. @@ -408,6 +414,18 @@ You can check for common programming errors with the following tools: see also :ref:`testing_coverage` +* A moderate use of type annotations is encouraged but is not mandatory. See + [mypy quickstart](https://mypy.readthedocs.io/en/latest/getting_started.html) + for an introduction, as well as [pandas contributing documentation]( + https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#type-hints) + for style guidelines. Whether you add type annotation or not:: + + mypy --ignore-missing-import sklearn + + must not produce new errors in your pull request. Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, + - when importing C or Cython modules + - on properties with decorators + Bonus points for contributions that include a performance analysis with a benchmark script and profiling output (please report on the mailing list or on the GitHub issue). @@ -553,7 +571,8 @@ the development version. 
Building the documentation requires installing some additional packages:: - pip install sphinx sphinx-gallery numpydoc matplotlib Pillow pandas scikit-image packaging + pip install sphinx sphinx-gallery numpydoc matplotlib Pillow pandas \ + scikit-image packaging seaborn To build the documentation, you need to be in the ``doc`` folder:: @@ -661,7 +680,7 @@ In general have the following in mind: 4. 1D or 2D data can be a subset of ``{array-like, ndarray, sparse matrix, dataframe}``. Note that ``array-like`` can also be a ``list``, while ``ndarray`` is explicitly only a ``numpy.ndarray``. - 5. When specifying the data type of a list, use ``of`` as a delimiter: + 5. When specifying the data type of a list, use ``of`` as a delimiter: ``list of int``. 6. When specifying the dtype of an ndarray, use e.g. ``dtype=np.int32`` after defining the shape: diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index 2a42bee301554..6fdf17ccc927f 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -289,6 +289,14 @@ submodule/subpackage of the public subpackage, e.g. ``sklearn/impute/_iterative.py``. This is needed so that pickles still work in the future when the features aren't experimental anymore +To avoid type checker (e.g. mypy) errors a direct import of experimenal +estimators should be done in the parent module, protected by the +``if typing.TYPE_CHECKING`` check. See `sklearn/ensemble/__init__.py +`_, +or `sklearn/impute/__init__.py +`_ +for an example. + Please also write basic tests following those in `test_enable_hist_gradient_boosting.py `_. diff --git a/doc/faq.rst b/doc/faq.rst index 883ac290a3f16..9f43656ef2262 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -97,7 +97,7 @@ What are the inclusion criteria for new algorithms ? ---------------------------------------------------- We only consider well-established algorithms for inclusion. A rule of thumb is -at least 3 years since publication, 200+ citations and wide use and +at least 3 years since publication, 200+ citations, and wide use and usefulness. A technique that provides a clear-cut improvement (e.g. an enhanced data structure or a more efficient approximation technique) on a widely-used method will also be considered for inclusion. @@ -123,7 +123,7 @@ Inclusion of a new algorithm speeding up an existing model is easier if: n_samples", - benchmarks clearly show a speed up. -Also note that your implementation need not be in scikit-learn to be used +Also, note that your implementation need not be in scikit-learn to be used together with scikit-learn tools. You can implement your favorite algorithm in a scikit-learn compatible way, upload it to GitHub and let us know. We will be happy to list it under :ref:`related_projects`. If you already have @@ -135,7 +135,7 @@ interested to look at `scikit-learn-contrib Why are you so selective on what algorithms you include in scikit-learn? ------------------------------------------------------------------------ -Code is maintenance cost, and we need to balance the amount of +Code comes with maintenance cost, and we need to balance the amount of code we have with the size of the team (and add to this the fact that complexity scales non linearly with the number of features). The package relies on core developers using their free time to @@ -250,7 +250,7 @@ Why do I sometime get a crash/freeze with n_jobs > 1 under OSX or Linux? 
Several scikit-learn tools such as ``GridSearchCV`` and ``cross_val_score`` rely internally on Python's `multiprocessing` module to parallelize execution -onto several Python processes by passing ``n_jobs > 1`` as argument. +onto several Python processes by passing ``n_jobs > 1`` as an argument. The problem is that Python ``multiprocessing`` does a ``fork`` system call without following it with an ``exec`` system call for performance reasons. Many diff --git a/doc/glossary.rst b/doc/glossary.rst index 70dd230c7ecd3..bfa675c50a21c 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -41,7 +41,7 @@ General Concepts contributor documentation `. The specific interfaces that constitute Scikit-learn's public API are - largely documented in :ref:`api_ref`. However we less formally consider + largely documented in :ref:`api_ref`. However, we less formally consider anything as public API if none of the identifiers required to access it begins with ``_``. We generally try to maintain :term:`backwards compatibility` for all objects in the public API. @@ -106,12 +106,12 @@ General Concepts are documented under an estimator's *Parameters* documentation. backwards compatibility - We generally try to maintain backwards compatibility (i.e. interfaces + We generally try to maintain backward compatibility (i.e. interfaces and behaviors may be extended but not changed or removed) from release to release but this comes with some exceptions: Public API only - The behaviour of objects accessed through private identifiers + The behavior of objects accessed through private identifiers (those beginning ``_``) may be changed arbitrarily between versions. As documented @@ -145,8 +145,8 @@ General Concepts assumed but not formally tested. Despite this informal contract with our users, the software is provided - as is, as stated in the licence. When a release inadvertently - introduces changes that are not backwards compatible, these are known + as is, as stated in the license. When a release inadvertently + introduces changes that are not backward compatible, these are known as software regressions. callable @@ -647,7 +647,7 @@ General Concepts first axis and a fixed, finite set of :term:`features` on the second is called rectangular. - This term excludes samples with non-vectorial structure, such as text, + This term excludes samples with non-vectorial structures, such as text, an image of arbitrary size, a time series of arbitrary length, a set of vectors, etc. The purpose of a :term:`vectorizer` is to produce rectangular forms of such data. @@ -684,7 +684,7 @@ General Concepts versions happen via a :ref:`SLEP ` and follows the decision-making process outlined in :ref:`governance`. For all votes, a proposal must have been made public and discussed before the - vote. Such proposal must be a consolidated document, in the form of a + vote. Such a proposal must be a consolidated document, in the form of a ‘Scikit-Learn Enhancement Proposal’ (SLEP), rather than a long discussion on an issue. A SLEP must be submitted as a pull-request to `enhancement proposals `_ using the @@ -881,12 +881,12 @@ Class APIs and Estimator Types In a meta-estimator's :term:`fit` method, any contained estimators should be :term:`cloned` before they are fit (although FIXME: Pipeline and FeatureUnion do not do this currently). An exception to this is - that an estimator may explicitly document that it accepts a prefitted + that an estimator may explicitly document that it accepts a pre-fitted estimator (e.g. 
using ``prefit=True`` in :class:`feature_selection.SelectFromModel`). One known issue with this - is that the prefitted estimator will lose its model if the + is that the pre-fitted estimator will lose its model if the meta-estimator is cloned. A meta-estimator should have ``fit`` called - before prediction, even if all contained estimators are prefitted. + before prediction, even if all contained estimators are pre-fitted. In cases where a meta-estimator's primary behaviors (e.g. :term:`predict` or :term:`transform` implementation) are functions of @@ -1008,7 +1008,7 @@ Target Types binary A classification problem consisting of two classes. A binary target - may represented as for a :term:`multiclass` problem but with only two + may be represented as for a :term:`multiclass` problem but with only two labels. A binary decision function is represented as a 1d array. Semantically, one class is often considered the "positive" class. @@ -1028,7 +1028,7 @@ Target Types continuous A regression problem where each sample's target is a finite floating - point number, represented as a 1-dimensional array of floats (or + point number represented as a 1-dimensional array of floats (or sometimes ints). :func:`~utils.multiclass.type_of_target` will return 'continuous' for @@ -1078,7 +1078,7 @@ Target Types A classification problem where each sample's target consists of ``n_outputs`` :term:`outputs`, each a class label, for a fixed int ``n_outputs > 1`` in a particular dataset. Each output has a - fixed set of available classes, and each sample is labelled with a + fixed set of available classes, and each sample is labeled with a class for each output. An output may be binary or multiclass, and in the case where all outputs are binary, the target is :term:`multilabel`. @@ -1213,10 +1213,10 @@ Methods and ``transform`` separately would be less efficient than together. :class:`base.TransformerMixin` provides a default implementation, providing a consistent interface across transformers where - ``fit_transform`` is or is not specialised. + ``fit_transform`` is or is not specialized. In :term:`inductive` learning -- where the goal is to learn a - generalised model that can be applied to new data -- users should be + generalized model that can be applied to new data -- users should be careful not to apply ``fit_transform`` to the entirety of a dataset (i.e. training and test data together) before further modelling, as this results in :term:`data leakage`. @@ -1225,7 +1225,7 @@ Methods Primarily for :term:`feature extractors`, but also used for other transformers to provide string names for each column in the output of the estimator's :term:`transform` method. It outputs a list of - strings, and may take a list of strings as input, corresponding + strings and may take a list of strings as input, corresponding to the names of input columns from which output column names can be generated. By default input features are named x0, x1, .... @@ -1250,7 +1250,7 @@ Methods ``partial_fit`` Facilitates fitting an estimator in an online fashion. Unlike ``fit``, repeatedly calling ``partial_fit`` does not clear the model, but - updates it with respect to the data provided. The portion of data + updates it with the data provided. The portion of data provided to ``partial_fit`` may be called a mini-batch. Each mini-batch must be of consistent shape, etc. In iterative estimators, ``partial_fit`` often only performs a single iteration. @@ -1322,7 +1322,7 @@ Methods to facilitate numerical stability. 
``predict_proba`` - A method in :term:`classifiers` and :term:`clusterers` that are able to + A method in :term:`classifiers` and :term:`clusterers` that can return probability estimates for each class/cluster. Its input is usually only some observed data, :term:`X`. @@ -1381,7 +1381,7 @@ Methods In a :term:`transformer`, transforms the input, usually only :term:`X`, into some transformed space (conventionally notated as :term:`Xt`). Output is an array or sparse matrix of length :term:`n_samples` and - with number of columns fixed after :term:`fitting`. + with the number of columns fixed after :term:`fitting`. If the estimator was not already :term:`fitted`, calling this method should raise a :class:`exceptions.NotFittedError`. @@ -1405,8 +1405,8 @@ functions or non-estimator constructors. :term:`multioutput` (including :term:`multilabel`) tasks, the weights are multiplied across outputs (i.e. columns of ``y``). - By default all samples have equal weight such that classes are - effectively weighted by their their prevalence in the training data. + By default, all samples have equal weight such that classes are + effectively weighted by their prevalence in the training data. This could be achieved explicitly with ``class_weight={label1: 1, label2: 1, ...}`` for all class labels. @@ -1541,16 +1541,20 @@ functions or non-estimator constructors. mean that randomization is always used, as it may be dependent on another parameter, e.g. ``shuffle``, being set. - ``random_state``'s value may be: + The passed value will have an effect on the reproducibility of the + results returned by the function (:term:`fit`, :term:`split`, or any + other function like :func:`~sklearn.cluster.k_means`). `random_state`'s + value may be: None (default) - Use the global random state from :mod:`numpy.random`. + Use the global random state instance from :mod:`numpy.random`. + Calling the function multiple times will reuse + the same instance, and will produce different results. An integer Use a new random number generator seeded by the given integer. - To make a randomized algorithm deterministic (i.e. running it - multiple times will produce the same result), an arbitrary - integer ``random_state`` can be used. However, it may be + Using an int will produce the same results across different calls. + However, it may be worthwhile checking that your results are stable across a number of different distinct random seeds. Popular integer random seeds are 0 and `42 @@ -1558,9 +1562,9 @@ functions or non-estimator constructors. A :class:`numpy.random.RandomState` instance Use the provided random state, only affecting other users - of the same random state instance. Calling fit multiple times - will reuse the same instance, and will produce different - results. + of that same random state instance. Calling the function + multiple times will reuse the same instance, and + will produce different results. :func:`utils.check_random_state` is used internally to validate the input ``random_state`` and return a :class:`~numpy.random.RandomState` @@ -1577,10 +1581,11 @@ functions or non-estimator constructors. in the User Guide. Where multiple metrics can be evaluated, ``scoring`` may be given - either as a list of unique strings or a dict with names as keys and - callables as values. Note that this does *not* specify which score - function is to be maximised, and another parameter such as ``refit`` - may be used for this purpose. + either as a list of unique strings or a dictionary with names as keys + and callables as values. 
Note that this does *not* specify which score + function is to be maximized, and another parameter such as ``refit`` + maybe used for this purpose. + The ``scoring`` parameter is validated and interpreted using :func:`metrics.check_scoring`. @@ -1600,9 +1605,9 @@ functions or non-estimator constructors. When fitting an estimator repeatedly on the same dataset, but for multiple parameter values (such as to find the value maximizing performance as in :ref:`grid search `), it may be possible - to reuse aspects of the model learnt from the previous parameter value, + to reuse aspects of the model learned from the previous parameter value, saving time. When ``warm_start`` is true, the existing :term:`fitted` - model :term:`attributes` are used to initialise the new model + model :term:`attributes` are used to initialize the new model in a subsequent call to :term:`fit`. Note that this is only applicable for some models and some @@ -1697,8 +1702,8 @@ See concept :term:`sample property`. .. glossary:: ``groups`` - Used in cross validation routines to identify samples which are - correlated. Each value is an identifier such that, in a supporting + Used in cross-validation routines to identify samples that are correlated. + Each value is an identifier such that, in a supporting :term:`CV splitter`, samples from some ``groups`` value may not appear in both a training set and its corresponding test set. See :ref:`group_cv`. diff --git a/doc/inspection.rst b/doc/inspection.rst index b53aeb436b4cd..1304a1030abb9 100644 --- a/doc/inspection.rst +++ b/doc/inspection.rst @@ -17,6 +17,10 @@ predictions from a model and what affects them. This can be used to evaluate assumptions and biases of a model, design a better model, or to diagnose issues with model performance. +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` + .. toctree:: modules/partial_dependence diff --git a/doc/install.rst b/doc/install.rst index 6a2b83605c1a6..9f8c277577a3c 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -134,7 +134,8 @@ it as ``scikit-learn[alldeps]``. Scikit-learn plotting capabilities (i.e., functions start with "plot\_" and classes end with "Display") require Matplotlib (>= 2.1.1). For running the examples Matplotlib >= 2.1.1 is required. A few examples require -scikit-image >= 0.13, a few examples require pandas >= 0.18.0. +scikit-image >= 0.13, a few examples require pandas >= 0.18.0, some examples +require seaborn >= 0.9.0. .. warning:: diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c138f51f6c06f..3d9924638b69b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1346,6 +1346,9 @@ Model validation :no-members: :no-inherited-members: +**User guide:** See the :ref:`combining_estimators` section for further +details. + .. currentmodule:: sklearn .. autosummary:: diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 8145cdef984bb..51a933dcbee47 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -383,11 +383,6 @@ and ignored by setting to ``'drop'``:: ColumnTransformer for heterogeneous data ======================================== -.. warning:: - - The :class:`compose.ColumnTransformer ` - class is experimental and the API is subject to change. - Many datasets contain features of different types, say text, floats, and dates, where each type of feature requires separate preprocessing or feature extraction steps. 
Often it is easiest to preprocess data before applying diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index b7c0e49f9c477..1416b9d3a6045 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -897,7 +897,7 @@ based on permutation of the features. Histogram-Based Gradient Boosting ================================= -Scikit-learn 0.21 introduces two new experimental implementations of +Scikit-learn 0.21 introduced two new experimental implementations of gradient boosting trees, namely :class:`HistGradientBoostingClassifier` and :class:`HistGradientBoostingRegressor`, inspired by `LightGBM `__ (See [LightGBM]_). @@ -1050,6 +1050,51 @@ multiplying the gradients (and the hessians) by the sample weights. Note that the binning stage (specifically the quantiles computation) does not take the weights into account. +.. _monotonic_cst_gbdt: + +Monotonic Constraints +--------------------- + +Depending on the problem at hand, you may have prior knowledge indicating +that a given feature should in general have a positive (or negative) effect +on the target value. For example, all else being equal, a higher credit +score should increase the probability of getting approved for a loan. +Monotonic constraints allow you to incorporate such prior knowledge into the +model. + +A positive monotonic constraint is a constraint of the form: + +:math:`x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2)`, +where :math:`F` is the predictor with two features. + +Similarly, a negative monotonic constraint is of the form: + +:math:`x_1 \leq x_1' \implies F(x_1, x_2) \geq F(x_1', x_2)`. + +Note that monotonic constraints only constraint the output "all else being +equal". Indeed, the following relation **is not enforced** by a positive +constraint: :math:`x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2')`. + +You can specify a monotonic constraint on each feature using the +`monotonic_cst` parameter. For each feature, a value of 0 indicates no +constraint, while -1 and 1 indicate a negative and positive constraint, +respectively:: + + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> from sklearn.ensemble import HistGradientBoostingRegressor + + ... # positive, negative, and no constraint on the 3 features + >>> gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0]) + +In a binary classification context, imposing a monotonic constraint means +that the feature is supposed to have a positive / negative effect on the +probability to belong to the positive class. Monotonic constraints are not +supported for multiclass context. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py` + Low-level parallelism --------------------- diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst index 6a319cfb94336..2683a35eaf72b 100644 --- a/doc/modules/feature_selection.rst +++ b/doc/modules/feature_selection.rst @@ -152,7 +152,8 @@ The features are considered unimportant and removed, if the corresponding ``threshold`` parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. Available heuristics are "mean", "median" and float multiples of these like -"0.1*mean". +"0.1*mean". In combination with the `threshold` criteria, one can use the +`max_features` parameter to set a limit on the number of features to select. For examples on how it is to be used refer to the sections below. 
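To make the combination of ``threshold`` and ``max_features`` concrete, here is a
minimal sketch; the estimator, dataset and cut-off values are illustrative
assumptions and not part of this patch::

    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression

    X, y = load_iris(return_X_y=True)
    selector = SelectFromModel(
        LogisticRegression(max_iter=1000),
        threshold="0.5*mean",  # keep coefficients above half the mean magnitude
        max_features=2,        # but never select more than two features
    ).fit(X, y)
    print(selector.transform(X).shape)  # at most (150, 2)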
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fc5f254035a53..477baca9c4de3 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -146,7 +146,7 @@ a linear kernel. * :ref:`sphx_glr_auto_examples_linear_model_plot_ridge_path.py` * :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` Ridge Complexity ---------------- @@ -232,6 +232,7 @@ computes the coefficients along the full path of possible values. * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` * :ref:`sphx_glr_auto_examples_applications_plot_tomography_l1_reconstruction.py` + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` .. note:: **Feature selection with Lasso** diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 457ec6c630b99..b2dd4cf5a7cd3 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -16,7 +16,7 @@ vector :math:`x_1` through :math:`x_n`, : .. math:: - P(y \mid x_1, \dots, x_n) = \frac{P(y) P(x_1, \dots x_n \mid y)} + P(y \mid x_1, \dots, x_n) = \frac{P(y) P(x_1, \dots, x_n \mid y)} {P(x_1, \dots, x_n)} Using the naive conditional independence assumption that diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 620df39b4d0ba..95a5111747509 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -7,7 +7,7 @@ Stochastic Gradient Descent .. currentmodule:: sklearn.linear_model **Stochastic Gradient Descent (SGD)** is a simple yet very efficient -approach to discriminative learning of linear classifiers under +approach to fitting linear classifiers and regressors under convex loss functions such as (linear) `Support Vector Machines `_ and `Logistic Regression `_. @@ -21,6 +21,19 @@ language processing. Given that the data is sparse, the classifiers in this module easily scale to problems with more than 10^5 training examples and more than 10^5 features. +Strictly speaking, SGD is merely an optimization technique and does not +correspond to a specific family of machine learning models. It is only a +*way* to train a model. Often, an instance of :class:`SGDClassifier` or +:class:`SGDRegressor` will have an equivalent estimator in +the scikit-learn API, potentially using a different optimization technique. +For example, using `SGDClassifier(loss='log')` results in logistic regression, +i.e. a model equivalent to :class:`~sklearn.linear_model.LogisticRegression` +which is fitted via SGD instead of being fitted by one of the other solvers +in :class:`~sklearn.linear_model.LogisticRegression`. Similarly, +`SGDRegressor(loss='squared_loss', penalty='l2')` and +:class:`~sklearn.linear_model.Ridge` solve the same optimization problem, via +different means. + The advantages of Stochastic Gradient Descent are: + Efficiency. @@ -34,26 +47,31 @@ The disadvantages of Stochastic Gradient Descent include: + SGD is sensitive to feature scaling. +.. warning:: + + Make sure you permute (shuffle) your training data before fitting the model + or use ``shuffle=True`` to shuffle after each iteration (used by default). + Also, ideally, features should be standardized using e.g. + `make_pipeline(StandardScaler(), SGDClassifier())` (see :ref:`Pipelines + `). + Classification ============== -.. warning:: - - Make sure you permute (shuffle) your training data before fitting the - model or use ``shuffle=True`` to shuffle after each iteration. 
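As a rough illustration of the equivalence described above, the sketch below fits
the same penalized log-loss objective with both estimators on synthetic,
standardized data; the ``alpha``/``C`` mapping is only approximate and the two
solvers will not return numerically identical coefficients::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.preprocessing import StandardScaler

    X, y = make_classification(n_samples=2000, random_state=0)
    X = StandardScaler().fit_transform(X)

    log_reg = LogisticRegression(C=1.0).fit(X, y)
    # alpha roughly plays the role of 1 / (C * n_samples) for the L2 penalty
    sgd_log = SGDClassifier(loss="log", alpha=1.0 / len(X), max_iter=1000,
                            random_state=0).fit(X, y)

    print(log_reg.score(X, y), sgd_log.score(X, y))  # typically very close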
The class :class:`SGDClassifier` implements a plain stochastic gradient descent learning routine which supports different loss functions and -penalties for classification. +penalties for classification. Below is the decision boundary of a +:class:`SGDClassifier` trained with the hinge loss, equivalent to a linear SVM. .. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_sgd_separating_hyperplane_001.png :target: ../auto_examples/linear_model/plot_sgd_separating_hyperplane.html :align: center :scale: 75 -As other classifiers, SGD has to be fitted with two arrays: an array X -of size [n_samples, n_features] holding the training samples, and an -array Y of size [n_samples] holding the target values (class labels) +As other classifiers, SGD has to be fitted with two arrays: an array `X` +of shape (n_samples, n_features) holding the training samples, and an +array y of shape (n_samples,) holding the target values (class labels) for the training samples:: >>> from sklearn.linear_model import SGDClassifier @@ -69,13 +87,13 @@ After being fitted, the model can then be used to predict new values:: >>> clf.predict([[2., 2.]]) array([1]) -SGD fits a linear model to the training data. The member ``coef_`` holds +SGD fits a linear model to the training data. The ``coef_`` attribute holds the model parameters:: >>> clf.coef_ array([[9.9..., 9.9...]]) -Member ``intercept_`` holds the intercept (aka offset or bias):: +The ``intercept_`` attribute holds the intercept (aka offset or bias):: >>> clf.intercept_ array([-9.9...]) @@ -83,7 +101,9 @@ Member ``intercept_`` holds the intercept (aka offset or bias):: Whether or not the model should use an intercept, i.e. a biased hyperplane, is controlled by the parameter ``fit_intercept``. -To get the signed distance to the hyperplane use :meth:`SGDClassifier.decision_function`:: +The signed distance to the hyperplane (computed as the dot product between +the coefficients and the input sample, plus the intercept) is given by +:meth:`SGDClassifier.decision_function`:: >>> clf.decision_function([[2., 2.]]) array([29.6...]) @@ -94,12 +114,16 @@ parameter. :class:`SGDClassifier` supports the following loss functions: * ``loss="hinge"``: (soft-margin) linear Support Vector Machine, * ``loss="modified_huber"``: smoothed hinge loss, * ``loss="log"``: logistic regression, - * and all regression losses below. + * and all regression losses below. In this case the target is encoded as -1 + or 1, and the problem is treated as a regression problem. The predicted + class then correspond to the sign of the predicted target. +Please refer to the :ref:`mathematical section below +` for formulas. The first two loss functions are lazy, they only update the model parameters if an example violates the margin constraint, which makes -training very efficient and may result in sparser models, even when L2 penalty -is used. +training very efficient and may result in sparser models (i.e. with more zero +coefficents), even when L2 penalty is used. Using ``loss="log"`` or ``loss="modified_huber"`` enables the ``predict_proba`` method, which gives a vector of probability estimates @@ -118,7 +142,7 @@ SGD supports the following penalties: ``(1 - l1_ratio) * L2 + l1_ratio * L1``. The default setting is ``penalty="l2"``. The L1 penalty leads to sparse -solutions, driving most coefficients to zero. The Elastic Net solves +solutions, driving most coefficients to zero. The Elastic Net [#5]_ solves some deficiencies of the L1 penalty in the presence of highly correlated attributes. 
The parameter ``l1_ratio`` controls the convex combination of L1 and L2 penalty. @@ -139,8 +163,8 @@ the decision surface induced by the three classifiers. :scale: 75 In the case of multi-class classification ``coef_`` is a two-dimensional -array of ``shape=[n_classes, n_features]`` and ``intercept_`` is a -one-dimensional array of ``shape=[n_classes]``. The i-th row of ``coef_`` holds +array of shape (n_classes, n_features) and ``intercept_`` is a +one-dimensional array of shape (n_classes,). The i-th row of ``coef_`` holds the weight vector of the OVA classifier for the i-th class; classes are indexed in ascending order (see attribute ``classes_``). Note that, in principle, since they allow to create a probability model, @@ -152,23 +176,27 @@ instances via the fit parameters ``class_weight`` and ``sample_weight``. See the examples below and the docstring of :meth:`SGDClassifier.fit` for further information. +:class:`SGDClassifier` supports averaged SGD (ASGD) [#4]_. Averaging can be +enabled by setting `average=True`. ASGD performs the same updates as the +regular SGD (see :ref:`sgd_mathematical_formulation`), but instead of using +the last value of the coefficients as the `coef_` attribute (i.e. the values +of the last update), `coef_` is set instead to the **average** value of the +coefficients across all updates. The same is done for the `intercept_` +attribute. When using ASGD the learning rate can be larger and even constant, +leading on some datasets to a speed up in training time. + +For classification with a logistic loss, another variant of SGD with an +averaging strategy is available with Stochastic Average Gradient (SAG) +algorithm, available as a solver in :class:`LogisticRegression`. + .. topic:: Examples: - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_separating_hyperplane.py`, - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_iris.py` - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_weighted_samples.py` - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_comparison.py` - - :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` (See the `Note`) - -:class:`SGDClassifier` supports averaged SGD (ASGD). Averaging can be enabled -by setting ```average=True```. ASGD works by averaging the coefficients -of the plain SGD over each iteration over a sample. When using ASGD -the learning rate can be larger and even constant leading on some -datasets to a speed up in training time. - -For classification with a logistic loss, another variant of SGD with an -averaging strategy is available with Stochastic Average Gradient (SAG) -algorithm, available as a solver in :class:`LogisticRegression`. + - :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` + (See the Note in the example) Regression ========== @@ -187,13 +215,18 @@ parameter. :class:`SGDRegressor` supports the following loss functions: * ``loss="huber"``: Huber loss for robust regression, * ``loss="epsilon_insensitive"``: linear Support Vector Regression. +Please refer to the :ref:`mathematical section below +` for formulas. The Huber and epsilon-insensitive loss functions can be used for robust regression. The width of the insensitive region has to be specified via the parameter ``epsilon``. This parameter depends on the scale of the target variables. -:class:`SGDRegressor` supports averaged SGD as :class:`SGDClassifier`. -Averaging can be enabled by setting ```average=True```. 
+The `penalty` parameter determines the regularization to be used (see +description above in the classification section). + +:class:`SGDRegressor` also supports averaged SGD [#4]_ (here again, see +description above in the classification section). For regression with a squared loss and a l2 penalty, another variant of SGD with an averaging strategy is available with Stochastic Average @@ -204,11 +237,13 @@ Stochastic Gradient Descent for sparse data =========================================== .. note:: The sparse implementation produces slightly different results - than the dense implementation due to a shrunk learning rate for the - intercept. + from the dense implementation, due to a shrunk learning rate for the + intercept. See :ref:`implementation_details`. There is built-in support for sparse data given in any matrix in a format -supported by `scipy.sparse `_. For maximum efficiency, however, use the CSR +supported by `scipy.sparse +`_. For maximum +efficiency, however, use the CSR matrix format as defined in `scipy.sparse.csr_matrix `_. @@ -236,17 +271,17 @@ criteria to stop the algorithm when a given level of convergence is reached: * With ``early_stopping=True``, the input data is split into a training set and a validation set. The model is then fitted on the training set, and the - stopping criterion is based on the prediction score computed on the - validation set. The size of the validation set can be changed with the - parameter ``validation_fraction``. + stopping criterion is based on the prediction score (using the `score` + method) computed on the validation set. The size of the validation set + can be changed with the parameter ``validation_fraction``. * With ``early_stopping=False``, the model is fitted on the entire input data and the stopping criterion is based on the objective function computed on - the input data. + the training data. In both cases, the criterion is evaluated once by epoch, and the algorithm stops when the criterion does not improve ``n_iter_no_change`` times in a row. The -improvement is evaluated with a tolerance ``tol``, and the algorithm stops in -any case after a maximum number of iteration ``max_iter``. +improvement is evaluated with absolute tolerance ``tol``, and the algorithm +stops in any case after a maximum number of iteration ``max_iter``. Tips on Practical Use @@ -265,15 +300,23 @@ Tips on Practical Use X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) # apply same transformation to test data + # Or better yet: use a pipeline! + from sklearn.pipeline import make_pipeline + est = make_pipeline(StandardScaler(), SGDClassifier()) + est.fit(X_train) + est.predict(X_test) + If your attributes have an intrinsic scale (e.g. word frequencies or indicator features) scaling is not needed. * Finding a reasonable regularization term :math:`\alpha` is - best done using :class:`GridSearchCV`, usually in the + best done using automatic hyper-parameter search, e.g. + :class:`~sklearn.model_selection.GridSearchCV` or + :class:`~sklearn.model_selection.RandomizedSearchCV`, usually in the range ``10.0**-np.arange(1,7)``. * Empirically, we found that SGD converges after observing - approx. 10^6 training samples. Thus, a reasonable first guess + approximately 10^6 training samples. Thus, a reasonable first guess for the number of iterations is ``max_iter = np.ceil(10**6 / n)``, where ``n`` is the size of the training set. 
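A minimal sketch pulling these tips together (pipeline scaling, early stopping,
and a search over regularization strengths); the data and settings are
hypothetical and exact scores will vary::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X, y = make_classification(n_samples=5000, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # scale inside a pipeline; stop early on a held-out validation fraction
    sgd = SGDClassifier(early_stopping=True, validation_fraction=0.1,
                        n_iter_no_change=5, tol=1e-3, random_state=0)
    pipe = make_pipeline(StandardScaler(), sgd)

    # search regularization strengths in the suggested range
    search = GridSearchCV(pipe, {"sgdclassifier__alpha": 10.0 ** -np.arange(1, 7)})
    search.fit(X_train, y_train)
    print(search.best_params_, search.score(X_test, y_test))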
@@ -295,13 +338,16 @@ Tips on Practical Use Mathematical formulation ======================== +We describe here the mathematical details of the SGD procedure. A good +overview with convergence rates can be found in [#6]_. + Given a set of training examples :math:`(x_1, y_1), \ldots, (x_n, y_n)` where -:math:`x_i \in \mathbf{R}^m` and :math:`y_i \in \{-1,1\}`, our goal is to -learn a linear scoring function :math:`f(x) = w^T x + b` with model parameters -:math:`w \in \mathbf{R}^m` and intercept :math:`b \in \mathbf{R}`. In order -to make predictions, we simply look at the sign of :math:`f(x)`. -A common choice to find the model parameters is by minimizing the regularized -training error given by +:math:`x_i \in \mathbf{R}^m` and :math:`y_i \in \mathcal{R}` (:math:`y_i \in +{-1, 1}` for classification), our goal is to learn a linear scoring function +:math:`f(x) = w^T x + b` with model parameters :math:`w \in \mathbf{R}^m` and +intercept :math:`b \in \mathbf{R}`. In order to make predictions for binary +classification, we simply look at the sign of :math:`f(x)`. To find the model +parameters, we minimize the regularized training error given by .. math:: @@ -309,14 +355,29 @@ training error given by where :math:`L` is a loss function that measures model (mis)fit and :math:`R` is a regularization term (aka penalty) that penalizes model -complexity; :math:`\alpha > 0` is a non-negative hyperparameter. - -Different choices for :math:`L` entail different classifiers such as - - - Hinge: (soft-margin) Support Vector Machines. - - Log: Logistic Regression. - - Least-Squares: Ridge Regression. - - Epsilon-Insensitive: (soft-margin) Support Vector Regression. +complexity; :math:`\alpha > 0` is a non-negative hyperparameter that controls +the regularization stength. + +Different choices for :math:`L` entail different classifiers or regressors: + +- Hinge (soft-margin): equivalent to Support Vector Classification. + :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))`. +- Perceptron: + :math:`L(y_i, f(x_i)) = \max(0, - y_i f(x_i))`. +- Modified Huber: + :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))^2` if :math:`y_i f(x_i) > + 1`, and :math:`L(y_i, f(x_i)) = -4 y_i f(x_i)` otherwise. +- Log: equivalent to Logistic Regression. + :math:`L(y_i, f(x_i)) = \log(1 + \exp (-y_i f(x_i)))`. +- Least-Squares: Linear regression (Ridge or Lasso depending on + :math:`R`). + :math:`L(y_i, f(x_i)) = \frac{1}{2}(y_i - f(x_i))^2`. +- Huber: less sensitive to outliers than least-squares. It is equivalent to + least squares when :math:`|y_i - f(x_i)| \leq \varepsilon`, and + :math:`L(y_i, f(x_i)) = \varepsilon |y_i - f(x_i)| - \frac{1}{2} + \varepsilon^2` otherwise. +- Epsilon-Insensitive: (soft-margin) equivalent to Support Vector Regression. + :math:`L(y_i, f(x_i)) = \max(0, |y_i - f(x_i)| - \varepsilon)`. All of the above loss functions can be regarded as an upper bound on the misclassification error (Zero-one loss) as shown in the Figure below. @@ -326,15 +387,18 @@ misclassification error (Zero-one loss) as shown in the Figure below. :align: center :scale: 75 -Popular choices for the regularization term :math:`R` include: +Popular choices for the regularization term :math:`R` (the `penalty` +parameter) include: - - L2 norm: :math:`R(w) := \frac{1}{2} \sum_{i=1}^{n} w_i^2`, - - L1 norm: :math:`R(w) := \sum_{i=1}^{n} |w_i|`, which leads to sparse + - L2 norm: :math:`R(w) := \frac{1}{2} \sum_{j=1}^{m} w_j^2 = ||w||_2^2`, + - L1 norm: :math:`R(w) := \sum_{j=1}^{m} |w_j|`, which leads to sparse solutions. 
- - Elastic Net: :math:`R(w) := \frac{\rho}{2} \sum_{i=1}^{n} w_i^2 + (1-\rho) \sum_{i=1}^{n} |w_i|`, a convex combination of L2 and L1, where :math:`\rho` is given by ``1 - l1_ratio``. + - Elastic Net: :math:`R(w) := \frac{\rho}{2} \sum_{j=1}^{n} w_j^2 + + (1-\rho) \sum_{j=1}^{m} |w_j|`, a convex combination of L2 and L1, where + :math:`\rho` is given by ``1 - l1_ratio``. The Figure below shows the contours of the different regularization terms -in the parameter space when :math:`R(w) = 1`. +in a 2-dimensional parameter space (:math:`m=2`) when :math:`R(w) = 1`. .. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_sgd_penalties_001.png :target: ../auto_examples/linear_model/plot_sgd_penalties.html @@ -355,12 +419,13 @@ example updates the model parameters according to the update rule given by .. math:: - w \leftarrow w - \eta (\alpha \frac{\partial R(w)}{\partial w} - + \frac{\partial L(w^T x_i + b, y_i)}{\partial w}) + w \leftarrow w - \eta \left[\alpha \frac{\partial R(w)}{\partial w} + + \frac{\partial L(w^T x_i + b, y_i)}{\partial w}\right] where :math:`\eta` is the learning rate which controls the step-size in the parameter space. The intercept :math:`b` is updated similarly but -without regularization. +without regularization (and with additional decay for sparse matrices, as +detailed in :ref:`implementation_details`). The learning rate :math:`\eta` can be either constant or gradually decaying. For classification, the default learning rate schedule (``learning_rate='optimal'``) @@ -395,58 +460,61 @@ and use ``eta0`` to specify the starting learning rate. When the stopping criterion is reached, the learning rate is divided by 5, and the algorithm does not stop. The algorithm stops when the learning rate goes below 1e-6. -The model parameters can be accessed through the members ``coef_`` and -``intercept_``: +The model parameters can be accessed through the ``coef_`` and +``intercept_`` attributes: ``coef_`` holds the weights :math:`w` and +``intercept_`` holds :math:`b`. - - Member ``coef_`` holds the weights :math:`w` - - - Member ``intercept_`` holds :math:`b` - -.. topic:: References: - - * `"Solving large scale linear prediction problems using stochastic - gradient descent algorithms" - `_ - T. Zhang - In Proceedings of ICML '04. - - * `"Regularization and variable selection via the elastic net" - `_ - H. Zou, T. Hastie - Journal of the Royal Statistical Society Series B, - 67 (2), 301-320. - - * `"Towards Optimal One Pass Large Scale Learning with - Averaged Stochastic Gradient Descent" - `_ - Xu, Wei +When using Averaged SGD (with the `average` parameter), `coef_` is set to the +average weight across all updates: +`coef_` :math:`= \frac{1}{T} \sum_{t=0}^{T-1} w^{(t)}`, +where :math:`T` is the total number of updates, found in the `t_` attribute. +.. _implementation_details: Implementation details ====================== -The implementation of SGD is influenced by the `Stochastic Gradient SVM -`_ of Léon Bottou. Similar to SvmSGD, +The implementation of SGD is influenced by the `Stochastic Gradient SVM` of +[#1]_. +Similar to SvmSGD, the weight vector is represented as the product of a scalar and a vector which allows an efficient weight update in the case of L2 regularization. -In the case of sparse feature vectors, the intercept is updated with a +In the case of sparse input `X`, the intercept is updated with a smaller learning rate (multiplied by 0.01) to account for the fact that it is updated more frequently. 
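The per-sample update described in the mathematical formulation above boils down
to a few lines of NumPy. The sketch below covers the hinge loss with an L2
penalty and is purely illustrative; the real implementation is written in Cython
and uses the scaled-weight-vector trick mentioned here::

    import numpy as np

    def sgd_step(w, b, x_i, y_i, eta, alpha):
        """One illustrative SGD update for the L2-penalized hinge loss."""
        margin = y_i * (np.dot(w, x_i) + b)
        grad_w = alpha * w            # gradient of the L2 penalty
        grad_b = 0.0                  # the intercept is not regularized
        if margin < 1:                # hinge loss is active
            grad_w = grad_w - y_i * x_i
            grad_b = -y_i
        return w - eta * grad_w, b - eta * grad_b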
Training examples are picked up sequentially and the learning rate is lowered after each observed example. We adopted the -learning rate schedule from Shalev-Shwartz et al. 2007. +learning rate schedule from [#2]_. For multi-class classification, a "one versus all" approach is used. -We use the truncated gradient algorithm proposed by Tsuruoka et al. 2009 +We use the truncated gradient algorithm proposed in [#3]_ for L1 regularization (and the Elastic Net). The code is written in Cython. .. topic:: References: - * `"Stochastic Gradient Descent" `_ L. Bottou - Website, 2010. - - * `"The Tradeoffs of Large Scale Machine Learning" `_ L. Bottou - Website, 2011. - - * `"Pegasos: Primal estimated sub-gradient solver for svm" - `_ - S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07. - - * `"Stochastic gradient descent training for l1-regularized log-linear models with cumulative penalty" - `_ - Y. Tsuruoka, J. Tsujii, S. Ananiadou - In Proceedings of the AFNLP/ACL '09. + .. [#1] `"Stochastic Gradient Descent" + `_ L. Bottou - Website, 2010. + + .. [#2] `"Pegasos: Primal estimated sub-gradient solver for svm" + `_ + S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07. + + .. [#3] `"Stochastic gradient descent training for l1-regularized + log-linear models with cumulative penalty" + `_ + Y. Tsuruoka, J. Tsujii, S. Ananiadou - In Proceedings of the AFNLP/ACL + '09. + + .. [#4] `"Towards Optimal One Pass Large Scale Learning with + Averaged Stochastic Gradient Descent" + `_ + Xu, Wei + + .. [#5] `"Regularization and variable selection via the elastic net" + `_ + H. Zou, T. Hastie - Journal of the Royal Statistical Society Series B, + 67 (2), 301-320. + + .. [#6] `"Solving large scale linear prediction problems using stochastic + gradient descent algorithms" + `_ + T. Zhang - In Proceedings of ICML '04. diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 706a9ff559aa8..23dc7fbf67b65 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -4,6 +4,9 @@ Support Vector Machines ======================= +.. TODO: Describe tol parameter +.. TODO: Describe max_iter parameter + .. currentmodule:: sklearn.svm **Support vector machines (SVMs)** are a set of supervised learning @@ -49,7 +52,7 @@ Classification ============== :class:`SVC`, :class:`NuSVC` and :class:`LinearSVC` are classes -capable of performing multi-class classification on a dataset. +capable of performing binary and multi-class classification on a dataset. .. figure:: ../auto_examples/svm/images/sphx_glr_plot_iris_svc_001.png @@ -60,16 +63,16 @@ capable of performing multi-class classification on a dataset. :class:`SVC` and :class:`NuSVC` are similar methods, but accept slightly different sets of parameters and have different mathematical formulations (see section :ref:`svm_mathematical_formulation`). On the -other hand, :class:`LinearSVC` is another implementation of Support +other hand, :class:`LinearSVC` is another (faster) implementation of Support Vector Classification for the case of a linear kernel. Note that -:class:`LinearSVC` does not accept keyword ``kernel``, as this is -assumed to be linear. It also lacks some of the members of +:class:`LinearSVC` does not accept parameter ``kernel``, as this is +assumed to be linear. It also lacks some of the attributes of :class:`SVC` and :class:`NuSVC`, like ``support_``. 
As other classifiers, :class:`SVC`, :class:`NuSVC` and -:class:`LinearSVC` take as input two arrays: an array X of size ``[n_samples, -n_features]`` holding the training samples, and an array y of class labels -(strings or integers), size ``[n_samples]``:: +:class:`LinearSVC` take as input two arrays: an array `X` of shape +`(n_samples, n_features)` holding the training samples, and an array `y` of +class labels (strings or integers), of shape `(n_samples)`:: >>> from sklearn import svm @@ -84,10 +87,10 @@ After being fitted, the model can then be used to predict new values:: >>> clf.predict([[2., 2.]]) array([1]) -SVMs decision function depends on some subset of the training data, -called the support vectors. Some properties of these support vectors -can be found in members ``support_vectors_``, ``support_`` and -``n_support``:: +SVMs decision function (detailed in the :ref:`svm_mathematical_formulation`) +depends on some subset of the training data, called the support vectors. Some +properties of these support vectors can be found in attributes +``support_vectors_``, ``support_`` and ``n_support``:: >>> # get support vectors >>> clf.support_vectors_ @@ -100,19 +103,25 @@ can be found in members ``support_vectors_``, ``support_`` and >>> clf.n_support_ array([1, 1]...) +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane.py`, + * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` + * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py`, + .. _svm_multi_class: Multi-class classification -------------------------- -:class:`SVC` and :class:`NuSVC` implement the "one-against-one" -approach (Knerr et al., 1990) for multi- class classification. If -``n_class`` is the number of classes, then ``n_class * (n_class - 1) / 2`` +:class:`SVC` and :class:`NuSVC` implement the "one-versus-one" +approach for multi-class classification. In total, +``n_classes * (n_classes - 1) / 2`` classifiers are constructed and each one trains data from two classes. To provide a consistent interface with other classifiers, the -``decision_function_shape`` option allows to monotically transform the results of the -"one-against-one" classifiers to a decision function of shape ``(n_samples, -n_classes)``. +``decision_function_shape`` option allows to monotonically transform the +results of the "one-versus-one" classifiers to a "one-vs-rest" decision +function of shape ``(n_samples, n_classes)``. >>> X = [[0], [1], [2], [3]] >>> Y = [0, 1, 2, 3] @@ -128,8 +137,7 @@ n_classes)``. 4 On the other hand, :class:`LinearSVC` implements "one-vs-the-rest" -multi-class strategy, thus training n_class models. If there are only -two classes, only one model is trained:: +multi-class strategy, thus training `n_classes` models. >>> lin_clf = svm.LinearSVC() >>> lin_clf.fit(X, Y) @@ -142,39 +150,37 @@ See :ref:`svm_mathematical_formulation` for a complete description of the decision function. Note that the :class:`LinearSVC` also implements an alternative multi-class -strategy, the so-called multi-class SVM formulated by Crammer and Singer, by -using the option ``multi_class='crammer_singer'``. This method is consistent, -which is not true for one-vs-rest classification. -In practice, one-vs-rest classification is usually preferred, since the results -are mostly similar, but the runtime is significantly less. +strategy, the so-called multi-class SVM formulated by Crammer and Singer +[#8]_, by using the option ``multi_class='crammer_singer'``. 
In practice, +one-vs-rest classification is usually preferred, since the results are mostly +similar, but the runtime is significantly less. For "one-vs-rest" :class:`LinearSVC` the attributes ``coef_`` and ``intercept_`` -have the shape ``[n_class, n_features]`` and ``[n_class]`` respectively. -Each row of the coefficients corresponds to one of the ``n_class`` many +have the shape ``(n_classes, n_features)`` and ``(n_classes,)`` respectively. +Each row of the coefficients corresponds to one of the ``n_classes`` "one-vs-rest" classifiers and similar for the intercepts, in the order of the "one" class. -In the case of "one-vs-one" :class:`SVC`, the layout of the attributes -is a little more involved. In the case of having a linear kernel, the -attributes ``coef_`` and ``intercept_`` have the shape -``[n_class * (n_class - 1) / 2, n_features]`` and -``[n_class * (n_class - 1) / 2]`` respectively. This is similar to the -layout for :class:`LinearSVC` described above, with each row now corresponding +In the case of "one-vs-one" :class:`SVC` and :class:`NuSVC`, the layout of +the attributes is a little more involved. In the case of a linear +kernel, the attributes ``coef_`` and ``intercept_`` have the shape +``(n_classes * (n_classes - 1) / 2, n_features)`` and ``(n_classes * +(n_classes - 1) / 2)`` respectively. This is similar to the layout for +:class:`LinearSVC` described above, with each row now corresponding to a binary classifier. The order for classes 0 to n is "0 vs 1", "0 vs 2" , ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", . . . "n-1 vs n". -The shape of ``dual_coef_`` is ``[n_class-1, n_SV]`` with +The shape of ``dual_coef_`` is ``(n_classes-1, n_SV)`` with a somewhat hard to grasp layout. The columns correspond to the support vectors involved in any -of the ``n_class * (n_class - 1) / 2`` "one-vs-one" classifiers. -Each of the support vectors is used in ``n_class - 1`` classifiers. -The ``n_class - 1`` entries in each row correspond to the dual coefficients +of the ``n_classes * (n_classes - 1) / 2`` "one-vs-one" classifiers. +Each of the support vectors is used in ``n_classes - 1`` classifiers. +The ``n_classes - 1`` entries in each row correspond to the dual coefficients for these classifiers. -This might be made more clear by an example: - -Consider a three class problem with class 0 having three support vectors +This might be clearer with an example: consider a three class problem with +class 0 having three support vectors :math:`v^{0}_0, v^{1}_0, v^{2}_0` and class 1 and 2 having two support vectors :math:`v^{0}_1, v^{1}_1` and :math:`v^{0}_2, v^{1}_2` respectively. For each support vector :math:`v^{j}_i`, there are two dual coefficients. Let's call @@ -198,6 +204,9 @@ Then ``dual_coef_`` looks like this: |:math:`\alpha^{1}_{2,0}`|:math:`\alpha^{1}_{2,1}`| | +------------------------+------------------------+------------------+ +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_svm_plot_iris_svc.py`, .. _scores_probabilities: @@ -209,18 +218,29 @@ per-class scores for each sample (or a single score per sample in the binary case). When the constructor option ``probability`` is set to ``True``, class membership probability estimates (from the methods ``predict_proba`` and ``predict_log_proba``) are enabled. In the binary case, the probabilities are -calibrated using Platt scaling: logistic regression on the SVM's scores, +calibrated using Platt scaling [#1]_: logistic regression on the SVM's scores, fit by an additional cross-validation on the training data. 
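+
+A minimal sketch of enabling probability estimates (``X_train``, ``y_train``
+and ``X_test`` are placeholders for any suitable arrays)::
+
+    from sklearn import svm
+
+    clf = svm.SVC(probability=True)
+    clf.fit(X_train, y_train)
+    proba = clf.predict_proba(X_test)  # shape (n_samples, n_classes)
+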
-In the multiclass case, this is extended as per Wu et al. (2004). +In the multiclass case, this is extended as per [#2]_. + +.. note:: -Needless to say, the cross-validation involved in Platt scaling + The same probability calibration procedure is available for all estimators + via the :class:`~sklearn.calibration.CalibratedClassifierCV` (see + :ref:`calibration`). In the case of :class:`SVC` and :class:`NuSVC`, this + procedure is builtin in `libsvm`_ which is used under the hood, so it does + not rely on scikit-learn's + :class:`~sklearn.calibration.CalibratedClassifierCV`. + +The cross-validation involved in Platt scaling is an expensive operation for large datasets. -In addition, the probability estimates may be inconsistent with the scores, -in the sense that the "argmax" of the scores -may not be the argmax of the probabilities. -(E.g., in binary classification, -a sample may be labeled by ``predict`` as belonging to a class -that has probability <½ according to ``predict_proba``.) +In addition, the probability estimates may be inconsistent with the scores: + +- the "argmax" of the scores may not be the argmax of the probabilities +- in binary classification, a sample may be labeled by ``predict`` as + belonging to the positive class even if the output of `predict_proba` is + less than 0.5; and similarly, it could be labeled as negative even if the + output of `predict_proba` is more than 0.5. + Platt's method is also known to have theoretical issues. If confidence scores are required, but these do not have to be probabilities, then it is advisable to set ``probability=False`` @@ -231,35 +251,23 @@ unlike ``decision_function``, the ``predict`` method does not try to break ties by default. You can set ``break_ties=True`` for the output of ``predict`` to be the same as ``np.argmax(clf.decision_function(...), axis=1)``, otherwise the first class among the tied classes will always be returned; but have in mind -that it comes with a computational cost. - -.. figure:: ../auto_examples/svm/images/sphx_glr_plot_svm_tie_breaking_001.png - :target: ../auto_examples/svm/plot_svm_tie_breaking.html - :align: center - -.. topic:: References: - - * Wu, Lin and Weng, - `"Probability estimates for multi-class classification by pairwise coupling" - `_, - JMLR 5:975-1005, 2004. - - - * Platt - `"Probabilistic outputs for SVMs and comparisons to regularized likelihood methods" - `_. +that it comes with a computational cost. See +:ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an example on +tie breaking. Unbalanced problems -------------------- In problems where it is desired to give more importance to certain -classes or certain individual samples keywords ``class_weight`` and +classes or certain individual samples, the parameters ``class_weight`` and ``sample_weight`` can be used. -:class:`SVC` (but not :class:`NuSVC`) implement a keyword +:class:`SVC` (but not :class:`NuSVC`) implements the parameter ``class_weight`` in the ``fit`` method. It's a dictionary of the form ``{class_label : value}``, where value is a floating point number > 0 that sets the parameter ``C`` of class ``class_label`` to ``C * value``. +The figure below illustrates the decision boundary of an unbalanced problem, +with and without weight correction. .. figure:: ../auto_examples/svm/images/sphx_glr_plot_separating_hyperplane_unbalanced_001.png :target: ../auto_examples/svm/plot_separating_hyperplane_unbalanced.html @@ -269,24 +277,21 @@ that sets the parameter ``C`` of class ``class_label`` to ``C * value``. 
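+
+As an illustrative sketch (the weight value ``10`` is arbitrary), a classifier
+that penalizes mistakes on class ``1`` ten times more heavily than mistakes on
+the other classes can be obtained with::
+
+    >>> from sklearn import svm
+    >>> wclf = svm.SVC(kernel='linear', class_weight={1: 10})
+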
:class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR`, :class:`LinearSVC`, :class:`LinearSVR` and :class:`OneClassSVM` implement also weights for -individual samples in method ``fit`` through keyword ``sample_weight``. Similar -to ``class_weight``, these set the parameter ``C`` for the i-th example to -``C * sample_weight[i]``. - +individual samples in the `fit` method through the ``sample_weight`` parameter. +Similar to ``class_weight``, this sets the parameter ``C`` for the i-th +example to ``C * sample_weight[i]``, which will encourage the classifier to +get these samples right. The figure below illustrates the effect of sample +weighting on the decision boundary. The size of the circles is proportional +to the sample weights: .. figure:: ../auto_examples/svm/images/sphx_glr_plot_weighted_samples_001.png :target: ../auto_examples/svm/plot_weighted_samples.html :align: center :scale: 75 - .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_svm_plot_iris_svc.py`, - * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane.py`, * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` - * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py`, - * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` * :ref:`sphx_glr_auto_examples_svm_plot_weighted_samples.py`, @@ -303,13 +308,13 @@ above) depends only on a subset of the training data, because the cost function for building the model does not care about training points that lie beyond the margin. Analogously, the model produced by Support Vector Regression depends only on a subset of the training data, -because the cost function for building the model ignores any training -data close to the model prediction. +because the cost function ignores samples whose prediction is close to their +target. There are three different implementations of Support Vector Regression: :class:`SVR`, :class:`NuSVR` and :class:`LinearSVR`. :class:`LinearSVR` provides a faster implementation than :class:`SVR` but only considers -linear kernels, while :class:`NuSVR` implements a slightly different +the linear kernel, while :class:`NuSVR` implements a slightly different formulation than :class:`SVR` and :class:`LinearSVR`. See :ref:`svm_implementation_details` for further details. @@ -331,8 +336,6 @@ floating point values instead of integer values:: * :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` - - .. _svm_outlier_detection: Density estimation, novelty detection @@ -350,14 +353,14 @@ Support Vector Machines are powerful tools, but their compute and storage requirements increase rapidly with the number of training vectors. The core of an SVM is a quadratic programming problem (QP), separating support vectors from the rest of the training data. The QP -solver used by this `libsvm`_-based implementation scales between +solver used by the `libsvm`_-based implementation scales between :math:`O(n_{features} \times n_{samples}^2)` and :math:`O(n_{features} \times n_{samples}^3)` depending on how efficiently the `libsvm`_ cache is used in practice (dataset dependent). If the data is very sparse :math:`n_{features}` should be replaced by the average number of non-zero features in a sample vector. -Also note that for the linear case, the algorithm used in +For the linear case, the algorithm used in :class:`LinearSVC` by the `liblinear`_ implementation is much more efficient than its `libsvm`_-based :class:`SVC` counterpart and can scale almost linearly to millions of samples and/or features. 
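+
+As a minimal sketch (no particular dataset or timing implied), switching from
+the kernelized implementation to the linear one is a one-line change, keeping
+in mind that the two are not strictly equivalent (they differ e.g. in their
+multi-class strategy)::
+
+    >>> from sklearn.svm import SVC, LinearSVC
+    >>> clf_small_data = SVC(kernel='linear')  # libsvm-based QP solver
+    >>> clf_large_data = LinearSVC()           # liblinear, near-linear scaling
+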
@@ -369,16 +372,16 @@ Tips on Practical Use * **Avoiding data copy**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and :class:`NuSVR`, if the data passed to certain methods is not C-ordered - contiguous, and double precision, it will be copied before calling the + contiguous and double precision, it will be copied before calling the underlying C implementation. You can check whether a given numpy array is C-contiguous by inspecting its ``flags`` attribute. For :class:`LinearSVC` (and :class:`LogisticRegression `) any input passed as a numpy - array will be copied and converted to the liblinear internal sparse data + array will be copied and converted to the `liblinear`_ internal sparse data representation (double precision floats and int32 indices of non-zero components). If you want to fit a large-scale linear classifier without - copying a dense numpy C-contiguous double precision array as input we + copying a dense numpy C-contiguous double precision array as input, we suggest to use the :class:`SGDClassifier ` class instead. The objective function can be configured to be almost the same as the :class:`LinearSVC` @@ -390,26 +393,44 @@ Tips on Practical Use recommended to set ``cache_size`` to a higher value than the default of 200(MB), such as 500(MB) or 1000(MB). + * **Setting C**: ``C`` is ``1`` by default and it's a reasonable default - choice. If you have a lot of noisy observations you should decrease it. - It corresponds to regularize more the estimation. + choice. If you have a lot of noisy observations you should decrease it: + decreasing C corresponds to more regularization. :class:`LinearSVC` and :class:`LinearSVR` are less sensitive to ``C`` when it becomes large, and prediction results stop improving after a certain threshold. Meanwhile, larger ``C`` values will take more time to train, - sometimes up to 10 times longer, as shown by Fan et al. (2008) + sometimes up to 10 times longer, as shown in [#3]_. * Support Vector Machine algorithms are not scale invariant, so **it is highly recommended to scale your data**. For example, scale each attribute on the input vector X to [0,1] or [-1,+1], or standardize it to have mean 0 and variance 1. Note that the *same* scaling must be - applied to the test vector to obtain meaningful results. See section - :ref:`preprocessing` for more details on scaling and normalization. + applied to the test vector to obtain meaningful results. This can be done + easily by using a :class:`~sklearn.pipeline.Pipeline`:: + + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.svm import SVC + + >>> clf = make_pipeline(StandardScaler(), SVC()) + + See section :ref:`preprocessing` for more details on scaling and + normalization. + + .. _shrinking_svm: + + * Regarding the `shrinking` parameter, quoting [#4]_: *We found that if the + number of iterations is large, then shrinking can shorten the training + time. However, if we loosely solve the optimization problem (e.g., by + using a large stopping tolerance), the code without using shrinking may + be much faster* * Parameter ``nu`` in :class:`NuSVC`/:class:`OneClassSVM`/:class:`NuSVR` approximates the fraction of training errors and support vectors. - * In :class:`SVC`, if data for classification are unbalanced (e.g. many + * In :class:`SVC`, if the data is unbalanced (e.g. many positive and few negative), set ``class_weight='balanced'`` and/or try different penalty parameters ``C``. 
@@ -425,9 +446,9 @@ Tips on Practical Use The underlying :class:`LinearSVC` implementation uses a random number generator to select features when fitting the model with a dual coordinate - descent (i.e when ``dual`` is set to ``True``). It is thus not uncommon, + descent (i.e when ``dual`` is set to ``True``). It is thus not uncommon to have slightly different results for the same input data. If that - happens, try with a smaller tol parameter. This randomness can also be + happens, try with a smaller `tol` parameter. This randomness can also be controlled with the ``random_state`` parameter. When ``dual`` is set to ``False`` the underlying implementation of :class:`LinearSVC` is not random and ``random_state`` has no effect on the results. @@ -435,18 +456,11 @@ Tips on Practical Use * Using L1 penalization as provided by ``LinearSVC(loss='l2', penalty='l1', dual=False)`` yields a sparse solution, i.e. only a subset of feature weights is different from zero and contribute to the decision function. - Increasing ``C`` yields a more complex model (more feature are selected). + Increasing ``C`` yields a more complex model (more features are selected). The ``C`` value that yields a "null" model (all weights equal to zero) can be calculated using :func:`l1_min_c`. -.. topic:: References: - - * Fan, Rong-En, et al., - `"LIBLINEAR: A library for large linear classification." - `_, - Journal of machine learning research 9.Aug (2008): 1871-1874. - .. _svm_kernels: Kernel functions @@ -457,15 +471,15 @@ The *kernel function* can be any of the following: * linear: :math:`\langle x, x'\rangle`. * polynomial: :math:`(\gamma \langle x, x'\rangle + r)^d`, where - :math:`d` is specified by keyword ``degree``, :math:`r` by ``coef0``. + :math:`d` is specified by parameter ``degree``, :math:`r` by ``coef0``. * rbf: :math:`\exp(-\gamma \|x-x'\|^2)`, where :math:`\gamma` is - specified by keyword ``gamma``, must be greater than 0. + specified by parameter ``gamma``, must be greater than 0. * sigmoid :math:`\tanh(\gamma \langle x,x'\rangle + r)`, where :math:`r` is specified by ``coef0``. -Different kernels are specified by keyword kernel at initialization:: +Different kernels are specified by the `kernel` parameter:: >>> linear_svc = svm.SVC(kernel='linear') >>> linear_svc.kernel @@ -474,6 +488,26 @@ Different kernels are specified by keyword kernel at initialization:: >>> rbf_svc.kernel 'rbf' +Parameters of the RBF Kernel +---------------------------- + +When training an SVM with the *Radial Basis Function* (RBF) kernel, two +parameters must be considered: ``C`` and ``gamma``. The parameter ``C``, +common to all SVM kernels, trades off misclassification of training examples +against simplicity of the decision surface. A low ``C`` makes the decision +surface smooth, while a high ``C`` aims at classifying all training examples +correctly. ``gamma`` defines how much influence a single training example has. +The larger ``gamma`` is, the closer other examples must be to be affected. + +Proper choice of ``C`` and ``gamma`` is critical to the SVM's performance. One +is advised to use :class:`sklearn.model_selection.GridSearchCV` with +``C`` and ``gamma`` spaced exponentially far apart to choose good values. + +.. 
topic:: Examples: + + * :ref:`sphx_glr_auto_examples_svm_plot_rbf_parameters.py` + * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` + Custom Kernels -------------- @@ -495,8 +529,8 @@ classifiers, except that: Using Python functions as kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can also use your own defined kernels by passing a function to the -keyword ``kernel`` in the constructor. +You can use your own defined kernels by passing a function to the +``kernel`` parameter. Your kernel must take as arguments two matrices of shape ``(n_samples_1, n_features)``, ``(n_samples_2, n_features)`` @@ -519,77 +553,81 @@ instance that will use that kernel:: Using the Gram matrix ~~~~~~~~~~~~~~~~~~~~~ -Set ``kernel='precomputed'`` and pass the Gram matrix instead of X in the fit -method. At the moment, the kernel values between *all* training vectors and the -test vectors must be provided. +You can pass pre-computed kernels by using the ``kernel='precomputed'`` +option. You should then pass Gram matrix instead of X to the `fit` and +`predict` methods. The kernel values between *all* training vectors and the +test vectors must be provided: >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split >>> from sklearn import svm - >>> X = np.array([[0, 0], [1, 1]]) - >>> y = [0, 1] + >>> X, y = make_classification(n_samples=10, random_state=0) + >>> X_train , X_test , y_train, y_test = train_test_split(X, y, random_state=0) >>> clf = svm.SVC(kernel='precomputed') >>> # linear kernel computation - >>> gram = np.dot(X, X.T) - >>> clf.fit(gram, y) + >>> gram_train = np.dot(X_train, X_train.T) + >>> clf.fit(gram_train, y_train) SVC(kernel='precomputed') >>> # predict on training examples - >>> clf.predict(gram) - array([0, 1]) - -Parameters of the RBF Kernel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When training an SVM with the *Radial Basis Function* (RBF) kernel, two -parameters must be considered: ``C`` and ``gamma``. The parameter ``C``, -common to all SVM kernels, trades off misclassification of training examples -against simplicity of the decision surface. A low ``C`` makes the decision -surface smooth, while a high ``C`` aims at classifying all training examples -correctly. ``gamma`` defines how much influence a single training example has. -The larger ``gamma`` is, the closer other examples must be to be affected. + >>> gram_test = np.dot(X_test, X_train.T) + >>> clf.predict(gram_test) + array([0, 1, 0]) -Proper choice of ``C`` and ``gamma`` is critical to the SVM's performance. One -is advised to use :class:`sklearn.model_selection.GridSearchCV` with -``C`` and ``gamma`` spaced exponentially far apart to choose good values. - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_svm_plot_rbf_parameters.py` .. _svm_mathematical_formulation: Mathematical formulation ======================== -A support vector machine constructs a hyper-plane or set of hyper-planes -in a high or infinite dimensional space, which can be used for +A support vector machine constructs a hyper-plane or set of hyper-planes in a +high or infinite dimensional space, which can be used for classification, regression or other tasks. Intuitively, a good separation is achieved by the hyper-plane that has the largest distance to the nearest training data points of any class (so-called functional margin), since in general the larger the margin the lower the -generalization error of the classifier. - +generalization error of the classifier. 
The figure below shows the decision +function for a linearly separable problem, with three samples on the +margin boundaries, called "support vectors": .. figure:: ../auto_examples/svm/images/sphx_glr_plot_separating_hyperplane_001.png :align: center :scale: 75 +In general, when the problem isn't linearly separable, the support vectors +are the samples *within* the margin boundaries. + +We recommend [#5]_ and [#6]_ as good references for the theory and +practicalities of SVMs. + SVC --- Given training vectors :math:`x_i \in \mathbb{R}^p`, i=1,..., n, in two classes, and a -vector :math:`y \in \{1, -1\}^n`, SVC solves the following primal problem: +vector :math:`y \in \{1, -1\}^n`, our goal is to find :math:`w \in +\mathbb{R}^p` and :math:`b \in \mathbb{R}` such that the prediction given by +:math:`\text{sign} (w^T\phi(x) + b)` is correct for most samples. +SVC solves the following primal problem: .. math:: \min_ {w, b, \zeta} \frac{1}{2} w^T w + C \sum_{i=1}^{n} \zeta_i - - \textrm {subject to } & y_i (w^T \phi (x_i) + b) \geq 1 - \zeta_i,\\ & \zeta_i \geq 0, i=1, ..., n -Its dual is +Intuitively, we're trying to maximize the margin (by minimizing +:math:`||w||^2 = w^Tw`), while incurring a penalty when a sample is +misclassified or within the margin boundary. Ideally, the value :math:`y_i +(w^T \phi (x_i) + b)` would be :math:`\geq 1` for all samples, which +indicates a perfect prediction. But problems are usually not always perfectly +separable with a hyperplane, so we allow some samples to be at a distance :math:`\zeta_i` from +their correct margin boundary. The penalty term `C` controls the strengh of +this penalty, and as a result, acts as an inverse regularization parameter +(see note below). + +The dual problem to the primal is .. math:: @@ -599,16 +637,29 @@ Its dual is \textrm {subject to } & y^T \alpha = 0\\ & 0 \leq \alpha_i \leq C, i=1, ..., n -where :math:`e` is the vector of all ones, :math:`C > 0` is the upper bound, -:math:`Q` is an :math:`n` by :math:`n` positive semidefinite matrix, +where :math:`e` is the vector of all ones, +and :math:`Q` is an :math:`n` by :math:`n` positive semidefinite matrix, :math:`Q_{ij} \equiv y_i y_j K(x_i, x_j)`, where :math:`K(x_i, x_j) = \phi (x_i)^T \phi (x_j)` -is the kernel. Here training vectors are implicitly mapped into a higher -(maybe infinite) dimensional space by the function :math:`\phi`. +is the kernel. The terms :math:`\alpha_i` are called the dual coefficients, +and they are upper-bounded by :math:`C`. +This dual representation highlights the fact that training vectors are +implicitly mapped into a higher (maybe infinite) +dimensional space by the function :math:`\phi`: see `kernel trick +`_. +Once the optimization problem is solved, the output of +:term:`decision_function` for a given sample :math:`x` becomes: -The decision function is: +.. math:: \sum_{i\in SV} y_i \alpha_i K(x_i, x) + b, -.. math:: \operatorname{sgn}(\sum_{i=1}^n y_i \alpha_i K(x_i, x) + \rho) +and the predicted class correspond to its sign. We only need to sum over the +support vectors (i.e. the samples that lie within the margin) because the +dual coefficients :math:`\alpha_i` are zero for the other samples. + +These parameters can be accessed through the attributes ``dual_coef_`` +which holds the product :math:`y_i \alpha_i`, ``support_vectors_`` which +holds the support vectors, and ``intercept_`` which holds the independent +term :math:`b` .. 
note:: @@ -619,37 +670,37 @@ The decision function is: estimator used is :class:`sklearn.linear_model.Ridge ` regression, the relation between them is given as :math:`C = \frac{1}{alpha}`. -.. TODO multiclass case ?/ +LinearSVC +--------- -This parameters can be accessed through the members ``dual_coef_`` -which holds the product :math:`y_i \alpha_i`, ``support_vectors_`` which -holds the support vectors, and ``intercept_`` which holds the independent -term :math:`\rho` : - -.. topic:: References: - - * `"Automatic Capacity Tuning of Very Large VC-dimension Classifiers" - `_, - I. Guyon, B. Boser, V. Vapnik - Advances in neural information - processing 1993. +The primal problem can be equivalently formulated as +.. math:: - * `"Support-vector networks" - `_, - C. Cortes, V. Vapnik - Machine Learning, 20, 273-297 (1995). + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, y_i (w^T \phi(x_i) + b)), +where we make use of the `hinge loss +`_. This is the form that is +directly optimized by :class:`LinearSVC`, but unlike the dual form, this one +does not involve inner products between samples, so the famous kernel trick +cannot be applied. This is why only the linear kernel is supported by +:class:`LinearSVC` (:math:`\phi` is the identity function). +.. _nu_svc: NuSVC ----- -We introduce a new parameter :math:`\nu` which controls the number of -support vectors and training errors. The parameter :math:`\nu \in (0, -1]` is an upper bound on the fraction of training errors and a lower -bound of the fraction of support vectors. +The :math:`\nu`-SVC formulation [#7]_ is a reparameterization of the +:math:`C`-SVC and therefore mathematically equivalent. -It can be shown that the :math:`\nu`-SVC formulation is a reparameterization -of the :math:`C`-SVC and therefore mathematically equivalent. +We introduce a new parameter :math:`\nu` (instead of :math:`C`) which +controls the number of support vectors and *margin errors*: +:math:`\nu \in (0, 1]` is an upper bound on the fraction of margin errors and +a lower bound of the fraction of support vectors. A margin error corresponds +to a sample that lies on the wrong side of its margin boundary: it is either +misclassified, or it is correctly classified but does not lie beyond the +margin. SVR @@ -669,7 +720,12 @@ vector :math:`y \in \mathbb{R}^n` :math:`\varepsilon`-SVR solves the following p & w^T \phi (x_i) + b - y_i \leq \varepsilon + \zeta_i^*,\\ & \zeta_i, \zeta_i^* \geq 0, i=1, ..., n -Its dual is +Here, we are penalizing samples whose prediction is at least :math:`\varepsilon` +away from their true target. These samples penalize the objective by +:math:`\zeta_i` or :math:`\zeta_i^*`, depending on whether their predictions +lie above or below the :math:`\varepsilon` tube. + +The dual problem is .. math:: @@ -679,49 +735,80 @@ Its dual is \textrm {subject to } & e^T (\alpha - \alpha^*) = 0\\ & 0 \leq \alpha_i, \alpha_i^* \leq C, i=1, ..., n -where :math:`e` is the vector of all ones, :math:`C > 0` is the upper bound, +where :math:`e` is the vector of all ones, :math:`Q` is an :math:`n` by :math:`n` positive semidefinite matrix, :math:`Q_{ij} \equiv K(x_i, x_j) = \phi (x_i)^T \phi (x_j)` is the kernel. Here training vectors are implicitly mapped into a higher (maybe infinite) dimensional space by the function :math:`\phi`. -The decision function is: +The prediction is: -.. math:: \sum_{i=1}^n (\alpha_i - \alpha_i^*) K(x_i, x) + \rho +.. 
math:: \sum_{i \in SV}(\alpha_i - \alpha_i^*) K(x_i, x) + b -These parameters can be accessed through the members ``dual_coef_`` +These parameters can be accessed through the attributes ``dual_coef_`` which holds the difference :math:`\alpha_i - \alpha_i^*`, ``support_vectors_`` which holds the support vectors, and ``intercept_`` which holds the independent -term :math:`\rho` +term :math:`b` -.. topic:: References: +LinearSVR +--------- + +The primal problem can be equivalently formulated as - * `"A Tutorial on Support Vector Regression" - `_, - Alex J. Smola, Bernhard Schölkopf - Statistics and Computing archive - Volume 14 Issue 3, August 2004, p. 199-222. +.. math:: + + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, |y_i - (w^T \phi(x_i) + b)| - \varepsilon), +where we make use of the epsilon-insensitive loss, i.e. errors of less than +:math:`\varepsilon` are ignored. This is the form that is directly optimized +by :class:`LinearSVR`. .. _svm_implementation_details: Implementation details ====================== -Internally, we use `libsvm`_ and `liblinear`_ to handle all +Internally, we use `libsvm`_ [#4]_ and `liblinear`_ [#3]_ to handle all computations. These libraries are wrapped using C and Cython. +For a description of the implementation and details of the algorithms +used, please refer to their respective papers. + .. _`libsvm`: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ .. _`liblinear`: https://www.csie.ntu.edu.tw/~cjlin/liblinear/ .. topic:: References: - For a description of the implementation and details of the algorithms - used, please refer to + .. [#1] Platt `"Probabilistic outputs for SVMs and comparisons to + regularized likelihood methods" + `_. - - `LIBSVM: A Library for Support Vector Machines + .. [#2] Wu, Lin and Weng, `"Probability estimates for multi-class + classification by pairwise coupling" + `_, JMLR + 5:975-1005, 2004. + + .. [#3] Fan, Rong-En, et al., + `"LIBLINEAR: A library for large linear classification." + `_, + Journal of machine learning research 9.Aug (2008): 1871-1874. + + .. [#4] Chang and Lin, `LIBSVM: A Library for Support Vector Machines `_. - - `LIBLINEAR -- A Library for Large Linear Classification - `_. + .. [#5] Bishop, `Pattern recognition and machine learning + `_, + chapter 7 Sparse Kernel Machines + .. [#6] `"A Tutorial on Support Vector Regression" + `_, + Alex J. Smola, Bernhard Schölkopf - Statistics and Computing archive + Volume 14 Issue 3, August 2004, p. 199-222. + .. [#7] Schölkopf et. al `New Support Vector Algorithms + `_ + + .. [#8] Crammer and Singer `On the Algorithmic Implementation ofMulticlass + Kernel-based Vector Machines + `_, + JMLR 2001. diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index b6327af0ebd88..ecd037d0631ac 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -136,14 +136,14 @@ Once trained, you can plot the tree with the plot_tree function:: We can also export the tree in `Graphviz `_ format using the :func:`export_graphviz` -exporter. If you use the `conda `_ package manager, the graphviz binaries +exporter. If you use the `conda `_ package manager, the graphviz binaries -and the python package can be installed with +and the python package can be installed with conda install python-graphviz - + Alternatively binaries for graphviz can be downloaded from the graphviz project homepage, -and the Python wrapper installed from pypi with `pip install graphviz`. +and the Python wrapper installed from pypi with `pip install graphviz`. 
Below is an example graphviz export of the above tree trained on the entire iris dataset; the results are saved in an output file `iris.pdf`:: @@ -414,7 +414,7 @@ it differs in that it supports numerical target variables (regression) and does not compute rule sets. CART constructs binary trees using the feature and threshold that yield the largest information gain at each node. -scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn +scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn implementation does not support categorical variables for now. .. _ID3: https://en.wikipedia.org/wiki/ID3_algorithm @@ -500,8 +500,8 @@ If the target is a continuous value, then for node :math:`m`, representing a region :math:`R_m` with :math:`N_m` observations, common criteria to minimise as for determining locations for future splits are Mean Squared Error, which minimizes the L2 error -using mean values at terminal nodes, and Mean Absolute Error, which -minimizes the L1 error using median values at terminal nodes. +using mean values at terminal nodes, and Mean Absolute Error, which +minimizes the L1 error using median values at terminal nodes. Mean Squared Error: @@ -515,9 +515,9 @@ Mean Absolute Error: .. math:: - \bar{y}_m = \frac{1}{N_m} \sum_{i \in N_m} y_i + median(y)_m = \underset{i \in N_m}{\mathrm{median}}(y_i) - H(X_m) = \frac{1}{N_m} \sum_{i \in N_m} |y_i - \bar{y}_m| + H(X_m) = \frac{1}{N_m} \sum_{i \in N_m} |y_i - median(y)_m| where :math:`X_m` is the training data in node :math:`m` @@ -560,7 +560,7 @@ be pruned. This process stops when the pruned tree's minimal .. topic:: Examples: * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` - + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index a77fb03e36f65..2b80d6fe2b762 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -963,6 +963,44 @@ div.sphx-glr-thumbcontainer { } } +/* Pandas dataframe css */ +/* Taken from: https://github.com/spatialaudio/nbsphinx/blob/fb3ba670fc1ba5f54d4c487573dbc1b4ecf7e9ff/src/nbsphinx.py#L587-L619 */ +/* FIXME: to be removed when sphinx-gallery >= 5.0 will be released */ + +table.dataframe { + border: none !important; + border-collapse: collapse; + border-spacing: 0; + border-color: transparent; + color: black; + font-size: 12px; + table-layout: fixed; +} +table.dataframe thead { + border-bottom: 1px solid black; + vertical-align: bottom; +} +table.dataframe tr, +table.dataframe th, +table.dataframe td { + text-align: right; + vertical-align: middle; + padding: 0.5em 0.5em; + line-height: normal; + white-space: normal; + max-width: none; + border: none; +} +table.dataframe th { + font-weight: bold; +} +table.dataframe tbody tr:nth-child(odd) { + background: #f5f5f5; +} +table.dataframe tbody tr:hover { + background: rgba(66, 165, 245, 0.2); +} + /* rellinks */ .sk-btn-rellink { diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 90c99e6c04c19..4c489c1887815 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -25,6 +25,12 @@ random sampling procedures. - :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, and :class:`ensemble.IsolationForest`. 
|Fix| +- Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, + including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, + :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, + :class:`svm.SVC`, :class:`svm.SVR`, :class:`linear_model.LogisticRegression`. + |Efficiency| |Fix| + Details are listed in the changelog below. (While we are trying to better inform users by providing this information, we @@ -133,15 +139,25 @@ Changelog - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will exclusively choose the components that explain the variance greater than `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` -- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small - eigenvalues. :pr: `4441` by :user:`Lisa Schwetlick `, and - :user:`Gelavizh Ahmadi ` and - :user:`Marija Vlajic Wheeler `. + +- |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly + handles small eigenvalues, and does not infer 0 as the correct number of + components. :pr: `4441` by :user:`Lisa Schwetlick `, and + :user:`Gelavizh Ahmadi ` and :user:`Marija Vlajic Wheeler + ` and :pr:`16841` by `Nicolas Hug`_. - |Enhancement| :class:`decomposition.NMF` and :func:`decomposition.non_negative_factorization` now preserves float32 dtype. :pr:`16280` by :user:`Jeremie du Boisberranger `. +- |Fix| :class:`decomposition.KernelPCA` method ``inverse_transform`` now + applies the correct inverse transform to the transformed data. :pr:`16655` + by :user:`Lewis Ball `. + +- |Fix| Fixed bug that was causing :class:`decomposition.KernelPCA` to sometimes + raise `invalid value encountered in multiply` during `fit`. + :pr:`16718` by :user:`Gui Miotto `. + :mod:`sklearn.ensemble` ....................... @@ -176,6 +192,11 @@ Changelog samples in the training set. :pr:`14516` by :user:`Johann Faouzi `. +- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now support monotonic + constraints, useful when features are supposed to have a positive/negative + effect on the target. :pr:`15582` by `Nicolas Hug`_. + - |Fix| Fixed a bug in :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` where the attribute `estimators_samples_` did not generate the proper indices @@ -283,7 +304,7 @@ Changelog - |API| Changed the formatting of values in :meth:`metrics.ConfusionMatrixDisplay.plot` and :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' - or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and + or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and `Thomas Fan`_. - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows @@ -350,16 +371,41 @@ Changelog - |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder` will now accept value 'if_binary' and will drop the first category of - each feature with two categories. :pr:`#16245` + each feature with two categories. :pr:`16245` by :user:`Rushabh Vasani `. - |Fix| Fix a bug in :class:`preprocessing.StandardScaler` which was incorrectly computing statistics when calling `partial_fit` on sparse inputs. :pr:`16466` by :user:`Guillaume Lemaitre `. +- |Fix| Fix a bug in :class:`preprocessing.Normalizer` with norm='max', + which was not taking the absolute value of the maximum values before + normalizing the vectors. :pr:`16632` by + :user:`Maura Pintor ` and :user:`Battista Biggio `. + :mod:`sklearn.svm` .................. 
+- |Fix| |Efficiency| Improved ``libsvm`` and ``liblinear`` random number + generators used to randomly select coordinates in the coordinate descent + algorithms. Platform-dependent C ``rand()`` was used, which is only able to + generate numbers up to ``32767`` on windows platform (see this `blog + post `) and also has poor + randomization power as suggested by `this presentation + `. + It was replaced with C++11 ``mt19937``, a Mersenne Twister that correctly + generates 31bits/63bits random numbers on all platforms. In addition, the + crude "modulo" postprocessor used to get a random number in a bounded + interval was replaced by the tweaked Lemire method as suggested by `this blog + post `. + Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, + including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, + :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, + :class:`svm.SVC`, :class:`svm.SVR`, :class:`linear_model.LogisticRegression`, + is affected. In particular users can expect a better convergence when the + number of samples (LibSVM) or the number of features (LibLinear) is large. + :pr:`13511` by :user:`Sylvain Marié `. + - |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and `probB_`, are now deprecated as they were not useful. :pr:`15558` by `Thomas Fan`_. @@ -395,6 +441,10 @@ Changelog pandas sparse DataFrame. :pr:`16021` by :user:`Rushabh Vasani `. +- |Enhancement| :func:`utils.validation.check_array` now constructs a sparse + matrix from a pandas DataFrame that contains only `SparseArray`s. + :pr:`16728` by `Thomas Fan`_. + :mod:`sklearn.cluster` ...................... diff --git a/examples/applications/plot_outlier_detection_housing.py b/examples/applications/plot_outlier_detection_wine.py similarity index 60% rename from examples/applications/plot_outlier_detection_housing.py rename to examples/applications/plot_outlier_detection_wine.py index 41c697e2e2d2b..6f245b7e6c1cb 100644 --- a/examples/applications/plot_outlier_detection_housing.py +++ b/examples/applications/plot_outlier_detection_wine.py @@ -24,26 +24,13 @@ First example ------------- -The first example illustrates how robust covariance estimation can help -concentrating on a relevant cluster when another one exists. Here, many -observations are confounded into one and break down the empirical covariance -estimation. -Of course, some screening tools would have pointed out the presence of two -clusters (Support Vector Machines, Gaussian Mixture Models, univariate -outlier detection, ...). But had it been a high-dimensional example, none -of these could be applied that easily. - -Second example --------------- -The second example shows the ability of the Minimum Covariance Determinant -robust estimator of covariance to concentrate on the main mode of the data -distribution: the location seems to be well estimated, although the covariance -is hard to estimate due to the banana-shaped distribution. Anyway, we can -get rid of some outlying observations. -The One-Class SVM is able to capture the real data structure, but the -difficulty is to adjust its kernel bandwidth parameter so as to obtain -a good compromise between the shape of the data scatter matrix and the -risk of over-fitting the data. +The first example illustrates how the Minimum Covariance Determinant +robust estimator can help concentrate on a relevant cluster when outlying +points exist. Here the empirical covariance estimation is skewed by points +outside of the main cluster. 
Of course, some screening tools would have pointed +out the presence of two clusters (Support Vector Machines, Gaussian Mixture +Models, univariate outlier detection, ...). But had it been a high-dimensional +example, none of these could be applied that easily. """ print(__doc__) @@ -56,26 +43,24 @@ from sklearn.svm import OneClassSVM import matplotlib.pyplot as plt import matplotlib.font_manager -from sklearn.datasets import load_boston - -# Get data -X1 = load_boston()['data'][:, [8, 10]] # two clusters -X2 = load_boston()['data'][:, [5, 12]] # "banana"-shaped +from sklearn.datasets import load_wine # Define "classifiers" to be used classifiers = { "Empirical Covariance": EllipticEnvelope(support_fraction=1., - contamination=0.261), + contamination=0.25), "Robust Covariance (Minimum Covariance Determinant)": - EllipticEnvelope(contamination=0.261), - "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)} + EllipticEnvelope(contamination=0.25), + "OCSVM": OneClassSVM(nu=0.25, gamma=0.35)} colors = ['m', 'g', 'b'] legend1 = {} legend2 = {} +# Get data +X1 = load_wine()['data'][:, [1, 2]] # two clusters + # Learn a frontier for outlier detection with several classifiers -xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500)) -xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500)) +xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500)) for i, (clf_name, clf) in enumerate(classifiers.items()): plt.figure(1) clf.fit(X1) @@ -83,25 +68,19 @@ Z1 = Z1.reshape(xx1.shape) legend1[clf_name] = plt.contour( xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]) - plt.figure(2) - clf.fit(X2) - Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()]) - Z2 = Z2.reshape(xx2.shape) - legend2[clf_name] = plt.contour( - xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]) legend1_values_list = list(legend1.values()) legend1_keys_list = list(legend1.keys()) # Plot the results (= shape of the data points cloud) plt.figure(1) # two clusters -plt.title("Outlier detection on a real data set (boston housing)") +plt.title("Outlier detection on a real data set (wine recognition)") plt.scatter(X1[:, 0], X1[:, 1], color='black') bbox_args = dict(boxstyle="round", fc="0.8") arrow_args = dict(arrowstyle="->") -plt.annotate("several confounded points", xy=(24, 19), +plt.annotate("outlying points", xy=(4, 2), xycoords="data", textcoords="data", - xytext=(13, 10), bbox=bbox_args, arrowprops=arrow_args) + xytext=(3, 1.25), bbox=bbox_args, arrowprops=arrow_args) plt.xlim((xx1.min(), xx1.max())) plt.ylim((yy1.min(), yy1.max())) plt.legend((legend1_values_list[0].collections[0], @@ -109,15 +88,43 @@ legend1_values_list[2].collections[0]), (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]), loc="upper center", - prop=matplotlib.font_manager.FontProperties(size=12)) -plt.ylabel("accessibility to radial highways") -plt.xlabel("pupil-teacher ratio by town") + prop=matplotlib.font_manager.FontProperties(size=11)) +plt.ylabel("ash") +plt.xlabel("malic_acid") + +plt.show() + +############################################################################## +# Second example +# -------------- +# The second example shows the ability of the Minimum Covariance Determinant +# robust estimator of covariance to concentrate on the main mode of the data +# distribution: the location seems to be well estimated, although the +# covariance is hard to estimate due to the banana-shaped distribution. Anyway, +# we can get rid of some outlying observations. 
The One-Class SVM is able to +# capture the real data structure, but the difficulty is to adjust its kernel +# bandwidth parameter so as to obtain a good compromise between the shape of +# the data scatter matrix and the risk of over-fitting the data. + +# Get data +X2 = load_wine()['data'][:, [6, 9]] # "banana"-shaped + +# Learn a frontier for outlier detection with several classifiers +xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500)) +for i, (clf_name, clf) in enumerate(classifiers.items()): + plt.figure(2) + clf.fit(X2) + Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()]) + Z2 = Z2.reshape(xx2.shape) + legend2[clf_name] = plt.contour( + xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]) legend2_values_list = list(legend2.values()) legend2_keys_list = list(legend2.keys()) +# Plot the results (= shape of the data points cloud) plt.figure(2) # "banana" shape -plt.title("Outlier detection on a real data set (boston housing)") +plt.title("Outlier detection on a real data set (wine recognition)") plt.scatter(X2[:, 0], X2[:, 1], color='black') plt.xlim((xx2.min(), xx2.max())) plt.ylim((yy2.min(), yy2.max())) @@ -126,8 +133,8 @@ legend2_values_list[2].collections[0]), (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]), loc="upper center", - prop=matplotlib.font_manager.FontProperties(size=12)) -plt.ylabel("% lower status of the population") -plt.xlabel("average number of rooms per dwelling") + prop=matplotlib.font_manager.FontProperties(size=11)) +plt.ylabel("color_intensity") +plt.xlabel("flavanoids") plt.show() diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index 343bae08ef4a6..ef40a2247bcc5 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -61,7 +61,7 @@ def f(x): # Make the prediction on the meshed x-axis y_pred = clf.predict(xx) -# Plot the function, the prediction and the 90% confidence interval based on +# Plot the function, the prediction and the 95% confidence interval based on # the MSE fig = plt.figure() plt.plot(xx, f(xx), 'g:', label=r'$f(x) = x\,\sin(x)$') @@ -71,7 +71,7 @@ def f(x): plt.plot(xx, y_lower, 'k-') plt.fill(np.concatenate([xx, xx[::-1]]), np.concatenate([y_upper, y_lower[::-1]]), - alpha=.5, fc='b', ec='None', label='90% prediction interval') + alpha=.5, fc='b', ec='None', label='95% prediction interval') plt.xlabel('$x$') plt.ylabel('$f(x)$') plt.ylim(-10, 20) diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index bab88d71844d9..860bb14687534 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -3,79 +3,139 @@ Gradient Boosting regression ============================ -Demonstrate Gradient Boosting on the Boston housing dataset. +This example demonstrates Gradient Boosting to produce a predictive +model from an ensemble of weak predictive models. Gradient boosting can be used +for regression and classification problems. Here, we will train a model to +tackle a diabetes regression task. We will obtain the results from +:class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss +and 500 regression trees of depth 4. -This example fits a Gradient Boosting model with least squares loss and -500 regression trees of depth 4. 
+Note: For larger datasets (n_samples >= 10000), please refer to +:class:`sklearn.ensemble.HistGradientBoostingRegressor`. """ print(__doc__) # Author: Peter Prettenhofer +# Maria Telenczuk +# Katrina Ni # # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt - -from sklearn import ensemble -from sklearn import datasets -from sklearn.utils import shuffle +import numpy as np +from sklearn import datasets, ensemble +from sklearn.inspection import permutation_importance from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split + +############################################################################## +# Load the data +# ------------------------------------- +# +# First we need to load the data. + +diabetes = datasets.load_diabetes() +X, y = diabetes.data, diabetes.target + +############################################################################## +# Data preprocessing +# ------------------------------------- +# +# Next, we will split our dataset to use 90% for training and leave the rest +# for testing. We will also set the regression model parameters. You can play +# with these parameters to see how the results change. +# +# n_estimators : the number of boosting stages that will be performed. +# Later, we will plot deviance against boosting iterations. +# +# max_depth : limits the number of nodes in the tree. +# The best value depends on the interaction of the input variables. +# +# min_samples_split : the minimum number of samples required to split an +# internal node. +# +# learning_rate : how much the contribution of each tree will shrink. +# +# loss : loss function to optimize. The least squares function is used in this +# case however, there are many other options (see +# :class:`~sklearn.ensemble.GradientBoostingRegressor` ). -# ############################################################################# -# Load data -boston = datasets.load_boston() -X, y = shuffle(boston.data, boston.target, random_state=13) -X = X.astype(np.float32) -offset = int(X.shape[0] * 0.9) -X_train, y_train = X[:offset], y[:offset] -X_test, y_test = X[offset:], y[offset:] +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.1, random_state=13) -# ############################################################################# +params = {'n_estimators': 500, + 'max_depth': 4, + 'min_samples_split': 5, + 'learning_rate': 0.01, + 'loss': 'ls'} + +############################################################################## # Fit regression model -params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2, - 'learning_rate': 0.01, 'loss': 'ls'} -clf = ensemble.GradientBoostingRegressor(**params) +# ------------------------------------- +# +# Now we will initiate the gradient boosting regressors and fit it with our +# training data. Let's also look and the mean squared error on the test data. + +reg = ensemble.GradientBoostingRegressor(**params) +reg.fit(X_train, y_train) -clf.fit(X_train, y_train) -mse = mean_squared_error(y_test, clf.predict(X_test)) -print("MSE: %.4f" % mse) +mse = mean_squared_error(y_test, reg.predict(X_test)) +print("The mean squared error (MSE) on test set: {:.4f}".format(mse)) -# ############################################################################# +############################################################################## # Plot training deviance +# ------------------------------------- +# +# Finally, we will visualize the results. 
To do that we will first compute the +# test set deviance and then plot it against boosting iterations. -# compute test set deviance test_score = np.zeros((params['n_estimators'],), dtype=np.float64) +for i, y_pred in enumerate(reg.staged_predict(X_test)): + test_score[i] = reg.loss_(y_test, y_pred) -for i, y_pred in enumerate(clf.staged_predict(X_test)): - test_score[i] = clf.loss_(y_test, y_pred) - -plt.figure(figsize=(12, 6)) -plt.subplot(1, 2, 1) +fig = plt.figure(figsize=(6, 6)) +plt.subplot(1, 1, 1) plt.title('Deviance') -plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-', +plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-', label='Training Set Deviance') plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-', label='Test Set Deviance') plt.legend(loc='upper right') plt.xlabel('Boosting Iterations') plt.ylabel('Deviance') +fig.tight_layout() +plt.show() -# ############################################################################# -# Plot impurity-based feature importance +############################################################################## +# Plot feature importance +# ------------------------------------- +# +# Careful, impurity-based feature importances can be misleading for +# high cardinality features (many unique values). As an alternative, +# the permutation importances of ``reg`` can be computed on a +# held out test set. See :ref:`permutation_importance` for more details. # -# Warning: impurity-based feature importances can be misleading for -# high cardinality features (many unique values). See -# :func:`sklearn.inspection.permutation_importance` as an alternative. +# For this example, the impurity-based and permutation methods identify the +# same 2 strongly predictive features but not in the same order. The third most +# predictive feature, "bp", is also the same for the 2 methods. The remaining +# features are less predictive and the error bars of the permutation plot +# show that they overlap with 0. -feature_importance = clf.feature_importances_ -# make importances relative to max importance -feature_importance = 100.0 * (feature_importance / feature_importance.max()) +feature_importance = reg.feature_importances_ sorted_idx = np.argsort(feature_importance) pos = np.arange(sorted_idx.shape[0]) + .5 -plt.subplot(1, 2, 2) +fig = plt.figure(figsize=(12, 6)) +plt.subplot(1, 2, 1) plt.barh(pos, feature_importance[sorted_idx], align='center') -plt.yticks(pos, boston.feature_names[sorted_idx]) -plt.xlabel('Relative Importance') -plt.title('Variable Importance') +plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx]) +plt.title('Feature Importance (MDI)') + +result = permutation_importance(reg, X_test, y_test, n_repeats=10, + random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +plt.subplot(1, 2, 2) +plt.boxplot(result.importances[sorted_idx].T, + vert=False, labels=np.array(diabetes.feature_names)[sorted_idx]) +plt.title("Permutation Importance (test set)") +fig.tight_layout() plt.show() diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py new file mode 100644 index 0000000000000..887c2f2bbe2ed --- /dev/null +++ b/examples/ensemble/plot_monotonic_constraints.py @@ -0,0 +1,70 @@ +""" +===================== +Monotonic Constraints +===================== + +This example illustrates the effect of monotonic constraints on a gradient +boosting estimator. 
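Beyond the partial dependence plots used further down in this new example, the effect of `monotonic_cst` can also be checked numerically by sweeping the constrained feature while holding the other one fixed. The sketch below is illustrative only: the toy data and variable names are hypothetical, and it assumes the 0.23-era experimental import that the example itself uses.

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X_toy = rng.rand(1000, 2)
y_toy = 5 * X_toy[:, 0] - 5 * X_toy[:, 1] + rng.normal(scale=0.5, size=1000)

# Constrain the model to be increasing in feature 0 and decreasing in feature 1.
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1]).fit(X_toy, y_toy)

# Sweep feature 0 over its range while keeping feature 1 fixed: the
# predictions should be non-decreasing if the constraint is respected.
grid = np.linspace(0, 1, 200)
X_sweep = np.column_stack([grid, np.full_like(grid, 0.5)])
preds = gbdt.predict(X_sweep)
print("non-decreasing in feature 0:", np.all(np.diff(preds) >= 0))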
+ +We build an artificial dataset where the target value is in general +positively correlated with the first feature (with some random and +non-random variations), and in general negatively correlated with the second +feature. + +By imposing a positive (increasing) or negative (decreasing) constraint on +the features during the learning process, the estimator is able to properly +follow the general trend instead of being subject to the variations. + +This example was inspired by the `XGBoost documentation +`_. +""" +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.inspection import plot_partial_dependence +import numpy as np +import matplotlib.pyplot as plt + + +print(__doc__) + +rng = np.random.RandomState(0) + +n_samples = 5000 +f_0 = rng.rand(n_samples) # positive correlation with y +f_1 = rng.rand(n_samples) # negative correlation with y +X = np.c_[f_0, f_1] +noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) +y = (5 * f_0 + np.sin(10 * np.pi * f_0) - + 5 * f_1 - np.cos(10 * np.pi * f_1) + + noise) + +fig, ax = plt.subplots() + + +# Without any constraint +gbdt = HistGradientBoostingRegressor() +gbdt.fit(X, y) +disp = plot_partial_dependence( + gbdt, X, features=[0, 1], + line_kw={'linewidth': 4, 'label': 'unconstrained'}, + ax=ax) + +# With positive and negative constraints +gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1]) +gbdt.fit(X, y) + +plot_partial_dependence( + gbdt, X, features=[0, 1], + feature_names=('First feature\nPositive constraint', + 'Second feature\nNegtive constraint'), + line_kw={'linewidth': 4, 'label': 'constrained'}, + ax=disp.axes_) + +for f_idx in (0, 1): + disp.axes_[0, f_idx].plot(X[:, f_idx], y, 'o', alpha=.3, zorder=-1) + disp.axes_[0, f_idx].set_ylim(-6, 6) + +plt.legend() +fig.suptitle("Monotonic constraints illustration") + +plt.show() diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py index c0c849def9b3a..1e4ef6a81bba8 100644 --- a/examples/feature_selection/plot_feature_selection.py +++ b/examples/feature_selection/plot_feature_selection.py @@ -62,8 +62,7 @@ scores = -np.log10(selector.pvalues_) scores /= scores.max() plt.bar(X_indices - .45, scores, width=.2, - label=r'Univariate score ($-Log(p_{value})$)', color='darkorange', - edgecolor='black') + label=r'Univariate score ($-Log(p_{value})$)') # ############################################################################# # Compare to the weights of an SVM @@ -75,8 +74,7 @@ svm_weights = np.abs(clf[-1].coef_).sum(axis=0) svm_weights /= svm_weights.sum() -plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', - color='navy', edgecolor='black') +plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight') clf_selected = make_pipeline( SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC() @@ -89,8 +87,7 @@ svm_weights_selected /= svm_weights_selected.sum() plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, - width=.2, label='SVM weights after selection', color='c', - edgecolor='black') + width=.2, label='SVM weights after selection') plt.title("Comparing feature selection") diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 90e8e4cad1a9b..7e2fae467b7c5 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ 
b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -119,7 +119,7 @@ keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 ) -# plot boston results +# plot california housing results fig, ax = plt.subplots(figsize=(13, 6)) means = -scores.mean() errors = scores.std() diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py new file mode 100644 index 0000000000000..7583bfa0a052f --- /dev/null +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -0,0 +1,671 @@ +""" +================================================================== +Common pitfalls in interpretation of coefficients of linear models +================================================================== + +In linear models, the target value is modeled as +a linear combination of the features (see the :ref:`linear_model` User Guide +section for a description of a set of linear models available in +scikit-learn). +Coefficients in multiple linear models represent the relationship between the +given feature, :math:`X_i` and the target, :math:`y`, assuming that all the +other features remain constant (`conditional dependence +`_). +This is different from plotting :math:`X_i` versus :math:`y` and fitting a +linear relationship: in that case all possible values of the other features are +taken into account in the estimation (marginal dependence). + +This example will provide some hints in interpreting coefficient in linear +models, pointing at problems that arise when either the linear model is not +appropriate to describe the dataset, or when features are correlated. + +We will use data from the `"Current Population Survey" +`_ from 1985 to predict +wage as a function of various features such as experience, age, or education. + +.. contents:: + :local: + :depth: 1 +""" + +print(__doc__) + +import numpy as np +import scipy as sp +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +############################################################################# +# The dataset: wages +# ------------------ +# +# We fetch the data from `OpenML `_. +# Note that setting the parameter `as_frame` to True will retrieve the data +# as a pandas dataframe. + +from sklearn.datasets import fetch_openml + +survey = fetch_openml(data_id=534, as_frame=True) + +############################################################################## +# Then, we identify features `X` and targets `y`: the column WAGE is our +# target variable (i.e., the variable which we want to predict). +# +X = survey.data[survey.feature_names] +X.describe(include="all") + +############################################################################## +# Note that the dataset contains categorical and numerical variables. +# We will need to take this into account when preprocessing the dataset +# thereafter. + +X.head() + +############################################################################## +# Our target for prediction: the wage. +# Wages are described as floating-point number in dollars per hour. +y = survey.target.values.ravel() +survey.target.head() + +############################################################################### +# We split the sample into a train and a test dataset. +# Only the train dataset will be used in the following exploratory analysis. 
+# This is a way to emulate a real situation where predictions are performed on +# an unknown target, and we don't want our analysis and decisions to be biased +# by our knowledge of the test data. + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=42 +) + +############################################################################## +# First, let's get some insights by looking at the variable distributions and +# at the pairwise relationships between them. Only numerical +# variables will be used. In the following plot, each dot represents a sample. +# +# .. _marginal_dependencies: + +train_dataset = X_train.copy() +train_dataset.insert(0, "WAGE", y_train) +_ = sns.pairplot(train_dataset, kind='reg', diag_kind='kde') + +############################################################################## +# Looking closely at the WAGE distribution reveals that it has a +# long tail. For this reason, we should take its logarithm +# to turn it approximately into a normal distribution (linear models such +# as ridge or lasso work best for a normal distribution of error). +# +# The WAGE is increasing when EDUCATION is increasing. +# Note that the dependence between WAGE and EDUCATION +# represented here is a marginal dependence, i.e., it describes the behavior +# of a specific variable without keeping the others fixed. +# +# Also, the EXPERIENCE and AGE are strongly linearly correlated. +# +# .. _the-pipeline: +# +# The machine-learning pipeline +# ----------------------------- +# +# To design our machine-learning pipeline, we first manually +# check the type of data that we are dealing with: + +survey.data.info() + +############################################################################# +# As seen previously, the dataset contains columns with different data types +# and we need to apply a specific preprocessing for each data types. +# In particular categorical variables cannot be included in linear model if not +# coded as integers first. In addition, to avoid categorical features to be +# treated as ordered values, we need to one-hot-encode them. +# Our pre-processor will +# +# - one-hot encode (i.e., generate a column by category) the categorical +# columns; +# - as a first approach (we will see after how the normalisation of numerical +# values will affect our discussion), keep numerical values as they are. + +from sklearn.compose import make_column_transformer +from sklearn.preprocessing import OneHotEncoder + +categorical_columns = ['RACE', 'OCCUPATION', 'SECTOR', + 'MARR', 'UNION', 'SEX', 'SOUTH'] +numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE'] + +preprocessor = make_column_transformer( + (OneHotEncoder(drop='if_binary'), categorical_columns), + remainder='passthrough' +) + +############################################################################## +# To describe the dataset as a linear model we use a ridge regressor +# with a very small regularization and to model the logarithm of the WAGE. + + +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import Ridge +from sklearn.compose import TransformedTargetRegressor + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=Ridge(alpha=1e-10), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +############################################################################## +# Processing the dataset +# ---------------------- +# +# First, we fit the model. 
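Before the fit below, it is worth spelling out what `TransformedTargetRegressor` does here: the inner `Ridge` is trained on `func(y)` (the log10 of the wage) and predictions are mapped back through `inverse_func`. A minimal, self-contained sketch on a synthetic target (the toy data below is hypothetical, not the survey data):

import numpy as np
from scipy.special import exp10
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X_toy = rng.rand(100, 1)
y_toy = 10 ** (2 * X_toy.ravel())  # strictly positive, long-tailed target

model_toy = TransformedTargetRegressor(
    regressor=Ridge(alpha=1e-10),
    func=np.log10,        # the inner regressor is fitted on log10(y)
    inverse_func=exp10    # predictions are mapped back to the original scale
).fit(X_toy, y_toy)

# The inner regressor saw log10(y); its coefficient is on the log10 scale.
print(model_toy.regressor_.coef_)    # roughly [2.]
# predict() already returns values on the original (dollar-like) scale.
print(model_toy.predict(X_toy[:3]))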
+ +_ = model.fit(X_train, y_train) + +############################################################################## +# Then we check the performance of the computed model plotting its predictions +# on the test set and computing, +# for example, the median absolute error of the model. + +from sklearn.metrics import median_absolute_error + +y_pred = model.predict(X_train) + +mae = median_absolute_error(y_train, y_pred) +string_score = f'MAE on training set: {mae:.2f} $/hour' +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' +fig, ax = plt.subplots(figsize=(5, 5)) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") +plt.text(3, 20, string_score) +plt.title('Ridge model, small regularization') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +_ = plt.ylim([0, 27]) + +############################################################################## +# The model learnt is far from being a good model making accurate predictions: +# this is obvious when looking at the plot above, where good predictions +# should lie on the red line. +# +# In the following section, we will interpret the coefficients of the model. +# While we do so, we should keep in mind that any conclusion we draw is +# about the model that we build, rather than about the true (real-world) +# generative process of the data. +# +# Interpreting coefficients: scale matters +# --------------------------------------------- +# +# First of all, we can take a look to the values of the coefficients of the +# regressor we have fitted. + +feature_names = (model.named_steps['columntransformer'] + .named_transformers_['onehotencoder'] + .get_feature_names(input_features=categorical_columns)) +feature_names = np.concatenate( + [feature_names, numerical_columns]) + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) + +coefs + +############################################################################## +# The AGE coefficient is expressed in "dollars/hour per living years" while the +# EDUCATION one is expressed in "dollars/hour per years of education". This +# representation of the coefficients has the benefit of making clear the +# practical predictions of the model: an increase of :math:`1` year in AGE +# means a decrease of :math:`0.030867` dollars/hour, while an increase of +# :math:`1` year in EDUCATION means an increase of :math:`0.054699` +# dollars/hour. On the other hand, categorical variables (as UNION or SEX) are +# adimensional numbers taking either the value 0 or 1. Their coefficients +# are expressed in dollars/hour. Then, we cannot compare the magnitude of +# different coefficients since the features have different natural scales, and +# hence value ranges, because of their different unit of measure. This is more +# visible if we plot the coefficients. + +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################### +# Indeed, from the plot above the most important factor in determining WAGE +# appears to be the +# variable UNION, even if our intuition might tell us that variables +# like EXPERIENCE should have more impact. 
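The "dollars/hour per year" reading given above can be made concrete by perturbing a single feature on one row while leaving the others untouched and comparing the predictions. The sketch below assumes the fitted `model` and the `X_test` dataframe from this example; because the target is modelled on a log10 scale, the observed difference in predicted dollars/hour is close to, but not exactly, the raw coefficient.

# Take one test row, and the same row with AGE increased by one year while
# every other feature is left unchanged.
row = X_test.iloc[[0]].copy()
row_older = row.copy()
row_older['AGE'] += 1

pred = model.predict(row)[0]
pred_older = model.predict(row_older)[0]
print(f"predicted wage:           {pred:.3f} $/hour")
print(f"predicted wage (AGE + 1): {pred_older:.3f} $/hour")
# The gap reflects the AGE coefficient *conditional* on all other features,
# mapped back through the log10 target transformation.
print(f"difference:               {pred_older - pred:+.3f} $/hour")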
+
+#
+# Looking at the coefficient plot to gauge feature importance can be
+# misleading as some of them vary on a small scale, while others, like
+# AGE, vary a lot more, spanning several decades.
+#
+# This is visible if we compare the standard deviations of the different
+# features.
+
+X_train_preprocessed = pd.DataFrame(
+    model.named_steps['columntransformer'].transform(X_train),
+    columns=feature_names
+)
+
+X_train_preprocessed.std(axis=0).plot(kind='barh', figsize=(9, 7))
+plt.title('Features std. dev.')
+plt.subplots_adjust(left=.3)
+
+###############################################################################
+# Multiplying the coefficients by the standard deviation of the related
+# feature would reduce all the coefficients to the same unit of measure.
+# As we will see :ref:`after <scaling_num>`, this is equivalent to normalizing
+# the numerical variables by their standard deviation,
+# as :math:`y = \sum{coef_i \times X_i} =
+# \sum{(coef_i \times std_i) \times (X_i / std_i)}`.
+#
+# In that way, we emphasize that the
+# greater the variance of a feature, the larger the weight of the corresponding
+# coefficient on the output, all else being equal.
+
+coefs = pd.DataFrame(
+    model.named_steps['transformedtargetregressor'].regressor_.coef_ *
+    X_train_preprocessed.std(axis=0),
+    columns=['Coefficient importance'], index=feature_names
+)
+coefs.plot(kind='barh', figsize=(9, 7))
+plt.title('Ridge model, small regularization')
+plt.axvline(x=0, color='.5')
+plt.subplots_adjust(left=.3)
+
+###############################################################################
+# Now that the coefficients have been scaled, we can safely compare them.
+#
+# .. warning::
+#
+#   Why does the plot above suggest that an increase in age leads to a
+#   decrease in wage? Why does the :ref:`initial pairplot
+#   <marginal_dependencies>` tell the opposite?
+#
+# The plot above tells us about dependencies between a specific feature and
+# the target when all other features remain constant, i.e., **conditional
+# dependencies**. An increase in AGE will induce a decrease
+# in WAGE when all other features remain constant. On the contrary, an
+# increase in EXPERIENCE will induce an increase in WAGE when all
+# other features remain constant.
+# Also, AGE, EXPERIENCE and EDUCATION are the three variables that most
+# influence the model.
+#
+# Checking the variability of the coefficients
+# --------------------------------------------
+#
+# We can check the coefficient variability through cross-validation:
+# it is a form of data perturbation (related to
+# `resampling `_).
+#
+# If coefficients vary significantly when changing the input dataset,
+# their robustness is not guaranteed, and they should probably be
+# interpreted with caution.
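Before moving on to the cross-validation analysis just below, the identity used above, sum_i coef_i * X_i = sum_i (coef_i * std_i) * (X_i / std_i), can be verified numerically on any data. A short self-contained sketch with synthetic features on very different scales:

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(200, 3)) * [1.0, 10.0, 100.0]  # very different scales
coef_demo = np.array([0.5, -0.2, 0.03])
std_demo = X_demo.std(axis=0)

# Linear predictions with raw features and raw coefficients...
pred_raw = X_demo @ coef_demo
# ...are identical to predictions with standardized features and coefficients
# rescaled by the feature standard deviations.
pred_scaled = (X_demo / std_demo) @ (coef_demo * std_demo)
print(np.allclose(pred_raw, pred_scaled))  # True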
+ +from sklearn.model_selection import cross_validate +from sklearn.model_selection import RepeatedKFold + +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std(axis=0) + for est in cv_model['estimator']], + columns=feature_names +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) +plt.axvline(x=0, color='.5') +plt.xlabel('Coefficient importance') +plt.title('Coefficient importance and its variability') +plt.subplots_adjust(left=.3) + +############################################################################### +# The problem of correlated variables +# ----------------------------------- +# +# The AGE and EXPERIENCE coefficients are affected by strong variability which +# might be due to the collinearity between the 2 features: as AGE and +# EXPERIENCE vary together in the data, their effect is difficult to tease +# apart. +# +# To verify this interpretation we plot the variability of the AGE and +# EXPERIENCE coefficient. +# +# .. _covariation: + +plt.ylabel('Age coefficient') +plt.xlabel('Experience coefficient') +plt.grid(True) +plt.xlim(-0.4, 0.5) +plt.ylim(-0.4, 0.5) +plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE ' + 'across folds') + +############################################################################### +# Two regions are populated: when the EXPERIENCE coefficient is +# positive the AGE one is negative and viceversa. +# +# To go further we remove one of the 2 features and check what is the impact +# on the model stability. + +column_to_drop = ['AGE'] + +cv_model = cross_validate( + model, X.drop(columns=column_to_drop), y, + cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.drop(columns=column_to_drop).std(axis=0) + for est in cv_model['estimator']], + columns=feature_names[:-1] +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) +plt.axvline(x=0, color='.5') +plt.title('Coefficient importance and its variability') +plt.xlabel('Coefficient importance') +plt.subplots_adjust(left=.3) + +############################################################################### +# The estimation of the EXPERIENCE coefficient is now less variable and +# remain important for all models trained during cross-validation. +# +# .. _scaling_num: +# +# Preprocessing numerical variables +# --------------------------------- +# +# As said above (see ":ref:`the-pipeline`"), we could also choose to scale +# numerical values before training the model. +# This can be useful to apply a similar amount regularization to all of them +# in the Ridge. +# The preprocessor is redefined in order to subtract the mean and scale +# variables to unit variance. + +from sklearn.preprocessing import StandardScaler + +preprocessor = make_column_transformer( + (OneHotEncoder(drop='if_binary'), categorical_columns), + (StandardScaler(), numerical_columns), + remainder='passthrough' +) + +############################################################################### +# The model will stay unchanged. 
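As a side check of the collinearity discussed in the previous section, the strength of the AGE/EXPERIENCE relationship can be quantified directly on the training data. The sketch below assumes the `train_dataset` dataframe built earlier in this example.

# Pearson correlations between the numerical variables in the training data;
# AGE and EXPERIENCE both grow with time, hence the very strong correlation
# that makes their individual effects hard to separate.
corr = train_dataset[['AGE', 'EXPERIENCE', 'EDUCATION']].corr()
print(corr)
print(f"AGE / EXPERIENCE correlation: {corr.loc['AGE', 'EXPERIENCE']:.2f}")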
+ +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=Ridge(alpha=1e-10), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +_ = model.fit(X_train, y_train) + +############################################################################## +# Again, we check the performance of the computed +# model using, for example, the median absolute error of the model and the R +# squared coefficient. + +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = f'MAE on training set: {mae:.2f} $/hour' +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' +fig, ax = plt.subplots(figsize=(6, 6)) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") + +plt.text(3, 20, string_score) + +plt.title('Ridge model, small regularization, normalized variables') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +_ = plt.ylim([0, 27]) + +############################################################################## +# For the coefficient analysis, scaling is not needed this time. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization, normalized variables') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################## +# We now inspect the coefficients across several cross-validation folds. + +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ + for est in cv_model['estimator']], + columns=feature_names +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) +plt.axvline(x=0, color='.5') +plt.title('Coefficient variability') +plt.subplots_adjust(left=.3) + +############################################################################## +# The result is quite similar to the non-normalized case. +# +# Linear models with regularization +# --------------------------------- +# +# In machine-learning practice, Ridge Regression is more often used with +# non-negligible regularization. +# +# Above, we limited this regularization to a very little amount. +# Regularization improves the conditioning of the problem and reduces the +# variance of the estimates. RidgeCV applies cross validation in order to +# determine which value of the regularization parameter (`alpha`) is best +# suited for prediction. + +from sklearn.linear_model import RidgeCV + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=RidgeCV(alphas=np.logspace(-10, 10, 21)), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +_ = model.fit(X_train, y_train) + +############################################################################## +# First we check which value of :math:`\alpha` has been selected. + +model[-1].regressor_.alpha_ + +############################################################################## +# Then we check the quality of the predictions. 
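The text above mentions the R squared coefficient, but the snippet that follows only reports the median absolute error. If desired, R² can be added with `sklearn.metrics.r2_score`; a sketch assuming the fitted `model` and the train/test splits from this example:

from sklearn.metrics import r2_score

# Complement the median absolute error reported below with R².
print(f"R2 on training set: {r2_score(y_train, model.predict(X_train)):.3f}")
print(f"R2 on testing set:  {r2_score(y_test, model.predict(X_test)):.3f}")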
+ +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = f'MAE on training set: {mae:.2f} $/hour' +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' + +fig, ax = plt.subplots(figsize=(6, 6)) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") + +plt.text(3, 20, string_score) + +plt.title('Ridge model, regularization, normalized variables') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +_ = plt.ylim([0, 27]) + +############################################################################## +# The ability to reproduce the data of the regularized model is similar to +# the one of the non-regularized model. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, regularization, normalized variables') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################## +# The coefficients are significantly different. +# AGE and EXPERIENCE coefficients are both positive but they now have less +# influence on the prediction. +# +# The regularization reduces the influence of correlated +# variables on the model because the weight is shared between the two +# predictive variables, so neither alone would have strong weights. +# +# On the other hand, the weights obtained with regularization are more +# stable (see the :ref:`ridge_regression` User Guide section). This +# increased stability is visible from the plot, obtained from data +# perturbations, in a cross validation. This plot can be compared with +# the :ref:`previous one`. + +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std(axis=0) + for est in cv_model['estimator']], + columns=feature_names +) + +plt.ylabel('Age coefficient') +plt.xlabel('Experience coefficient') +plt.grid(True) +plt.xlim(-0.4, 0.5) +plt.ylim(-0.4, 0.5) +plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE ' + 'across folds') + +############################################################################## +# Linear models with sparse coefficients +# -------------------------------------- +# +# Another possibility to take into account correlated variables in the dataset, +# is to estimate sparse coefficients. In some way we already did it manually +# when we dropped the AGE column in a previous Ridge estimation. +# +# Lasso models (see the :ref:`lasso` User Guide section) estimates sparse +# coefficients. LassoCV applies cross validation in order to +# determine which value of the regularization parameter (`alpha`) is best +# suited for the model estimation. + +from sklearn.linear_model import LassoCV + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=LassoCV(alphas=np.logspace(-10, 10, 21), max_iter=100000), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +_ = model.fit(X_train, y_train) + +############################################################################## +# First we verify which value of :math:`\alpha` has been selected. 
+
+model[-1].regressor_.alpha_
+
+##############################################################################
+# Then we check the quality of the predictions.
+
+y_pred = model.predict(X_train)
+mae = median_absolute_error(y_train, y_pred)
+string_score = f'MAE on training set: {mae:.2f} $/hour'
+y_pred = model.predict(X_test)
+mae = median_absolute_error(y_test, y_pred)
+string_score += f'\nMAE on testing set: {mae:.2f} $/hour'
+
+fig, ax = plt.subplots(figsize=(6, 6))
+plt.scatter(y_test, y_pred)
+ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red")
+
+plt.text(3, 20, string_score)
+
+plt.title('Lasso model, regularization, normalized variables')
+plt.ylabel('Model predictions')
+plt.xlabel('Truths')
+plt.xlim([0, 27])
+_ = plt.ylim([0, 27])
+
+##############################################################################
+# For our dataset, again the model is not very predictive.
+
+coefs = pd.DataFrame(
+    model.named_steps['transformedtargetregressor'].regressor_.coef_,
+    columns=['Coefficients'], index=feature_names
+)
+coefs.plot(kind='barh', figsize=(9, 7))
+plt.title('Lasso model, regularization, normalized variables')
+plt.axvline(x=0, color='.5')
+plt.subplots_adjust(left=.3)
+
+#############################################################################
+# A Lasso model identifies the correlation between
+# AGE and EXPERIENCE and suppresses one of them for the sake of the prediction.
+#
+# It is important to keep in mind that the coefficients that have been
+# dropped may still be related to the outcome by themselves: the model
+# chose to suppress them because they bring little or no additional
+# information on top of the other features. Additionally, this selection
+# is unstable for correlated features, and should be interpreted with
+# caution.
+#
+# Lessons learned
+# ---------------
+#
+# * Coefficients must be scaled to the same unit of measure to retrieve
+#   feature importance. Scaling them with the standard deviation of the
+#   feature is a useful proxy.
+# * Coefficients in multivariate linear models represent the dependency
+#   between a given feature and the target, **conditional** on the other
+#   features.
+# * Correlated features induce instabilities in the coefficients of linear
+#   models and their effects cannot be well teased apart.
+# * Different linear models respond differently to feature correlation, and
+#   their coefficients can vary significantly from one another.
+# * Inspecting coefficients across the folds of a cross-validation loop
+#   gives an idea of their stability.
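Returning to the Lasso fit above: to make the feature selection explicit, one can list the coefficients that were driven exactly to zero by the L1 penalty. A sketch assuming the fitted Lasso pipeline and the `feature_names` array from this example:

import numpy as np

lasso_coefs = model.named_steps['transformedtargetregressor'].regressor_.coef_

# Features whose coefficients the L1 penalty set exactly to zero.
dropped = np.asarray(feature_names)[lasso_coefs == 0]
print(f"{dropped.size} feature(s) dropped by the Lasso: {list(dropped)}")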
diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 9de22673606f2..1cb482f5e970b 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -16,7 +16,7 @@ from sklearn.datasets import make_classification from sklearn.neighbors import NeighborhoodComponentsAnalysis from matplotlib import cm -from sklearn.utils.fixes import logsumexp +from scipy.special import logsumexp print(__doc__) diff --git a/pyproject.toml b/pyproject.toml index 2547baae5874d..6011f5d2ea1ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,11 @@ requires = [ "setuptools", "wheel", "Cython>=0.28.5", - "numpy>=1.13.3", + "numpy==1.13.3; python_version=='3.6' and platform_system!='AIX'", + "numpy==1.14.5; python_version=='3.7' and platform_system!='AIX'", + "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'", + "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", + "numpy==1.16.0; python_version=='3.7' and platform_system=='AIX'", + "numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'", "scipy>=0.19.1", ] diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 59aa672533524..7f203a079f22b 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -13,7 +13,6 @@ See http://scikit-learn.org for complete documentation. """ import sys -import re import logging import os @@ -61,7 +60,8 @@ # This variable is injected in the __builtins__ by the build # process. It is used to enable importing subpackages of sklearn when # the binaries are not built - __SKLEARN_SETUP__ + # mypy error: Cannot determine type of '__SKLEARN_SETUP__' + __SKLEARN_SETUP__ # type: ignore except NameError: __SKLEARN_SETUP__ = False diff --git a/sklearn/_build_utils/deprecated_modules.py b/sklearn/_build_utils/deprecated_modules.py index 045dc3d297be0..1586f9e991a8d 100644 --- a/sklearn/_build_utils/deprecated_modules.py +++ b/sklearn/_build_utils/deprecated_modules.py @@ -271,7 +271,8 @@ _FILE_CONTENT_TEMPLATE = """ # THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py import sys -from . import {new_module_name} +# mypy error: Module X has no attribute y (typically for C extensions) +from . 
import {new_module_name} # type: ignore from {relative_dots}externals._pep562 import Pep562 from {relative_dots}utils.deprecation import _raise_dep_warning_if_not_pytest diff --git a/sklearn/calibration.py b/sklearn/calibration.py index a5490efa28c0a..8a719d49bd6de 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -21,7 +21,7 @@ from .base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone, MetaEstimatorMixin) from .preprocessing import label_binarize, LabelBinarizer -from .utils import check_X_y, check_array, indexable, column_or_1d +from .utils import check_array, indexable, column_or_1d from .utils.validation import check_is_fitted, check_consistent_length from .utils.validation import _check_sample_weight from .isotonic import IsotonicRegression diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index c98272d6aae33..bfc27a84c7b76 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -18,8 +18,7 @@ from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) -from ..utils.validation import (assert_all_finite, check_array, - _deprecate_positional_args) +from ..utils.validation import assert_all_finite, _deprecate_positional_args __all__ = ['SpectralCoclustering', diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 52c962052f9bc..5422b10cc4dd7 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -14,7 +14,6 @@ from scipy import sparse from ..base import BaseEstimator, ClusterMixin -from ..utils import check_array from ..utils.validation import _check_sample_weight, _deprecate_positional_args from ..neighbors import NearestNeighbors @@ -30,19 +29,19 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, Parameters ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) + X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. - eps : float, optional + eps : float, default=0.5 The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples : int, optional + min_samples : int, default=5 The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. @@ -56,33 +55,33 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, X may be a :term:`sparse graph `, in which case only "nonzero" elements may be considered neighbors. - metric_params : dict, optional + metric_params : dict, default=None Additional keyword arguments for the metric function. .. versionadded:: 0.19 - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. See NearestNeighbors module documentation for details. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or cKDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. 
The optimal value depends on the nature of the problem. - p : float, optional + p : float, default=2 The power of the Minkowski metric to be used to calculate distance between points. - sample_weight : array, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. @@ -91,10 +90,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, Returns ------- - core_samples : array [n_core_samples] + core_samples : ndarray of shape (n_core_samples,) Indices of core samples. - labels : array [n_samples] + labels : ndarray of shape (n_samples,) Cluster labels for each point. Noisy samples are given the label -1. See also @@ -201,7 +200,7 @@ class DBSCAN(ClusterMixin, BaseEstimator): The power of the Minkowski metric to be used to calculate distance between points. - n_jobs : int or None, default=None + n_jobs : int, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -209,13 +208,13 @@ class DBSCAN(ClusterMixin, BaseEstimator): Attributes ---------- - core_sample_indices_ : array, shape = [n_core_samples] + core_sample_indices_ : ndarray of shape (n_core_samples,) Indices of core samples. - components_ : array, shape = [n_core_samples, n_features] + components_ : ndarray of shape (n_core_samples, n_features) Copy of each core sample found by training. - labels_ : array, shape = [n_samples] + labels_ : ndarray of shape (n_samples) Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1. @@ -289,13 +288,13 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features), or \ + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ (n_samples, n_samples) Training instances to cluster, or distances between instances if ``metric='precomputed'``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. - sample_weight : array, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with a negative weight may inhibit its eps-neighbor from being core. @@ -368,13 +367,13 @@ def fit_predict(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features), or \ + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ (n_samples, n_samples) Training instances to cluster, or distances between instances if ``metric='precomputed'``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. 
- sample_weight : array, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with a negative weight may inhibit its eps-neighbor from being core. @@ -385,7 +384,7 @@ def fit_predict(self, X, y=None, sample_weight=None): Returns ------- - labels : ndarray, shape (n_samples,) + labels : ndarray of shape (n_samples,) Cluster labels. Noisy samples are given the label -1. """ self.fit(X, sample_weight=sample_weight) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index e95c8fe0490a4..65c8871fbb456 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -10,6 +10,7 @@ import numpy as np cimport numpy as np +from threadpoolctl import threadpool_limits cimport cython from cython cimport floating from cython.parallel import prange, parallel @@ -29,7 +30,19 @@ from ._k_means_fast cimport _center_shift np.import_array() -def _init_bounds_dense( +# Threadpoolctl wrappers to limit the number of threads in second level of +# nested parallelism (i.e. BLAS) to avoid oversubsciption. +def elkan_iter_chunked_dense(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _elkan_iter_chunked_dense(*args, **kwargs) + + +def elkan_iter_chunked_sparse(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _elkan_iter_chunked_sparse(*args, **kwargs) + + +def init_bounds_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[:, ::1] centers, # IN floating[:, ::1] center_half_distances, # IN @@ -99,7 +112,7 @@ def _init_bounds_dense( upper_bounds[i] = min_dist -def _init_bounds_sparse( +def init_bounds_sparse( X, # IN floating[:, ::1] centers, # IN floating[:, ::1] center_half_distances, # IN diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 93e2c6f0b9c89..4a33b0c5c8a02 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -8,6 +8,7 @@ import numpy as np cimport numpy as np +from threadpoolctl import threadpool_limits from cython cimport floating from cython.parallel import prange, parallel from libc.stdlib cimport malloc, calloc, free @@ -25,6 +26,18 @@ from ._k_means_fast cimport _average_centers, _center_shift np.import_array() +# Threadpoolctl wrappers to limit the number of threads in second level of +# nested parallelism (i.e. BLAS) to avoid oversubsciption. 
+def lloyd_iter_chunked_dense(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _lloyd_iter_chunked_dense(*args, **kwargs) + + +def lloyd_iter_chunked_sparse(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _lloyd_iter_chunked_sparse(*args, **kwargs) + + def _lloyd_iter_chunked_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 27ec0e5f388f6..b185983c4b0f9 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -15,7 +15,6 @@ import numpy as np import scipy.sparse as sp -from threadpoolctl import threadpool_limits from ..base import BaseEstimator, ClusterMixin, TransformerMixin from ..metrics.pairwise import euclidean_distances @@ -32,12 +31,12 @@ from ._k_means_fast import _inertia_dense from ._k_means_fast import _inertia_sparse from ._k_means_fast import _mini_batch_update_csr -from ._k_means_lloyd import _lloyd_iter_chunked_dense -from ._k_means_lloyd import _lloyd_iter_chunked_sparse -from ._k_means_elkan import _init_bounds_dense -from ._k_means_elkan import _init_bounds_sparse -from ._k_means_elkan import _elkan_iter_chunked_dense -from ._k_means_elkan import _elkan_iter_chunked_sparse +from ._k_means_lloyd import lloyd_iter_chunked_dense +from ._k_means_lloyd import lloyd_iter_chunked_sparse +from ._k_means_elkan import init_bounds_dense +from ._k_means_elkan import init_bounds_sparse +from ._k_means_elkan import elkan_iter_chunked_dense +from ._k_means_elkan import elkan_iter_chunked_sparse ############################################################################### @@ -420,12 +419,12 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, center_shift = np.zeros(n_clusters, dtype=X.dtype) if sp.issparse(X): - init_bounds = _init_bounds_sparse - elkan_iter = _elkan_iter_chunked_sparse + init_bounds = init_bounds_sparse + elkan_iter = elkan_iter_chunked_sparse _inertia = _inertia_sparse else: - init_bounds = _init_bounds_dense - elkan_iter = _elkan_iter_chunked_dense + init_bounds = init_bounds_dense + elkan_iter = elkan_iter_chunked_dense _inertia = _inertia_dense init_bounds(X, centers, center_half_distances, @@ -559,10 +558,10 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, center_shift = np.zeros(n_clusters, dtype=X.dtype) if sp.issparse(X): - lloyd_iter = _lloyd_iter_chunked_sparse + lloyd_iter = lloyd_iter_chunked_sparse _inertia = _inertia_sparse else: - lloyd_iter = _lloyd_iter_chunked_dense + lloyd_iter = lloyd_iter_chunked_dense _inertia = _inertia_dense for i in range(max_iter): @@ -594,7 +593,8 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, return labels, inertia, centers, i + 1 -def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): +def _labels_inertia(X, sample_weight, x_squared_norms, centers, + n_threads=None): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. @@ -615,7 +615,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): centers : ndarray, shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=1 + n_threads : int, default=None The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. 
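The wrappers added here rely on threadpoolctl's context manager; the same pattern can be used directly in plain Python to cap BLAS threads and avoid oversubscription when a routine is already parallelised at a higher level. A short, self-contained sketch (the matrix sizes are arbitrary):

import numpy as np
from threadpoolctl import threadpool_limits

a = np.random.rand(1000, 1000)
b = np.random.rand(1000, 1000)

# Inside this block BLAS libraries (OpenBLAS, MKL, BLIS, ...) are capped at a
# single thread, so an outer parallel loop (e.g. OpenMP or joblib) does not
# oversubscribe the machine.
with threadpool_limits(limits=1, user_api="blas"):
    c = a @ b

print(c.shape)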
@@ -631,16 +631,18 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): n_samples = X.shape[0] n_clusters = centers.shape[0] + n_threads = _openmp_effective_n_threads(n_threads) + sample_weight = _check_normalize_sample_weight(sample_weight, X) labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) if sp.issparse(X): - _labels = _lloyd_iter_chunked_sparse + _labels = lloyd_iter_chunked_sparse _inertia = _inertia_sparse else: - _labels = _lloyd_iter_chunked_dense + _labels = lloyd_iter_chunked_dense _inertia = _inertia_dense _labels(X, sample_weight, x_squared_norms, centers, centers, @@ -1033,22 +1035,19 @@ def fit(self, X, y=None, sample_weight=None): # seeds for the initializations of the kmeans runs. seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - # limit number of threads in second level of nested parallelism - # (i.e. BLAS) to avoid oversubsciption. - with threadpool_limits(limits=1, user_api="blas"): - for seed in seeds: - # run a k-means once - labels, inertia, centers, n_iter_ = kmeans_single( - X, sample_weight, self.n_clusters, max_iter=self.max_iter, - init=init, verbose=self.verbose, tol=tol, - x_squared_norms=x_squared_norms, random_state=seed, - n_threads=self._n_threads) - # determine if these results are the best so far - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia - best_n_iter = n_iter_ + for seed in seeds: + # run a k-means once + labels, inertia, centers, n_iter_ = kmeans_single( + X, sample_weight, self.n_clusters, max_iter=self.max_iter, + init=init, verbose=self.verbose, tol=tol, + x_squared_norms=x_squared_norms, random_state=seed, + n_threads=self._n_threads) + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ if not sp.issparse(X): if not self.copy_x: diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 92322b0ab0bfd..c8ca3ec569a88 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -14,7 +14,6 @@ import warnings import numpy as np -from ..utils import check_array from ..utils import gen_batches, get_chunk_n_rows from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 2faddabefa157..3eac2b84b74fd 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -11,7 +11,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array -from ..utils.validation import check_array, _deprecate_positional_args +from ..utils.validation import _deprecate_positional_args from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph, NearestNeighbors from ..manifold import spectral_embedding diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 7d5a920600d7d..6e3e664c622a8 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -9,7 +9,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from 
sklearn.utils._testing import SkipTest from sklearn.base import BaseEstimator, BiclusterMixin diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index e94757bca6993..903c63a00fd22 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -23,6 +23,7 @@ from ..utils import _determine_key_type from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted +from ..utils.validation import _deprecate_positional_args __all__ = [ @@ -171,8 +172,9 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): """ _required_parameters = ['transformers'] + @_deprecate_positional_args def __init__(self, - transformers, + transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, @@ -826,8 +828,9 @@ class make_column_selector: [-0.30151134, 0. , 1. , 0. ], [ 0.90453403, 0. , 0. , 1. ]]) """ - - def __init__(self, pattern=None, dtype_include=None, dtype_exclude=None): + @_deprecate_positional_args + def __init__(self, pattern=None, *, dtype_include=None, + dtype_exclude=None): self.pattern = pattern self.dtype_include = dtype_include self.dtype_exclude = dtype_exclude diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 27f4ef63edf68..d8c062ed423a2 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,6 +10,7 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, _safe_indexing from ..preprocessing import FunctionTransformer +from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -106,7 +107,8 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): `. """ - def __init__(self, regressor=None, transformer=None, + @_deprecate_positional_args + def __init__(self, regressor=None, *, transformer=None, func=None, inverse_func=None, check_inverse=True): self.regressor = regressor self.transformer = transformer diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 77ff9adb7fc0c..1d0d93db75101 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -18,9 +18,10 @@ from . import empirical_covariance, EmpiricalCovariance, log_likelihood from ..exceptions import ConvergenceWarning -from ..utils.validation import check_random_state, check_array +from ..utils.validation import check_random_state from ..utils.validation import _deprecate_positional_args -from ..linear_model import _cd_fast as cd_fast +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from ..linear_model import _cd_fast as cd_fast # type: ignore from ..linear_model import lars_path_gram from ..model_selection import check_cv, cross_val_score diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 06e1b4f180347..fcc13a84e803e 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -534,6 +534,27 @@ class OAS(EmpiricalCovariance): coefficient in the convex combination used for the computation of the shrunk estimate. Range is [0, 1]. + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import OAS + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... 
size=500) + >>> oas = OAS().fit(X) + >>> oas.covariance_ + array([[0.7533..., 0.2763...], + [0.2763..., 0.3964...]]) + >>> oas.precision_ + array([[ 1.7833..., -1.2431... ], + [-1.2431..., 3.3889...]]) + >>> oas.shrinkage_ + 0.0195... + Notes ----- The regularised covariance is: diff --git a/sklearn/cross_decomposition/_cca.py b/sklearn/cross_decomposition/_cca.py index bd2e933339228..9d60b59cd07f1 100644 --- a/sklearn/cross_decomposition/_cca.py +++ b/sklearn/cross_decomposition/_cca.py @@ -1,5 +1,6 @@ from ._pls import _PLS from ..base import _UnstableArchMixin +from ..utils.validation import _deprecate_positional_args __all__ = ['CCA'] @@ -102,7 +103,8 @@ class CCA(_UnstableArchMixin, _PLS): PLSSVD """ - def __init__(self, n_components=2, scale=True, + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__(n_components=n_components, scale=scale, deflation_mode="canonical", mode="B", diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 88951d18468d8..508448c3ede39 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -17,6 +17,7 @@ from ..utils import check_array, check_consistent_length from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning __all__ = ['PLSCanonical', 'PLSRegression', 'PLSSVD'] @@ -248,7 +249,8 @@ class _PLS(TransformerMixin, RegressorMixin, MultiOutputMixin, BaseEstimator, """ @abstractmethod - def __init__(self, n_components=2, scale=True, deflation_mode="regression", + def __init__(self, n_components=2, *, scale=True, + deflation_mode="regression", mode="A", algorithm="nipals", norm_y_weights=False, max_iter=500, tol=1e-06, copy=True): self.n_components = n_components @@ -650,8 +652,8 @@ class PLSRegression(_PLS): Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris: Editions Technic. 
""" - - def __init__(self, n_components=2, scale=True, + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__( n_components=n_components, scale=scale, @@ -799,8 +801,8 @@ class PLSCanonical(_PLS): CCA PLSSVD """ - - def __init__(self, n_components=2, scale=True, algorithm="nipals", + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, algorithm="nipals", max_iter=500, tol=1e-06, copy=True): super().__init__( n_components=n_components, scale=scale, @@ -868,8 +870,8 @@ class PLSSVD(TransformerMixin, BaseEstimator): PLSCanonical CCA """ - - def __init__(self, n_components=2, scale=True, copy=True): + @_deprecate_positional_args + def __init__(self, n_components=2, *, scale=True, copy=True): self.n_components = n_components self.scale = scale self.copy = copy diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index 2d788a2cf6271..f47dcc8e8f22f 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -1,4 +1,3 @@ -import pytest import numpy as np from numpy.testing import assert_approx_equal diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 909470f980a5e..d481288133991 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -9,7 +9,6 @@ import os import csv import shutil -import warnings from collections import namedtuple from os import environ, listdir, makedirs from os.path import dirname, exists, expanduser, isdir, join, splitext diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index b5efd68adbd1c..3dc3833db3417 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -393,7 +393,7 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, Download it if necessary. ================= ======================= - Classes 5749 + Classes 2 Samples total 13233 Dimensionality 5828 Features real, between 0 and 255 diff --git a/sklearn/datasets/descr/breast_cancer.rst b/sklearn/datasets/descr/breast_cancer.rst index fea6b6f017c16..bc4d60b9a363d 100644 --- a/sklearn/datasets/descr/breast_cancer.rst +++ b/sklearn/datasets/descr/breast_cancer.rst @@ -18,13 +18,13 @@ Breast cancer wisconsin (diagnostic) dataset - compactness (perimeter^2 / area - 1.0) - concavity (severity of concave portions of the contour) - concave points (number of concave portions of the contour) - - symmetry + - symmetry - fractal dimension ("coastline approximation" - 1) The mean, standard error, and "worst" or largest (mean of the three - largest values) of these features were computed for each image, - resulting in 30 features. For instance, field 3 is Mean Radius, field - 13 is Radius SE, field 23 is Worst Radius. + worst/largest values) of these features were computed for each image, + resulting in 30 features. For instance, field 0 is Mean Radius, field + 10 is Radius SE, field 20 is Worst Radius. 
- class: - WDBC-Malignant diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index a58bdc9ed644d..224538b181696 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -8,7 +8,6 @@ from functools import partial import pytest -import joblib import numpy as np from sklearn.datasets import get_data_home diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 9b7ad28f9f235..6b2d76ce2143b 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -17,7 +17,7 @@ from ..utils import (check_array, check_random_state, gen_even_slices, gen_batches) from ..utils.extmath import randomized_svd, row_norms -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -1013,7 +1013,8 @@ class SparseCoder(SparseCodingMixin, BaseEstimator): """ _required_parameters = ["dictionary"] - def __init__(self, dictionary, transform_algorithm='omp', + @_deprecate_positional_args + def __init__(self, dictionary, *, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, split_sign=False, n_jobs=None, positive_code=False, transform_max_iter=1000): @@ -1183,7 +1184,8 @@ class DictionaryLearning(SparseCodingMixin, BaseEstimator): SparsePCA MiniBatchSparsePCA """ - def __init__(self, n_components=None, alpha=1, max_iter=1000, tol=1e-8, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, max_iter=1000, tol=1e-8, fit_algorithm='lars', transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, n_jobs=None, code_init=None, dict_init=None, verbose=False, @@ -1388,7 +1390,8 @@ class MiniBatchDictionaryLearning(SparseCodingMixin, BaseEstimator): MiniBatchSparsePCA """ - def __init__(self, n_components=None, alpha=1, n_iter=1000, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, n_iter=1000, fit_algorithm='lars', n_jobs=None, batch_size=3, shuffle=True, dict_init=None, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 7147fd452559c..a09b89bda6d6e 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -28,7 +28,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -138,7 +138,9 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): FastICA: Independent component analysis, a latent variable model with non-Gaussian latent variables. 
""" - def __init__(self, n_components=None, tol=1e-2, copy=True, max_iter=1000, + @_deprecate_positional_args + def __init__(self, n_components=None, *, tol=1e-2, copy=True, + max_iter=1000, noise_variance_init=None, svd_method='randomized', iterated_power=3, random_state=0): self.n_components = n_components diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index f9e3a148f6860..7329fbbe4be1f 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -20,6 +20,7 @@ from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args __all__ = ['fastica', 'FastICA'] @@ -390,7 +391,8 @@ def my_g(x): pp. 411-430* """ - def __init__(self, n_components=None, algorithm='parallel', whiten=True, + @_deprecate_positional_args + def __init__(self, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=1e-4, w_init=None, random_state=None): super().__init__() diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index ac535b58e7f5e..bc34c17326f19 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -10,6 +10,7 @@ from ._base import _BasePCA from ..utils import check_array, gen_batches from ..utils.extmath import svd_flip, _incremental_mean_and_var +from ..utils.validation import _deprecate_positional_args class IncrementalPCA(_BasePCA): @@ -163,8 +164,8 @@ class IncrementalPCA(_BasePCA): SparsePCA TruncatedSVD """ - - def __init__(self, n_components=None, whiten=False, copy=True, + @_deprecate_positional_args + def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None): self.n_components = n_components self.whiten = whiten diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index b1f83c8e0ff81..527f78d34bbb5 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -9,12 +9,12 @@ from ..utils import check_random_state from ..utils.extmath import svd_flip -from ..utils.validation import (check_is_fitted, check_array, - _check_psd_eigenvalues) +from ..utils.validation import check_is_fitted, _check_psd_eigenvalues from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels +from ..utils.validation import _deprecate_positional_args class KernelPCA(TransformerMixin, BaseEstimator): @@ -139,8 +139,8 @@ class KernelPCA(TransformerMixin, BaseEstimator): component analysis. In Advances in kernel methods, MIT Press, Cambridge, MA, USA 327-352. 
""" - - def __init__(self, n_components=None, kernel="linear", + @_deprecate_positional_args + def __init__(self, n_components=None, *, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None, alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', tol=0, max_iter=None, remove_zero_eig=False, @@ -217,7 +217,7 @@ def _fit_transform(self, K): # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, - np.empty_like(self.alphas_).T) + np.zeros_like(self.alphas_).T) # sort eigenvectors in descending order indices = self.lambdas_.argsort()[::-1] @@ -358,5 +358,6 @@ def inverse_transform(self, X): "the inverse transform is not available.") K = self._get_kernel(X, self.X_transformed_fit_) - + n_samples = self.X_transformed_fit_.shape[0] + K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index ba68e03a16191..641e68cd7fc8b 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -13,15 +13,14 @@ import numpy as np import scipy.sparse as sp -from scipy.special import gammaln +from scipy.special import gammaln, logsumexp from joblib import Parallel, delayed, effective_n_jobs from ..base import BaseEstimator, TransformerMixin -from ..utils import (check_random_state, check_array, - gen_batches, gen_even_slices) -from ..utils.fixes import logsumexp +from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._online_lda_fast import (mean_change, _dirichlet_expectation_1d, _dirichlet_expectation_2d) @@ -282,8 +281,8 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): https://github.com/blei-lab/onlineldavb """ - - def __init__(self, n_components=10, doc_topic_prior=None, + @_deprecate_positional_args + def __init__(self, n_components=10, *, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=.7, learning_offset=10., max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1e6, diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 86c9acddfea1e..f1385d21596e3 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -19,6 +19,7 @@ from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.validation import check_is_fitted, check_non_negative +from ..utils.validation import _deprecate_positional_args EPSILON = np.finfo(np.float32).eps @@ -501,7 +502,7 @@ def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0, rng = check_random_state(random_state) - for n_iter in range(max_iter): + for n_iter in range(1, max_iter + 1): violation = 0. # Update W @@ -512,7 +513,7 @@ def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0, violation += _update_coordinate_descent(X.T, Ht, W, l1_reg_H, l2_reg_H, shuffle, rng) - if n_iter == 0: + if n_iter == 1: violation_init = violation if violation_init == 0: @@ -1071,7 +1072,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, raise ValueError("Invalid solver parameter '%s'." % solver) if n_iter == max_iter and tol > 0: - warnings.warn("Maximum number of iteration %d reached. Increase it to" + warnings.warn("Maximum number of iterations %d reached. Increase it to" " improve convergence." 
% max_iter, ConvergenceWarning) return W, H, n_iter @@ -1232,8 +1233,8 @@ class NMF(TransformerMixin, BaseEstimator): Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix factorization with the beta-divergence. Neural Computation, 23(9). """ - - def __init__(self, n_components=None, init=None, solver='cd', + @_deprecate_positional_args + def __init__(self, n_components=None, *, init=None, solver='cd', beta_loss='frobenius', tol=1e-4, max_iter=200, random_state=None, alpha=0., l1_ratio=0., verbose=0, shuffle=False): diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 7a0140b01fc9b..87092d7ccd17e 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -25,24 +25,25 @@ from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args -def _assess_dimension(spectrum, rank, n_samples, n_features): - """Compute the likelihood of a rank ``rank`` dataset. +def _assess_dimension(spectrum, rank, n_samples): + """Compute the log-likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, dimf) having spectrum ``spectrum``. Parameters ---------- - spectrum : array of shape (n) + spectrum : array of shape (n_features) Data spectrum. rank : int - Tested rank value. + Tested rank value. It should be strictly lower than n_features, + otherwise the method isn't specified (division by zero in equation + (31) from the paper). n_samples : int Number of samples. - n_features : int - Number of features. Returns ------- @@ -54,45 +55,39 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): This implements the method of `Thomas P. Minka: Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604` """ - if rank > len(spectrum): - raise ValueError("The tested rank cannot exceed the rank of the" - " dataset") - spectrum_threshold = np.finfo(type(spectrum[0])).eps + n_features = spectrum.shape[0] + if not 1 <= rank < n_features: + raise ValueError("the tested rank should be in [1, n_features - 1]") + + eps = 1e-15 + + if spectrum[rank - 1] < eps: + # When the tested rank is associated with a small eigenvalue, there's + # no point in computing the log-likelihood: it's going to be very + # small and won't be the max anyway. Also, it can lead to numerical + # issues below when computing pa, in particular in log((spectrum[i] - + # spectrum[j]) because this will take the log of something very small. + return -np.inf pu = -rank * log(2.) - for i in range(rank): - pu += (gammaln((n_features - i) / 2.) - - log(np.pi) * (n_features - i) / 2.) + for i in range(1, rank + 1): + pu += (gammaln((n_features - i + 1) / 2.) - + log(np.pi) * (n_features - i + 1) / 2.) pl = np.sum(np.log(spectrum[:rank])) pl = -pl * n_samples / 2. - if rank == n_features: - # TODO: this line is never executed because _infer_dimension's - # for loop is off by one - pv = 0 - v = 1 - else: - v = np.sum(spectrum[rank:]) / (n_features - rank) - if spectrum_threshold > v: - return -np.inf - pv = -np.log(v) * n_samples * (n_features - rank) / 2. + v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank)) + pv = -np.log(v) * n_samples * (n_features - rank) / 2. m = n_features * rank - rank * (rank + 1.) / 2. - pp = log(2. * np.pi) * (m + rank + 1.) / 2. + pp = log(2. * np.pi) * (m + rank) / 2. pa = 0. 
spectrum_ = spectrum.copy() spectrum_[rank:n_features] = v for i in range(rank): - if spectrum_[i] < spectrum_threshold: - # TODO: this line is never executed - # (off by one in _infer_dimension) - # this break only happens when rank == n_features and - # spectrum_[i] < spectrum_threshold, otherwise the early return - # above catches this case. - break for j in range(i + 1, len(spectrum)): pa += log((spectrum[i] - spectrum[j]) * (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples) @@ -102,15 +97,15 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): return ll -def _infer_dimension(spectrum, n_samples, n_features): - """Infers the dimension of a dataset of shape (n_samples, n_features) +def _infer_dimension(spectrum, n_samples): + """Infers the dimension of a dataset with a given spectrum. - The dataset is described by its spectrum `spectrum`. + The returned value will be in [1, n_features - 1]. """ - n_spectrum = len(spectrum) - ll = np.empty(n_spectrum) - for rank in range(n_spectrum): - ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features) + ll = np.empty_like(spectrum) + ll[0] = -np.inf # we don't want to return n_components = 0 + for rank in range(1, spectrum.shape[0]): + ll[rank] = _assess_dimension(spectrum, rank, n_samples) return ll.argmax() @@ -324,7 +319,7 @@ class PCA(_BasePCA): >>> print(pca.singular_values_) [6.30061...] """ - + @_deprecate_positional_args def __init__(self, n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): @@ -471,7 +466,7 @@ def _fit_full(self, X, n_components): # Postprocess the number of components required if n_components == 'mle': n_components = \ - _infer_dimension(explained_variance_, n_samples, n_features) + _infer_dimension(explained_variance_, n_samples) elif 0 < n_components < 1.0: # number of components for which the cumulated explained # variance percentage is superior to the desired threshold diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 158bbefc22e92..888d5d79e1e4b 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -8,6 +8,7 @@ from ..utils import check_random_state, check_array from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin from ._dict_learning import dict_learning, dict_learning_online @@ -131,7 +132,8 @@ class SparsePCA(TransformerMixin, BaseEstimator): MiniBatchSparsePCA DictionaryLearning """ - def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-8, method='lars', n_jobs=None, U_init=None, V_init=None, verbose=False, random_state=None, normalize_components='deprecated'): @@ -340,7 +342,8 @@ class MiniBatchSparsePCA(SparsePCA): SparsePCA DictionaryLearning """ - def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', random_state=None, normalize_components='deprecated'): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 940eab56feea8..c0057ad6287a1 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ 
b/sklearn/decomposition/_truncated_svd.py @@ -14,6 +14,9 @@ from ..utils import check_array, check_random_state from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import _deprecate_positional_args +from ..utils.validation import check_is_fitted + __all__ = ["TruncatedSVD"] @@ -116,7 +119,8 @@ class TruncatedSVD(TransformerMixin, BaseEstimator): class to data once, then keep the instance around to do transformations. """ - def __init__(self, n_components=2, algorithm="randomized", n_iter=5, + @_deprecate_positional_args + def __init__(self, n_components=2, *, algorithm="randomized", n_iter=5, random_state=None, tol=0.): self.algorithm = algorithm self.n_components = n_components @@ -208,7 +212,8 @@ def transform(self, X): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse=['csr', 'csc']) + check_is_fitted(self) return safe_sparse_dot(X, self.components_.T) def inverse_transform(self, X): diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index a08ae0cb7a43a..a7a9547bfa33a 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -7,6 +7,7 @@ from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles +from sklearn.datasets import make_blobs from sklearn.linear_model import Perceptron from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV @@ -282,3 +283,15 @@ def test_kernel_conditioning(): # check that the small non-zero eigenvalue was correctly set to zero assert kpca.lambdas_.min() == 0 assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_)) + + +@pytest.mark.parametrize("kernel", + ["linear", "poly", "rbf", "sigmoid", "cosine"]) +def test_kernel_pca_inverse_transform(kernel): + X, *_ = make_blobs(n_samples=100, n_features=4, centers=[[1, 1, 1, 1]], + random_state=0) + + kp = KernelPCA(n_components=2, kernel=kernel, fit_inverse_transform=True) + X_trans = kp.fit_transform(X) + X_inv = kp.inverse_transform(X_trans) + assert_allclose(X, X_inv) diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index c81a0136177dc..a7ef1243d8e25 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -19,6 +19,15 @@ from sklearn.exceptions import ConvergenceWarning +@pytest.mark.parametrize('solver', ['cd', 'mu']) +def test_convergence_warning(solver): + convergence_warning = ("Maximum number of iterations 1 reached. 
" + "Increase it to improve convergence.") + A = np.ones((2, 2)) + with pytest.warns(ConvergenceWarning, match=convergence_warning): + NMF(solver=solver, max_iter=1).fit(A) + + def test_initialize_nn_output(): # Test that initialization does not return negative values rng = np.random.mtrand.RandomState(42) @@ -54,7 +63,7 @@ def test_parameter_checking(): msg = ("init = '{}' can only be used when " "n_components <= min(n_samples, n_features)" .format(init)) - assert_raise_message(ValueError, msg, NMF(3, init).fit, A) + assert_raise_message(ValueError, msg, NMF(3, init=init).fit, A) assert_raise_message(ValueError, msg, nmf._initialize_nmf, A, 3, init) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 438478a55f6fa..0123e169ce9c0 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -295,7 +295,7 @@ def test_n_components_mle(svd_solver): X = rng.randn(n_samples, n_features) pca = PCA(n_components='mle', svd_solver=svd_solver) pca.fit(X) - assert pca.n_components_ == 0 + assert pca.n_components_ == 1 @pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) @@ -333,7 +333,7 @@ def test_infer_dim_1(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)]) + ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)]) assert ll[1] > ll.max() - .01 * n @@ -348,7 +348,7 @@ def test_infer_dim_2(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension(spect, n, p) > 1 + assert _infer_dimension(spect, n) > 1 def test_infer_dim_3(): @@ -361,7 +361,7 @@ def test_infer_dim_3(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension(spect, n, p) > 2 + assert _infer_dimension(spect, n) > 2 @pytest.mark.parametrize( @@ -570,51 +570,43 @@ def test_pca_n_components_mostly_explained_variance_ratio(): assert pca2.n_components_ == X.shape[1] -def test_infer_dim_bad_spec(): - # Test a spectrum that drops to near zero for PR #16224 +def test_assess_dimension_bad_rank(): + # Test error when tested rank not in [1, n_features - 1] spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 - n_features = 5 - ret = _infer_dimension(spectrum, n_samples, n_features) - assert ret == 0 + for rank in (0, 5): + with pytest.raises(ValueError, + match=r"should be in \[1, n_features - 1\]"): + _assess_dimension(spectrum, rank, n_samples) -def test_assess_dimension_error_rank_greater_than_features(): - # Test error when tested rank is greater than the number of features - # for PR #16224 +def test_small_eigenvalues_mle(): + # Test rank associated with tiny eigenvalues are given a log-likelihood of + # -inf. 
The inferred rank will be 1 spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) - n_samples = 10 - n_features = 4 - rank = 5 - with pytest.raises(ValueError, match="The tested rank cannot exceed " - "the rank of the dataset"): - _assess_dimension(spectrum, rank, n_samples, n_features) + assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf -def test_assess_dimension_small_eigenvalues(): - # Test tiny eigenvalues appropriately when using 'mle' - # for PR #16224 - spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) - n_samples = 10 - n_features = 5 - rank = 3 - ret = _assess_dimension(spectrum, rank, n_samples, n_features) - assert ret == -np.inf + for rank in (2, 3): + assert _assess_dimension(spectrum, rank, 10) == -np.inf + + assert _infer_dimension(spectrum, 10) == 1 -def test_infer_dim_mle(): - # Test small eigenvalues when 'mle' with pathological 'X' dataset - # for PR #16224 - X, _ = datasets.make_classification(n_informative=1, n_repeated=18, +def test_mle_redundant_data(): + # Test 'mle' with pathological X: only one relevant feature should give a + # rank of 1 + X, _ = datasets.make_classification(n_features=20, + n_informative=1, n_repeated=18, n_redundant=1, n_clusters_per_class=1, random_state=42) pca = PCA(n_components='mle').fit(X) - assert pca.n_components_ == 0 + assert pca.n_components_ == 1 def test_fit_mle_too_few_samples(): # Tests that an error is raised when the number of samples is smaller - # than the number of features during an mle fit for PR #16224 + # than the number of features during an mle fit X, _ = datasets.make_classification(n_samples=20, n_features=21, random_state=42) @@ -623,3 +615,26 @@ def test_fit_mle_too_few_samples(): "supported if " "n_samples >= n_features"): pca.fit(X) + + +def test_mle_simple_case(): + # non-regression test for issue + # https://github.com/scikit-learn/scikit-learn/issues/16730 + n_samples, n_dim = 1000, 10 + X = np.random.RandomState(0).randn(n_samples, n_dim) + X[:, -1] = np.mean(X[:, :-1], axis=-1) # true X dim is ndim - 1 + pca_skl = PCA('mle', svd_solver='full') + pca_skl.fit(X) + assert pca_skl.n_components_ == n_dim - 1 + + +def test_assess_dimesion_rank_one(): + # Make sure assess_dimension works properly on a matrix of rank 1 + n_samples, n_features = 9, 6 + X = np.ones((n_samples, n_features)) # rank 1 matrix + _, s, _ = np.linalg.svd(X, full_matrices=True) + assert sum(s[1:]) == 0 # except for rank 1, all eigenvalues are 0 + + assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples)) + for rank in range(2, n_features): + assert _assess_dimension(s, rank, n_samples) == -np.inf diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 2bd3948f2e013..45e623904b9ea 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -18,7 +18,7 @@ from .linear_model._base import LinearClassifierMixin from .covariance import ledoit_wolf, empirical_covariance, shrunk_covariance from .utils.multiclass import unique_labels -from .utils import check_array, check_X_y +from .utils import check_array from .utils.validation import check_is_fitted from .utils.multiclass import check_classification_targets from .utils.extmath import softmax diff --git a/sklearn/dummy.py b/sklearn/dummy.py index daa2c1ff0da11..634943231860f 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -395,7 +395,8 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - @deprecated( + # mypy error: Decorated property not supported 
+ @deprecated( # type: ignore "The outputs_2d_ attribute is deprecated in version 0.22 " "and will be removed in version 0.24. It is equivalent to " "n_outputs_ > 1." @@ -622,7 +623,8 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The outputs_2d_ attribute is deprecated in version 0.22 " "and will be removed in version 0.24. It is equivalent to " "n_outputs_ > 1." diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index 27acb2fbcf00a..ae86349ad9af0 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -2,6 +2,7 @@ The :mod:`sklearn.ensemble` module includes ensemble-based methods for classification, regression and anomaly detection. """ +import typing from ._base import BaseEnsemble from ._forest import RandomForestClassifier @@ -21,6 +22,12 @@ from ._stacking import StackingClassifier from ._stacking import StackingRegressor +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._hist_gradient_boosting.gradient_boosting import ( # noqa + HistGradientBoostingRegressor, HistGradientBoostingClassifier + ) __all__ = ["BaseEnsemble", "RandomForestClassifier", "RandomForestRegressor", diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index d73f38954d21a..162979373602b 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -16,13 +16,13 @@ from ..base import ClassifierMixin, RegressorMixin from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, check_X_y, check_array, column_or_1d +from ..utils import check_random_state, check_array, column_or_1d from ..utils import indices_to_mask from ..utils.metaestimators import if_delegate_has_method from ..utils.multiclass import check_classification_targets from ..utils.random import sample_without_replacement from ..utils.validation import has_fit_parameter, check_is_fitted, \ - _check_sample_weight + _check_sample_weight, _deprecate_positional_args __all__ = ["BaggingClassifier", @@ -193,7 +193,7 @@ class BaseBagging(BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator=None, - n_estimators=10, + n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, @@ -577,9 +577,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. """ + @_deprecate_positional_args def __init__(self, base_estimator=None, - n_estimators=10, + n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, @@ -975,10 +976,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. 
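The `typing.TYPE_CHECKING` guard used above is a standard trick: the block is evaluated by static type checkers such as mypy but skipped at runtime, so a name can be imported purely for annotations without triggering import-time side effects or guards. A generic, self-contained sketch of the pattern (the `decimal` import stands in for any module one prefers not to import eagerly)::

    import typing

    if typing.TYPE_CHECKING:
        # Evaluated only by static type checkers such as mypy; skipped at
        # runtime, so it cannot cause circular or costly imports.
        from decimal import Decimal


    def as_decimal(value: str) -> "Decimal":
        from decimal import Decimal  # the runtime import happens lazily here
        return Decimal(value)


    print(as_decimal("1.5"))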
""" - + @_deprecate_positional_args def __init__(self, base_estimator=None, - n_estimators=10, + n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 23db107874c9b..a91f28b0710b2 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -6,6 +6,7 @@ from abc import ABCMeta, abstractmethod import numbers import warnings +from typing import List import numpy as np @@ -53,10 +54,10 @@ def _set_random_states(estimator, random_state=None): parameters. random_state : int or RandomState, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pseudo-random number generator to control the generation of the random + integers. Pass an int for reproducible output across multiple function + calls. + See :term:`Glossary `. Notes ----- @@ -106,10 +107,10 @@ class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): """ # overwrite _required_parameters from MetaEstimatorMixin - _required_parameters = [] + _required_parameters: List[str] = [] @abstractmethod - def __init__(self, base_estimator, n_estimators=10, + def __init__(self, base_estimator, *, n_estimators=10, estimator_params=tuple()): # Set parameters self.base_estimator = base_estimator diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d6784b10f05d3..40a1c2434316c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -62,6 +62,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils.fixes import _joblib_parallel_args from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args __all__ = ["RandomForestClassifier", @@ -180,7 +181,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator, - n_estimators=100, + n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, @@ -480,7 +481,7 @@ class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator, - n_estimators=100, + n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, @@ -735,7 +736,7 @@ class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator, - n_estimators=100, + n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, @@ -884,9 +885,9 @@ class RandomForestClassifier(ForestClassifier): A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. - The sub-sample size is always the same as the original - input sample size but the samples are drawn with replacement if - `bootstrap=True` (default). + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. Read more in the :ref:`User Guide `. @@ -1110,19 +1111,9 @@ class labels (multi-output problem). `oob_decision_function_` might contain NaN. This attribute exists only when ``oob_score`` is True. 
- Examples + See Also -------- - >>> from sklearn.ensemble import RandomForestClassifier - >>> from sklearn.datasets import make_classification - - >>> X, y = make_classification(n_samples=1000, n_features=4, - ... n_informative=2, n_redundant=0, - ... random_state=0, shuffle=False) - >>> clf = RandomForestClassifier(max_depth=2, random_state=0) - >>> clf.fit(X, y) - RandomForestClassifier(max_depth=2, random_state=0) - >>> print(clf.predict([[0, 0, 0, 0]])) - [1] + DecisionTreeClassifier, ExtraTreesClassifier Notes ----- @@ -1141,15 +1132,24 @@ class labels (multi-output problem). References ---------- - .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. - See Also + Examples -------- - DecisionTreeClassifier, ExtraTreesClassifier + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=1000, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = RandomForestClassifier(max_depth=2, random_state=0) + >>> clf.fit(X, y) + RandomForestClassifier(...) + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2, @@ -1204,9 +1204,9 @@ class RandomForestRegressor(ForestRegressor): A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. - The sub-sample size is always the same as the original - input sample size but the samples are drawn with replacement if - `bootstrap=True` (default). + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. Read more in the :ref:`User Guide `. @@ -1396,18 +1396,9 @@ class RandomForestRegressor(ForestRegressor): Prediction computed with out-of-bag estimate on the training set. This attribute exists only when ``oob_score`` is True. - Examples + See Also -------- - >>> from sklearn.ensemble import RandomForestRegressor - >>> from sklearn.datasets import make_regression - - >>> X, y = make_regression(n_features=4, n_informative=2, - ... random_state=0, shuffle=False) - >>> regr = RandomForestRegressor(max_depth=2, random_state=0) - >>> regr.fit(X, y) - RandomForestRegressor(max_depth=2, random_state=0) - >>> print(regr.predict([[0, 0, 0, 0]])) - [-8.32987858] + DecisionTreeRegressor, ExtraTreesRegressor Notes ----- @@ -1430,18 +1421,26 @@ class RandomForestRegressor(ForestRegressor): References ---------- - .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. .. [2] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", Machine Learning, 63(1), 3-42, 2006. - See Also + Examples -------- - DecisionTreeRegressor, ExtraTreesRegressor + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, n_informative=2, + ... random_state=0, shuffle=False) + >>> regr = RandomForestRegressor(max_depth=2, random_state=0) + >>> regr.fit(X, y) + RandomForestRegressor(...) 
+ >>> print(regr.predict([[0, 0, 0, 0]])) + [-8.32987858] """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="mse", max_depth=None, min_samples_split=2, @@ -1720,6 +1719,12 @@ class labels (multi-output problem). `oob_decision_function_` might contain NaN. This attribute exists only when ``oob_score`` is True. + See Also + -------- + sklearn.tree.ExtraTreeClassifier : Base classifier for this ensemble. + RandomForestClassifier : Ensemble Classifier based on trees with optimal + splits. + Notes ----- The default values for the parameters controlling the size of the trees @@ -1728,6 +1733,11 @@ class labels (multi-output problem). reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized + trees", Machine Learning, 63(1), 3-42, 2006. + Examples -------- >>> from sklearn.ensemble import ExtraTreesClassifier @@ -1738,21 +1748,10 @@ class labels (multi-output problem). ExtraTreesClassifier(random_state=0) >>> clf.predict([[0, 0, 0, 0]]) array([1]) - - References - ---------- - - .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized - trees", Machine Learning, 63(1), 3-42, 2006. - - See Also - -------- - sklearn.tree.ExtraTreeClassifier : Base classifier for this ensemble. - RandomForestClassifier : Ensemble Classifier based on trees with optimal - splits. """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2, @@ -2000,6 +1999,11 @@ class ExtraTreesRegressor(ForestRegressor): Prediction computed with out-of-bag estimate on the training set. This attribute exists only when ``oob_score`` is True. + See Also + -------- + sklearn.tree.ExtraTreeRegressor: Base estimator for this ensemble. + RandomForestRegressor: Ensemble regressor using trees with optimal splits. + Notes ----- The default values for the parameters controlling the size of the trees @@ -2010,17 +2014,25 @@ class ExtraTreesRegressor(ForestRegressor): References ---------- - .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", Machine Learning, 63(1), 3-42, 2006. - See Also + Examples -------- - sklearn.tree.ExtraTreeRegressor: Base estimator for this ensemble. - RandomForestRegressor: Ensemble regressor using trees with optimal splits. + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.ensemble import ExtraTreesRegressor + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit( + ... X_train, y_train) + >>> reg.score(X_test, y_test) + 0.2708... """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="mse", max_depth=None, min_samples_split=2, @@ -2189,6 +2201,14 @@ class RandomTreesEmbedding(BaseForest): estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", + Machine Learning, 63(1), 3-42, 2006. + .. [2] Moosmann, F. and Triggs, B. and Jurie, F. 
"Fast discriminative + visual codebooks using randomized clustering forests" + NIPS 2007 + Examples -------- >>> from sklearn.ensemble import RandomTreesEmbedding @@ -2202,22 +2222,14 @@ class RandomTreesEmbedding(BaseForest): [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.], [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.], [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]]) - - References - ---------- - .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", - Machine Learning, 63(1), 3-42, 2006. - .. [2] Moosmann, F. and Triggs, B. and Jurie, F. "Fast discriminative - visual codebooks using randomized clustering forests" - NIPS 2007 - """ criterion = 'mse' max_features = 1 + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, max_depth=5, min_samples_split=2, min_samples_leaf=1, diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index c9f0b69f57968..32e534fdc8517 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -40,25 +40,20 @@ from scipy.sparse import csc_matrix from scipy.sparse import csr_matrix from scipy.sparse import issparse -from scipy.special import expit from time import time from ..model_selection import train_test_split from ..tree import DecisionTreeRegressor from ..tree._tree import DTYPE, DOUBLE -from ..tree._tree import TREE_LEAF from . import _gb_losses from ..utils import check_random_state from ..utils import check_array from ..utils import column_or_1d -from ..utils import check_consistent_length -from ..utils import deprecated -from ..utils.fixes import logsumexp -from ..utils.stats import _weighted_percentile from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.multiclass import check_classification_targets from ..exceptions import NotFittedError +from ..utils.validation import _deprecate_positional_args class VerboseReporter: @@ -71,7 +66,6 @@ class VerboseReporter: (when iteration mod verbose_mod is zero).; if larger than 1 then output is printed for each update. """ - def __init__(self, verbose): self.verbose = verbose @@ -140,7 +134,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): """Abstract base class for Gradient Boosting. 
""" @abstractmethod - def __init__(self, loss, learning_rate, n_estimators, criterion, + def __init__(self, *, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, min_impurity_split, init, subsample, max_features, ccp_alpha, @@ -532,7 +526,7 @@ def _fit_stages(self, X, y, raw_predictions, sample_weight, random_state, loss_ = self.loss_ if self.verbose: - verbose_reporter = VerboseReporter(self.verbose) + verbose_reporter = VerboseReporter(verbose=self.verbose) verbose_reporter.init(self, begin_at_stage) X_csc = csc_matrix(X) if issparse(X) else None @@ -1073,7 +1067,8 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ('deviance', 'exponential') - def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, + @_deprecate_positional_args + def __init__(self, *, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., @@ -1571,7 +1566,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ('ls', 'lad', 'huber', 'quantile') - def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, + @_deprecate_positional_args + def __init__(self, *, loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index f400144abc4fc..7bd5faca1d7d9 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -6,10 +6,9 @@ from abc import abstractmethod import numpy as np -from scipy.special import expit +from scipy.special import expit, logsumexp from ..tree._tree import TREE_LEAF -from ..utils.fixes import logsumexp from ..utils.stats import _weighted_percentile from ..dummy import DummyClassifier from ..dummy import DummyRegressor diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 83c338d89633e..84a76dd252064 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -49,7 +49,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): """ rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: - subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) + subset = rng.choice(data.shape[0], subsample, replace=False) data = data.take(subset, axis=0) binning_thresholds = [] diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index fa78f2024aa5c..60399c2fbdd70 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -30,3 +30,9 @@ cdef packed struct node_struct: unsigned int depth unsigned char is_leaf X_BINNED_DTYPE_C bin_threshold + + +cpdef enum MonotonicConstraint: + NO_CST = 0 + POS = 1 + NEG = -1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e18d3ac4b1f9b..796f4f060dda5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ 
b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -8,9 +8,11 @@ from timeit import default_timer as time from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) -from ...utils import check_X_y, check_random_state, check_array, resample +from ...utils import check_random_state, check_array, resample from ...utils.validation import (check_is_fitted, - check_consistent_length, _check_sample_weight) + check_consistent_length, + _check_sample_weight, + _deprecate_positional_args) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -27,10 +29,11 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): """Base class for histogram-based gradient boosting estimators.""" @abstractmethod - def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, + def __init__(self, loss, *, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, - warm_start, early_stopping, scoring, validation_fraction, - n_iter_no_change, tol, verbose, random_state): + monotonic_cst, warm_start, early_stopping, scoring, + validation_fraction, n_iter_no_change, tol, verbose, + random_state): self.loss = loss self.learning_rate = learning_rate self.max_iter = max_iter @@ -39,6 +42,7 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, self.min_samples_leaf = min_samples_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins + self.monotonic_cst = monotonic_cst self.warm_start = warm_start self.early_stopping = early_stopping self.scoring = scoring @@ -82,6 +86,12 @@ def _validate_parameters(self): raise ValueError('max_bins={} should be no smaller than 2 ' 'and no larger than 255.'.format(self.max_bins)) + if self.monotonic_cst is not None and self.n_trees_per_iteration_ != 1: + raise ValueError( + 'monotonic constraints are not supported for ' + 'multiclass classification.' + ) + def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. @@ -217,14 +227,6 @@ def fit(self, X, y, sample_weight=None): ) raw_predictions += self._baseline_prediction - # initialize gradients and hessians (empty arrays). - # shape = (n_trees_per_iteration, n_samples). - gradients, hessians = self.loss_.init_gradients_and_hessians( - n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_, - sample_weight=sample_weight_train - ) - # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) self._predictors = predictors = [] @@ -304,6 +306,8 @@ def fit(self, X, y, sample_weight=None): raw_predictions = self._raw_predict(X_binned_train) if self.do_early_stopping_ and self._use_validation_data: raw_predictions_val = self._raw_predict(X_binned_val) + else: + raw_predictions_val = None if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set @@ -313,13 +317,6 @@ def fit(self, X, y, sample_weight=None): X_binned_train, y_train, sample_weight_train, self._random_seed) - # Initialize the gradients and hessians - gradients, hessians = self.loss_.init_gradients_and_hessians( - n_samples=n_samples, - sample_weight=sample_weight_train, - prediction_dim=self.n_trees_per_iteration_ - ) - # Get the predictors from the previous fit predictors = self._predictors @@ -350,12 +347,12 @@ def fit(self, X, y, sample_weight=None): # Build `n_trees_per_iteration` trees. 
for k in range(self.n_trees_per_iteration_): - grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], n_bins=n_bins, n_bins_non_missing=self.bin_mapper_.n_bins_non_missing_, has_missing_values=has_missing_values, + monotonic_cst=self.monotonic_cst, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, @@ -788,6 +785,11 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonic constraint to enforce on each feature. -1, 1 + and 0 respectively correspond to a positive constraint, negative + constraint and no constraint. Read more in the :ref:`User Guide + `. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -853,28 +855,31 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): >>> # To use this experimental feature, we need to explicitly ask for it: >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingRegressor - >>> from sklearn.datasets import load_boston - >>> X, y = load_boston(return_X_y=True) + >>> from sklearn.datasets import load_diabetes + >>> X, y = load_diabetes(return_X_y=True) >>> est = HistGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) - 0.98... + 0.92... """ _VALID_LOSSES = ('least_squares', 'least_absolute_deviation') - def __init__(self, loss='least_squares', learning_rate=0.1, + @_deprecate_positional_args + def __init__(self, loss='least_squares', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, - warm_start=False, early_stopping='auto', scoring='loss', - validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, + monotonic_cst=None, warm_start=False, early_stopping='auto', + scoring='loss', validation_fraction=0.1, + n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, early_stopping=early_stopping, - scoring=scoring, validation_fraction=validation_fraction, + monotonic_cst=monotonic_cst, early_stopping=early_stopping, + warm_start=warm_start, scoring=scoring, + validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) @@ -976,6 +981,11 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonic constraint to enforce on each feature. -1, 1 + and 0 respectively correspond to a positive constraint, negative + constraint and no constraint. Read more in the :ref:`User Guide + `. 
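A minimal usage sketch of the new `monotonic_cst` parameter: one entry per feature, with 0 leaving a feature unconstrained. The data and constraint choices below are illustrative only::

    import numpy as np
    # Explicit opt-in is still required while these estimators are experimental.
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(500, 3))
    y = 5 * X[:, 0] - 3 * X[:, 1] + rng.normal(scale=0.1, size=500)

    # Enforce monotonic relationships with opposite signs on the first two
    # features (see the parameter description above for the sign convention)
    # and leave the third feature unconstrained.
    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0])
    gbdt.fit(X, y)
    print(gbdt.score(X, y))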
warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -1054,10 +1064,11 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto') - def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, + @_deprecate_positional_args + def __init__(self, loss='auto', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=255, warm_start=False, - early_stopping='auto', scoring='loss', + l2_regularization=0., max_bins=255, monotonic_cst=None, + warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( @@ -1065,8 +1076,9 @@ def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, early_stopping=early_stopping, - scoring=scoring, validation_fraction=validation_fraction, + monotonic_cst=monotonic_cst, warm_start=warm_start, + early_stopping=early_stopping, scoring=scoring, + validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index bbee8f6c4585c..e0b54550d3082 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -17,6 +17,7 @@ from .utils import sum_parallel from .common import PREDICTOR_RECORD_DTYPE from .common import Y_DTYPE +from .common import MonotonicConstraint EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors @@ -71,7 +72,6 @@ class TreeNode: split_info = None left_child = None right_child = None - value = None histograms = None sibling = None parent = None @@ -88,13 +88,25 @@ class TreeNode: partition_stop = 0 def __init__(self, depth, sample_indices, sum_gradients, - sum_hessians, parent=None): + sum_hessians, parent=None, value=None): self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0] self.sum_gradients = sum_gradients self.sum_hessians = sum_hessians self.parent = parent + self.value = value + self.is_leaf = False + self.set_children_bounds(float('-inf'), float('+inf')) + + def set_children_bounds(self, lower, upper): + """Set children values bounds to respect monotonic constraints.""" + + # These are bounds for the node's *children* values, not the node's + # value. The bounds are used in the splitter when considering potential + # left and right child. + self.children_lower_bound = lower + self.children_upper_bound = upper def __lt__(self, other_node): """Comparison for priority queue. 
@@ -167,8 +179,8 @@ class TreeGrower: def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., n_bins=256, n_bins_non_missing=None, has_missing_values=False, - l2_regularization=0., min_hessian_to_split=1e-3, - shrinkage=1.): + monotonic_cst=None, l2_regularization=0., + min_hessian_to_split=1e-3, shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, @@ -189,17 +201,42 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, has_missing_values = [has_missing_values] * X_binned.shape[1] has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) + if monotonic_cst is None: + self.with_monotonic_cst = False + monotonic_cst = np.full(shape=X_binned.shape[1], + fill_value=MonotonicConstraint.NO_CST, + dtype=np.int8) + else: + self.with_monotonic_cst = True + monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8) + + if monotonic_cst.shape[0] != X_binned.shape[1]: + raise ValueError( + "monotonic_cst has shape {} but the input data " + "X has {} features.".format( + monotonic_cst.shape[0], X_binned.shape[1] + ) + ) + if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1): + raise ValueError( + "monotonic_cst must be None or an array-like of " + "-1, 0 or 1." + ) + hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( X_binned, n_bins, gradients, hessians, hessians_are_constant) missing_values_bin_idx = n_bins - 1 self.splitter = Splitter( X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, min_hessian_to_split, + has_missing_values, monotonic_cst, + l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) self.n_bins_non_missing = n_bins_non_missing self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values + self.monotonic_cst = monotonic_cst + self.l2_regularization = l2_regularization self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf @@ -252,6 +289,20 @@ def grow(self): while self.splittable_nodes: self.split_next() + self._apply_shrinkage() + + def _apply_shrinkage(self): + """Multiply leaves values by shrinkage parameter. + + This must be done at the very end of the growing process. If this were + done during the growing process e.g. in finalize_leaf(), then a leaf + would be shrunk but its sibling would potentially not be (if it's a + non-leaf), which would lead to a wrong computation of the 'middle' + value needed to enforce the monotonic constraints. 
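+ + For instance (with purely illustrative numbers): with shrinkage=0.1, a + finalized leaf with value 2 and a not-yet-finalized sibling with value 4 + give the correct middle of (2 + 4) / 2 = 3, whereas shrinking the leaf + early would give (0.2 + 4) / 2 = 2.1, i.e. a wrong bound for the sibling's + children.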
+ """ + for leaf in self.finalized_leaves: + leaf.value *= self.shrinkage + def _intilialize_root(self, gradients, hessians, hessians_are_constant): """Initialize root node and finalize it if needed.""" n_samples = self.X_binned.shape[0] @@ -265,7 +316,8 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): depth=depth, sample_indices=self.splitter.partition, sum_gradients=sum_gradients, - sum_hessians=sum_hessians + sum_hessians=sum_hessians, + value=0 ) self.root.partition_start = 0 @@ -294,7 +346,8 @@ def _compute_best_split_and_push(self, node): node.split_info = self.splitter.find_node_split( node.n_samples, node.histograms, node.sum_gradients, - node.sum_hessians) + node.sum_hessians, node.value, node.children_lower_bound, + node.children_upper_bound) if node.split_info.gain <= 0: # no valid split self._finalize_leaf(node) @@ -329,12 +382,17 @@ def split_next(self): sample_indices_left, node.split_info.sum_gradient_left, node.split_info.sum_hessian_left, - parent=node) + parent=node, + value=node.split_info.value_left, + ) right_child_node = TreeNode(depth, sample_indices_right, node.split_info.sum_gradient_right, node.split_info.sum_hessian_right, - parent=node) + parent=node, + value=node.split_info.value_right, + ) + left_child_node.sibling = right_child_node right_child_node.sibling = left_child_node node.right_child = right_child_node @@ -372,10 +430,29 @@ def split_next(self): if right_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(right_child_node) - # Compute histograms of childs, and compute their best possible split + if self.with_monotonic_cst: + # Set value bounds for respecting monotonic constraints + # See test_nodes_values() for details + if (self.monotonic_cst[node.split_info.feature_idx] == + MonotonicConstraint.NO_CST): + lower_left = lower_right = node.children_lower_bound + upper_left = upper_right = node.children_upper_bound + else: + mid = (left_child_node.value + right_child_node.value) / 2 + if (self.monotonic_cst[node.split_info.feature_idx] == + MonotonicConstraint.POS): + lower_left, upper_left = node.children_lower_bound, mid + lower_right, upper_right = mid, node.children_upper_bound + else: # NEG + lower_left, upper_left = mid, node.children_upper_bound + lower_right, upper_right = node.children_lower_bound, mid + left_child_node.set_children_bounds(lower_left, upper_left) + right_child_node.set_children_bounds(lower_right, upper_right) + + # Compute histograms of children, and compute their best possible split # (if needed) - should_split_left = left_child_node.value is None # node isn't a leaf - should_split_right = right_child_node.value is None + should_split_left = not left_child_node.is_leaf + should_split_right = not right_child_node.is_leaf if should_split_left or should_split_right: # We will compute the histograms of both nodes even if one of them @@ -412,17 +489,9 @@ def split_next(self): return left_child_node, right_child_node def _finalize_leaf(self, node): - """Compute the prediction value that minimizes the objective function. + """Make node a leaf of the tree being grown.""" - This sets the node.value attribute (node is a leaf iff node.value is - not None). - - See Equation 5 of: - XGBoost: A Scalable Tree Boosting System, T. Chen, C. 
Guestrin, 2016 - https://arxiv.org/abs/1603.02754 - """ - node.value = -self.shrinkage * node.sum_gradients / ( - node.sum_hessians + self.splitter.l2_regularization + EPS) + node.is_leaf = True self.finalized_leaves.append(node) def _finalize_splittable_nodes(self): @@ -464,10 +533,11 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, else: node['gain'] = -1 - if grower_node.value is not None: + node['value'] = grower_node.value + + if grower_node.is_leaf: # Leaf node node['is_leaf'] = True - node['value'] = grower_node.value return next_free_idx + 1 else: # Decision node diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 2dbf8bd58773e..c7884a25a9c41 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -9,11 +9,7 @@ from abc import ABC, abstractmethod import numpy as np -from scipy.special import expit -try: # logsumexp was moved from mist to special in 0.19 - from scipy.special import logsumexp -except ImportError: - from scipy.misc import logsumexp +from scipy.special import expit, logsumexp from .common import Y_DTYPE from .common import G_H_DTYPE diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 0e74d6ba38c71..43405551ef357 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -19,11 +19,13 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: from openmp cimport omp_get_max_threads from libc.stdlib cimport malloc, free from libc.string cimport memcpy +from numpy.math cimport INFINITY from .common cimport X_BINNED_DTYPE_C from .common cimport Y_DTYPE_C from .common cimport hist_struct from .common import HISTOGRAM_DTYPE +from .common cimport MonotonicConstraint cdef struct split_info_struct: @@ -39,6 +41,8 @@ cdef struct split_info_struct: Y_DTYPE_C sum_hessian_right unsigned int n_samples_left unsigned int n_samples_right + Y_DTYPE_C value_left + Y_DTYPE_C value_right class SplitInfo: @@ -70,7 +74,7 @@ class SplitInfo: def __init__(self, gain, feature_idx, bin_idx, missing_go_to_left, sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, n_samples_left, - n_samples_right): + n_samples_right, value_left, value_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx @@ -81,6 +85,8 @@ class SplitInfo: self.sum_hessian_right = sum_hessian_right self.n_samples_left = n_samples_left self.n_samples_right = n_samples_right + self.value_left = value_left + self.value_right = value_right @cython.final @@ -126,6 +132,7 @@ cdef class Splitter: const unsigned int [::1] n_bins_non_missing unsigned char missing_values_bin_idx const unsigned char [::1] has_missing_values + const char [::1] monotonic_cst unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -141,6 +148,7 @@ cdef class Splitter: const unsigned int [::1] n_bins_non_missing, const unsigned char missing_values_bin_idx, const unsigned char [::1] has_missing_values, + const char [::1] monotonic_cst, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, @@ -152,6 +160,7 @@ cdef class Splitter: self.n_bins_non_missing = n_bins_non_missing self.missing_values_bin_idx = missing_values_bin_idx self.has_missing_values = has_missing_values + self.monotonic_cst = monotonic_cst self.l2_regularization = l2_regularization 
self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -350,7 +359,11 @@ cdef class Splitter: unsigned int n_samples, hist_struct [:, ::1] histograms, # IN const Y_DTYPE_C sum_gradients, - const Y_DTYPE_C sum_hessians): + const Y_DTYPE_C sum_hessians, + const Y_DTYPE_C value, + const Y_DTYPE_C lower_bound=-INFINITY, + const Y_DTYPE_C upper_bound=INFINITY, + ): """For each feature, find the best bin to split on at a given node. Return the best split info among all features. @@ -366,6 +379,22 @@ cdef class Splitter: The sum of the gradients for each sample at the node. sum_hessians : float The sum of the hessians for each sample at the node. + value : float + The bounded value of the current node. We directly pass the value + instead of re-computing it from sum_gradients and sum_hessians, + because we need to compute the loss and the gain based on the + *bounded* value: computing the value from + sum_gradients / sum_hessians would give the unbounded value, and + the interaction with min_gain_to_split would not be correct + anymore. Side note: we can't use the lower_bound / upper_bound + parameters either because these refer to the bounds of the + children, not the bounds of the current node. + lower_bound : float + Lower bound for the children values for respecting the monotonic + constraints. + upper_bound : float + Upper bound for the children values for respecting the monotonic + constraints. Returns ------- @@ -378,7 +407,8 @@ cdef class Splitter: int n_features = self.n_features split_info_struct split_info split_info_struct * split_infos - const unsigned char [:] has_missing_values = self.has_missing_values + const unsigned char [::1] has_missing_values = self.has_missing_values + const char [::1] monotonic_cst = self.monotonic_cst with nogil: @@ -386,6 +416,8 @@ cdef class Splitter: self.n_features * sizeof(split_info_struct)) for feature_idx in prange(n_features, schedule='static'): + split_infos[feature_idx].feature_idx = feature_idx + # For each feature, find best bin to split on # Start with a gain of -1 (if no better split is found, that # means one of the constraints isn't respected @@ -404,7 +436,8 @@ cdef class Splitter: self._find_best_bin_to_split_left_to_right( feature_idx, has_missing_values[feature_idx], histograms, n_samples, sum_gradients, sum_hessians, - &split_infos[feature_idx]) + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[feature_idx]) if has_missing_values[feature_idx]: # We need to explore both directions to check whether @@ -412,7 +445,9 @@ cdef class Splitter: # gain self._find_best_bin_to_split_right_to_left( feature_idx, histograms, n_samples, - sum_gradients, sum_hessians, &split_infos[feature_idx]) + sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[feature_idx]) # then compute best possible split among all features best_feature_idx = self._find_best_feature_to_split_helper( @@ -430,6 +465,8 @@ cdef class Splitter: split_info.sum_hessian_right, split_info.n_samples_left, split_info.n_samples_right, + split_info.value_left, + split_info.value_right, ) free(split_infos) return out @@ -456,6 +493,10 @@ cdef class Splitter: unsigned int n_samples, Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, split_info_struct * split_info) nogil: # OUT """Find best bin to split on for a given feature. 
@@ -481,15 +522,20 @@ cdef class Splitter: Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right - Y_DTYPE_C negative_loss_current_node + Y_DTYPE_C loss_current_node Y_DTYPE_C gain + unsigned char found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = -1 sum_gradient_left, sum_hessian_left = 0., 0. n_samples_left = 0 - negative_loss_current_node = negative_loss(sum_gradients, - sum_hessians, - self.l2_regularization) + loss_current_node = _loss_from_value(value, sum_gradients) for bin_idx in range(end): n_samples_left += histograms[feature_idx, bin_idx].count @@ -519,21 +565,40 @@ cdef class Splitter: gain = _split_gain(sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, - negative_loss_current_node, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, self.l2_regularization) - if gain > split_info.gain and gain > self.min_gain_to_split: - split_info.gain = gain - split_info.feature_idx = feature_idx - split_info.bin_idx = bin_idx - # we scan from left to right so missing values go to the right - split_info.missing_go_to_left = False - split_info.sum_gradient_left = sum_gradient_left - split_info.sum_gradient_right = sum_gradient_right - split_info.sum_hessian_left = sum_hessian_left - split_info.sum_hessian_right = sum_hessian_right - split_info.n_samples_left = n_samples_left - split_info.n_samples_right = n_samples_right + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from left to right so missing values go to the right + split_info.missing_go_to_left = False + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) cdef void _find_best_bin_to_split_right_to_left( self, @@ -542,6 +607,10 @@ cdef class Splitter: unsigned int n_samples, Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, split_info_struct * split_info) nogil: # OUT """Find best bin to split on for a given feature. 
@@ -565,15 +634,21 @@ cdef class Splitter: Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right - Y_DTYPE_C negative_loss_current_node + Y_DTYPE_C loss_current_node Y_DTYPE_C gain unsigned int start = self.n_bins_non_missing[feature_idx] - 2 + unsigned char found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = split_info.gain # computed during previous scan sum_gradient_right, sum_hessian_right = 0., 0. n_samples_right = 0 - negative_loss_current_node = negative_loss(sum_gradients, - sum_hessians, - self.l2_regularization) + + loss_current_node = _loss_from_value(value, sum_gradients) for bin_idx in range(start, -1, -1): n_samples_right += histograms[feature_idx, bin_idx + 1].count @@ -604,28 +679,51 @@ cdef class Splitter: gain = _split_gain(sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, - negative_loss_current_node, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, self.l2_regularization) - if gain > split_info.gain and gain > self.min_gain_to_split: - split_info.gain = gain - split_info.feature_idx = feature_idx - split_info.bin_idx = bin_idx - # we scan from right to left so missing values go to the left - split_info.missing_go_to_left = True - split_info.sum_gradient_left = sum_gradient_left - split_info.sum_gradient_right = sum_gradient_right - split_info.sum_hessian_left = sum_hessian_left - split_info.sum_hessian_right = sum_hessian_right - split_info.n_samples_left = n_samples_left - split_info.n_samples_right = n_samples_right + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from right to left so missing values go to the left + split_info.missing_go_to_left = True + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + cdef inline Y_DTYPE_C _split_gain( Y_DTYPE_C sum_gradient_left, Y_DTYPE_C sum_hessian_left, Y_DTYPE_C sum_gradient_right, Y_DTYPE_C sum_hessian_right, - Y_DTYPE_C negative_loss_current_node, + Y_DTYPE_C loss_current_node, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, Y_DTYPE_C l2_regularization) nogil: """Loss reduction @@ -638,18 +736,44 @@ cdef inline Y_DTYPE_C _split_gain( """ cdef: Y_DTYPE_C gain - gain = negative_loss(sum_gradient_left, sum_hessian_left, - l2_regularization) - gain += negative_loss(sum_gradient_right, sum_hessian_right, - l2_regularization) - gain -= negative_loss_current_node + Y_DTYPE_C value_left + Y_DTYPE_C value_right + + # 
Compute values of potential left and right children + value_left = compute_node_value(sum_gradient_left, sum_hessian_left, + lower_bound, upper_bound, + l2_regularization) + value_right = compute_node_value(sum_gradient_right, sum_hessian_right, + lower_bound, upper_bound, + l2_regularization) + + if ((monotonic_cst == MonotonicConstraint.POS and value_left > value_right) or + (monotonic_cst == MonotonicConstraint.NEG and value_left < value_right)): + # don't consider this split since it does not respect the monotonic + # constraints. Note that these comparisons need to be done on values + # that have already been clipped to take the monotonic constraints into + # account (if any). + return -1 + + gain = loss_current_node + gain -= _loss_from_value(value_left, sum_gradient_left) + gain -= _loss_from_value(value_right, sum_gradient_right) + # Note that for the gain to be correct (and for min_gain_to_split to work + # as expected), we need all values to be bounded (current node, left child + # and right child). + return gain -cdef inline Y_DTYPE_C negative_loss( - Y_DTYPE_C gradient, - Y_DTYPE_C hessian, - Y_DTYPE_C l2_regularization) nogil: - return (gradient * gradient) / (hessian + l2_regularization) +cdef inline Y_DTYPE_C _loss_from_value( + Y_DTYPE_C value, + Y_DTYPE_C sum_gradient) nogil: + """Return loss of a node from its (bounded) value + + See Equation 6 of: + XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 + https://arxiv.org/abs/1603.02754 + """ + return sum_gradient * value cdef inline unsigned char sample_goes_left( unsigned char missing_go_to_left, @@ -666,3 +790,32 @@ cdef inline unsigned char sample_goes_left( or ( bin_value <= split_bin_idx )) + + +cpdef inline Y_DTYPE_C compute_node_value( + Y_DTYPE_C sum_gradient, + Y_DTYPE_C sum_hessian, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + Y_DTYPE_C l2_regularization) nogil: + """Compute a node's value. + + The value is capped in the [lower_bound, upper_bound] interval to respect + monotonic constraints. Shrinkage is ignored. + + See Equation 5 of: + XGBoost: A Scalable Tree Boosting System, T. Chen, C. 
Guestrin, 2016 + https://arxiv.org/abs/1603.02754 + """ + + cdef: + Y_DTYPE_C value + + value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15) + + if value < lower_bound: + value = lower_bound + elif value > upper_bound: + value = upper_bound + + return value \ No newline at end of file diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index c5b4a143591d6..1b61e65793422 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -645,3 +645,39 @@ def test_max_depth_max_leaf_nodes(): tree = est._predictors[0][0] assert tree.get_max_depth() == 2 assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix + + +def test_early_stopping_on_test_set_with_warm_start(): + # Non regression test for #16661 where second fit fails with + # warm_start=True, early_stopping is on, and no validation set + X, y = make_classification(random_state=0) + gb = HistGradientBoostingClassifier( + max_iter=1, scoring='loss', warm_start=True, early_stopping=True, + n_iter_no_change=1, validation_fraction=None) + + gb.fit(X, y) + # does not raise on second call + gb.set_params(max_iter=2) + gb.fit(X, y) + + +@pytest.mark.parametrize('Est', (HistGradientBoostingClassifier, + HistGradientBoostingRegressor)) +def test_single_node_trees(Est): + # Make sure it's still possible to build single-node trees. In that case + # the value of the root is set to 0. That's a correct value: if the tree is + # single-node that's because min_gain_to_split is not respected right from + # the root, so we don't want the tree to have any impact on the + # predictions. 
+ + X, y = make_classification(random_state=0) + y[:] = 1 # constant target will lead to a single root node + + est = Est(max_iter=20) + est.fit(X, y) + + assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors) + assert all(predictor[0].nodes[0]['value'] == 0 + for predictor in est._predictors) + # Still gives correct predictions thanks to the baseline prediction + assert_allclose(est.predict(X), y) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index d770b50e7aa30..73be2e4f4d155 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -134,6 +134,8 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): # All the leafs are pure, it is not possible to split any further: assert not grower.splittable_nodes + grower._apply_shrinkage() + # Check the values of the leaves: assert grower.root.left_child.value == approx(shrinkage) assert grower.root.right_child.left_child.value == approx(shrinkage) @@ -393,5 +395,5 @@ def test_split_on_nan_with_infinite_values(): predictions = predictor.predict(X) predictions_binned = predictor.predict_binned( X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_) - assert np.all(predictions == -gradients) - assert np.all(predictions_binned == -gradients) + np.testing.assert_allclose(predictions, -gradients) + np.testing.assert_allclose(predictions_binned, -gradients) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py new file mode 100644 index 0000000000000..d4e4c8976caed --- /dev/null +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py @@ -0,0 +1,341 @@ +import numpy as np +import pytest + +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value +) +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier + + +def is_increasing(a): + return (np.diff(a) >= 0.0).all() + + +def is_decreasing(a): + return (np.diff(a) <= 0.0).all() + + +def assert_leaves_values_monotonic(predictor, monotonic_cst): + # make sure leaves values (from left to right) are either all increasing + # or all decreasing (or neither) depending on the monotonic constraint. 
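+ # (with the helpers above, e.g. is_increasing([1, 2, 2, 5]) is True and + # is_decreasing([1, 2, 2, 5]) is False, since ties are allowed by >= / <=)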
+ nodes = predictor.nodes + + def get_leaves_values(): + """get leaves values from left to right""" + values = [] + + def depth_first_collect_leaf_values(node_idx): + node = nodes[node_idx] + if node['is_leaf']: + values.append(node['value']) + return + depth_first_collect_leaf_values(node['left']) + depth_first_collect_leaf_values(node['right']) + + depth_first_collect_leaf_values(0) # start at root (0) + return values + + values = get_leaves_values() + + if monotonic_cst == MonotonicConstraint.NO_CST: + # some increasing, some decreasing + assert not is_increasing(values) and not is_decreasing(values) + elif monotonic_cst == MonotonicConstraint.POS: + # all increasing + assert is_increasing(values) + else: # NEG + # all decreasing + assert is_decreasing(values) + + +def assert_children_values_monotonic(predictor, monotonic_cst): + # Make sure siblings' values respect the monotonic constraints. Left should + # be lower (resp. greater) than the right child if constraint is POS (resp. + # NEG). + # Note that this property alone isn't enough to ensure full monotonicity, + # since we also need to guarantee that all the descendants of the left + # child won't be greater (resp. lower) than the right child, or its + # descendants. That's why we need to bound the predicted values (this is + # tested in assert_children_values_bounded) + nodes = predictor.nodes + left_lower = [] + left_greater = [] + for node in nodes: + if node['is_leaf']: + continue + + left_idx = node['left'] + right_idx = node['right'] + + if nodes[left_idx]['value'] < nodes[right_idx]['value']: + left_lower.append(node) + elif nodes[left_idx]['value'] > nodes[right_idx]['value']: + left_greater.append(node) + + if monotonic_cst == MonotonicConstraint.NO_CST: + assert left_lower and left_greater + elif monotonic_cst == MonotonicConstraint.POS: + assert left_lower and not left_greater + else: # NEG + assert not left_lower and left_greater + + +def assert_children_values_bounded(grower, monotonic_cst): + # Make sure that the values of the children of a node are bounded by the + # middle value between that node and its sibling (if there is a monotonic + # constraint).
+ # As a bonus, we also check that the siblings values are properly ordered + # which is slightly redundant with assert_children_values_monotonic (but + # this check is done on the grower nodes whereas + # assert_children_values_monotonic is done on the predictor nodes) + + if monotonic_cst == MonotonicConstraint.NO_CST: + return + + def recursively_check_children_node_values(node): + if node.is_leaf: + return + if node is not grower.root and node is node.parent.left_child: + sibling = node.sibling # on the right + middle = (node.value + sibling.value) / 2 + if monotonic_cst == MonotonicConstraint.POS: + assert (node.left_child.value <= + node.right_child.value <= + middle) + if not sibling.is_leaf: + assert (middle <= + sibling.left_child.value <= + sibling.right_child.value) + else: # NEG + assert (node.left_child.value >= + node.right_child.value >= + middle) + if not sibling.is_leaf: + assert (middle >= + sibling.left_child.value >= + sibling.right_child.value) + + recursively_check_children_node_values(node.left_child) + recursively_check_children_node_values(node.right_child) + + recursively_check_children_node_values(grower.root) + + +@pytest.mark.parametrize('seed', range(3)) +@pytest.mark.parametrize('monotonic_cst', ( + MonotonicConstraint.NO_CST, + MonotonicConstraint.POS, + MonotonicConstraint.NEG, +)) +def test_nodes_values(monotonic_cst, seed): + # Build a single tree with only one feature, and make sure the nodes + # values respect the monotonic constraints. + + # Considering the following tree with a monotonic POS constraint, we + # should have: + # + # root + # / \ + # 5 10 # middle = 7.5 + # / \ / \ + # a b c d + # + # a <= b and c <= d (assert_children_values_monotonic) + # a, b <= middle <= c, d (assert_children_values_bounded) + # a <= b <= c <= d (assert_leaves_values_monotonic) + # + # The last one is a consequence of the others, but can't hurt to check + + rng = np.random.RandomState(seed) + n_samples = 1000 + n_features = 1 + X_binned = rng.randint(0, 255, size=(n_samples, n_features), + dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower(X_binned, gradients, hessians, + monotonic_cst=[monotonic_cst], + shrinkage=.1) + grower.grow() + + # grow() will shrink the leaves values at the very end. For our comparison + # tests, we need to revert the shrinkage of the leaves, else we would + # compare the value of a leaf (shrunk) with a node (not shrunk) and the + # test would not be correct. + for leave in grower.finalized_leaves: + leave.value /= grower.shrinkage + + # The consistency of the bounds can only be checked on the tree grower + # as the node bounds are not copied into the predictor tree. The + # consistency checks on the values of node children and leaves can be + # done either on the grower tree or on the predictor tree. We only + # do those checks on the predictor tree as the latter is derived from + # the former. + predictor = grower.make_predictor() + assert_children_values_monotonic(predictor, monotonic_cst) + assert_children_values_bounded(grower, monotonic_cst) + assert_leaves_values_monotonic(predictor, monotonic_cst) + + +@pytest.mark.parametrize('seed', range(3)) +def test_predictions(seed): + # Train a model with a POS constraint on the first feature and a NEG + # constraint on the second feature, and make sure the constraints are + # respected by checking the predictions. 
+ # test adapted from lightgbm's test_monotone_constraint(), itself inspired + # by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html + + rng = np.random.RandomState(seed) + + n_samples = 1000 + f_0 = rng.rand(n_samples) # positive correlation with y + f_1 = rng.rand(n_samples) # negative correlation with y + X = np.c_[f_0, f_1] + noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) + y = (5 * f_0 + np.sin(10 * np.pi * f_0) - + 5 * f_1 - np.cos(10 * np.pi * f_1) + + noise) + + gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1]) + gbdt.fit(X, y) + + linspace = np.linspace(0, 1, 100) + sin = np.sin(linspace) + constant = np.full_like(linspace, fill_value=.5) + + # We now assert the predictions properly respect the constraints, on each + # feature. When testing for a feature we need to set the other one to a + # constant, because the monotonic constraints are only an "all else being + # equal" type of constraints: + # a constraint on the first feature only means that + # x0 < x0' => f(x0, x1) < f(x0', x1) + # while x1 stays constant. + # The constraint does not guarantee that + # x0 < x0' => f(x0, x1) < f(x0', x1') + + # First feature (POS) + # assert pred is all increasing when f_0 is all increasing + X = np.c_[linspace, constant] + pred = gbdt.predict(X) + assert is_increasing(pred) + # assert pred actually follows the variations of f_0 + X = np.c_[sin, constant] + pred = gbdt.predict(X) + assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0)) + + # Second feature (NEG) + # assert pred is all decreasing when f_1 is all increasing + X = np.c_[constant, linspace] + pred = gbdt.predict(X) + assert is_decreasing(pred) + # assert pred actually follows the inverse variations of f_1 + X = np.c_[constant, sin] + pred = gbdt.predict(X) + assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all() + + +def test_input_error(): + X = [[1, 2], [2, 3], [3, 4]] + y = [0, 1, 2] + + gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1]) + with pytest.raises(ValueError, + match='monotonic_cst has shape 3 but the input data'): + gbdt.fit(X, y) + + for monotonic_cst in ([1, 3], [1, -3]): + gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst) + with pytest.raises(ValueError, + match='must be None or an array-like of ' + '-1, 0 or 1'): + gbdt.fit(X, y) + + gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1]) + with pytest.raises( + ValueError, + match='monotonic constraints are not supported ' + 'for multiclass classification' + ): + gbdt.fit(X, y) + + +def test_bounded_value_min_gain_to_split(): + # The purpose of this test is to show that when computing the gain at a + # given split, the value of the current node should be properly bounded to + # respect the monotonic constraints, because it strongly interacts with + # min_gain_to_split. We build a simple example where gradients are [1, 1, + # 100, 1, 1] (hessians are all ones). The best split happens on the 3rd + # bin, and depending on whether the value of the node is bounded or not, + # the min_gain_to_split constraint is or isn't satisfied.
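+ # As a rough sanity check of the numbers quoted below (assuming the + # simplified per-node loss sum_gradients * value used by _loss_from_value, + # with l2_regularization = 0): the node has sum_gradients = 104 and + # sum_hessians = 5, so its unbounded value is -104 / 5 = -20.8 and its + # loss is -2163.2. The best split separates {1, 1, 100} from {1, 1} + # (values -34 and -1, losses -3468 and -2), so the gain is + # -2163.2 + 3468 + 2 ~= 1307. If the node value is capped at -10 instead, + # its loss becomes -1040 and the gain grows to -1040 + 3468 + 2 = 2430.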
+ l2_regularization = 0 + min_hessian_to_split = 0 + min_samples_leaf = 1 + n_bins = n_samples = 5 + X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_hessians = np.ones(n_samples, dtype=G_H_DTYPE) + all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = all_hessians.sum() + hessians_are_constant = False + + builder = HistogramBuilder(X_binned, n_bins, all_gradients, + all_hessians, hessians_are_constant) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) + missing_values_bin_idx = n_bins - 1 + children_lower_bound, children_upper_bound = -np.inf, np.inf + + min_gain_to_split = 2000 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, monotonic_cst, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) + + histograms = builder.compute_histograms_brute(sample_indices) + + # Since the gradient array is [1, 1, 100, 1, 1] + # the max possible gain happens on the 3rd bin (or equivalently in the 2nd) + # and is equal to about 1307, which less than min_gain_to_split = 2000, so + # the node is considered unsplittable (gain = -1) + current_lower_bound, current_upper_bound = -np.inf, np.inf + value = compute_node_value(sum_gradients, sum_hessians, + current_lower_bound, current_upper_bound, + l2_regularization) + # the unbounded value is equal to -sum_gradients / sum_hessians + assert value == pytest.approx(-104 / 5) + split_info = splitter.find_node_split(n_samples, histograms, + sum_gradients, sum_hessians, value, + lower_bound=children_lower_bound, + upper_bound=children_upper_bound) + assert split_info.gain == -1 # min_gain_to_split not respected + + # here again the max possible gain is on the 3rd bin but we now cap the + # value of the node into [-10, inf]. + # This means the gain is now about 2430 which is more than the + # min_gain_to_split constraint. 
+ current_lower_bound, current_upper_bound = -10, np.inf + value = compute_node_value(sum_gradients, sum_hessians, + current_lower_bound, current_upper_bound, + l2_regularization) + assert value == -10 + split_info = splitter.find_node_split(n_samples, histograms, + sum_gradients, sum_hessians, value, + lower_bound=children_lower_bound, + upper_bound=children_upper_bound) + assert split_info.gain > min_gain_to_split diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 5f80f99f05116..bcc19d750d9d3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -4,7 +4,11 @@ from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.splitting import Splitter +from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value +) from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.utils._testing import skip_if_32bit @@ -43,20 +47,26 @@ def test_histogram_split(n_bins): dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, + monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) split_info = splitter.find_node_split( sample_indices.shape[0], histograms, sum_gradients, - sum_hessians) + sum_hessians, value) assert split_info.bin_idx == true_bin assert split_info.gain >= 0 @@ -106,26 +116,40 @@ def test_gradient_and_hessian_sanity(constant_hessian): n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, + has_missing_values, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, constant_hessian) hists_parent = builder.compute_histograms_brute(sample_indices) + value_parent = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) si_parent = splitter.find_node_split(n_samples, hists_parent, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, + value_parent) sample_indices_left, sample_indices_right, _ = splitter.split_indices( si_parent, sample_indices) hists_left = builder.compute_histograms_brute(sample_indices_left) + value_left = compute_node_value(si_parent.sum_gradient_left, + si_parent.sum_hessian_left, + -np.inf, np.inf, l2_regularization) hists_right = builder.compute_histograms_brute(sample_indices_right) + value_right = 
compute_node_value(si_parent.sum_gradient_right, + si_parent.sum_hessian_right, + -np.inf, np.inf, l2_regularization) si_left = splitter.find_node_split(n_samples, hists_left, si_parent.sum_gradient_left, - si_parent.sum_hessian_left) + si_parent.sum_hessian_left, + value_left) si_right = splitter.find_node_split(n_samples, hists_right, si_parent.sum_gradient_right, - si_parent.sum_hessian_right) + si_parent.sum_hessian_right, + value_right) # make sure that si.sum_gradient_left + si.sum_gradient_right have their # expected value, same for hessians @@ -206,17 +230,22 @@ def test_split_indices(): n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, + has_missing_values, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) assert np.all(sample_indices == splitter.partition) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) si_root = splitter.find_node_split(n_samples, histograms, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, value) # sanity checks for best split assert si_root.feature_idx == 1 @@ -263,15 +292,20 @@ def test_min_gain_to_split(): n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, + has_missing_values, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) split_info = splitter.find_node_split(n_samples, histograms, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, value) assert split_info.gain == -1 @@ -347,7 +381,7 @@ def test_min_gain_to_split(): 3, # cut on bin_idx=3 False), # missing values go to right - ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 4 <=> missing + ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values 6, # n_bins_non_missing @@ -400,16 +434,22 @@ def test_splitting_missing_values(X_binned, all_gradients, hessians_are_constant) n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, + monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) split_info = splitter.find_node_split(n_samples, histograms, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, value) assert 
split_info.bin_idx == expected_bin_idx if has_missing_values: diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 501f2425541e8..9cec1c08efc9e 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -16,6 +16,7 @@ ) from ..utils.fixes import _joblib_parallel_args from ..utils.validation import check_is_fitted, _num_samples +from ..utils.validation import _deprecate_positional_args from ..base import OutlierMixin from ._bagging import BaseBagging @@ -93,7 +94,7 @@ class IsolationForest(OutlierMixin, BaseBagging): processors. See :term:`Glossary ` for more details. behaviour : str, default='deprecated' - This parameter has not effect, is deprecated, and will be removed. + This parameter has no effect, is deprecated, and will be removed. .. versionadded:: 0.20 ``behaviour`` is added in 0.20 for back-compatibility purpose. @@ -181,8 +182,8 @@ class IsolationForest(OutlierMixin, BaseBagging): >>> clf.predict([[0.1], [0], [90]]) array([ 1, 1, -1]) """ - - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, n_estimators=100, max_samples="auto", contamination="auto", diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index ba817613523f6..a75e9236f1612 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -30,6 +30,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, @@ -37,7 +38,7 @@ class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, """Base class for stacking method.""" @abstractmethod - def __init__(self, estimators, final_estimator=None, cv=None, + def __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, verbose=0, passthrough=False): super().__init__(estimators=estimators) @@ -366,7 +367,8 @@ class StackingClassifier(ClassifierMixin, _BaseStacking): 0.9... """ - def __init__(self, estimators, final_estimator=None, cv=None, + @_deprecate_positional_args + def __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0): super().__init__( @@ -603,8 +605,9 @@ class StackingRegressor(RegressorMixin, _BaseStacking): 0.3... 
""" - def __init__(self, estimators, final_estimator=None, cv=None, n_jobs=None, - passthrough=False, verbose=0): + @_deprecate_positional_args + def __init__(self, estimators, final_estimator=None, *, cv=None, + n_jobs=None, passthrough=False, verbose=0): super().__init__( estimators=estimators, final_estimator=final_estimator, diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 0da6dc86c30fa..8d2bbbe8c2b8a 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -30,6 +30,7 @@ from ..utils.validation import check_is_fitted from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError @@ -205,8 +206,8 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): >>> print(eclf3.transform(X).shape) (6, 6) """ - - def __init__(self, estimators, voting='hard', weights=None, + @_deprecate_positional_args + def __init__(self, estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False): super().__init__(estimators=estimators) self.voting = voting @@ -349,8 +350,8 @@ class VotingRegressor(RegressorMixin, _BaseVoting): .. versionadded:: 0.21 - A voting regressor is an ensemble meta-estimator that fits base - regressors each on the whole dataset. It, then, averages the individual + A voting regressor is an ensemble meta-estimator that fits several base + regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction. Read more in the :ref:`User Guide `. @@ -410,8 +411,9 @@ class VotingRegressor(RegressorMixin, _BaseVoting): >>> print(er.fit(X, y).predict(X)) [ 3.3 5.7 11.8 19.7 28. 40.3] """ - - def __init__(self, estimators, weights=None, n_jobs=None, verbose=False): + @_deprecate_positional_args + def __init__(self, estimators, *, weights=None, n_jobs=None, + verbose=False): super().__init__(estimators=estimators) self.weights = weights self.n_jobs = n_jobs diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index de73858f4bb3f..7fc8f898a5ae0 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -33,7 +33,7 @@ from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_array, check_random_state, check_X_y, _safe_indexing +from ..utils import check_array, check_random_state, _safe_indexing from ..utils.extmath import softmax from ..utils.extmath import stable_cumsum from ..metrics import accuracy_score, r2_score @@ -41,6 +41,7 @@ from ..utils.validation import _check_sample_weight from ..utils.validation import has_fit_parameter from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args __all__ = [ 'AdaBoostClassifier', @@ -57,7 +58,7 @@ class BaseWeightBoosting(BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__(self, - base_estimator=None, + base_estimator=None, *, n_estimators=50, estimator_params=tuple(), learning_rate=1., @@ -397,8 +398,9 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): >>> clf.score(X, y) 0.983... """ + @_deprecate_positional_args def __init__(self, - base_estimator=None, + base_estimator=None, *, n_estimators=50, learning_rate=1., algorithm='SAMME.R', @@ -959,8 +961,9 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): .. [2] H. 
Drucker, "Improving Regressors using Boosting Techniques", 1997. """ + @_deprecate_positional_args def __init__(self, - base_estimator=None, + base_estimator=None, *, n_estimators=50, learning_rate=1., loss='linear', diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9164751bac256..8144a095cec3a 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -15,11 +15,13 @@ import itertools from itertools import combinations from itertools import product +from typing import Dict, Any import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import csc_matrix from scipy.sparse import coo_matrix +from scipy.special import comb import pytest @@ -47,7 +49,6 @@ from sklearn.model_selection import GridSearchCV from sklearn.svm import LinearSVC from sklearn.utils.validation import check_random_state -from sklearn.utils.fixes import comb from sklearn.tree._classes import SPARSE_SPLITTERS @@ -100,12 +101,12 @@ "RandomTreesEmbedding": RandomTreesEmbedding, } -FOREST_ESTIMATORS = dict() +FOREST_ESTIMATORS: Dict[str, Any] = dict() FOREST_ESTIMATORS.update(FOREST_CLASSIFIERS) FOREST_ESTIMATORS.update(FOREST_REGRESSORS) FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) -FOREST_CLASSIFIERS_REGRESSORS = FOREST_CLASSIFIERS.copy() +FOREST_CLASSIFIERS_REGRESSORS: Dict[str, Any] = FOREST_CLASSIFIERS.copy() FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) @@ -1259,7 +1260,8 @@ def test_min_impurity_decrease(): assert tree.min_impurity_decrease == 0.1 -class MyBackend(DEFAULT_JOBLIB_BACKEND): +# mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type +class MyBackend(DEFAULT_JOBLIB_BACKEND): # type: ignore def __init__(self, *args, **kwargs): self.count = 0 super().__init__(*args, **kwargs) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 0c7f07929e370..c7653ddac959c 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -13,7 +13,6 @@ from sklearn import datasets from sklearn.base import clone -from sklearn.base import BaseEstimator from sklearn.datasets import (make_classification, fetch_california_housing, make_regression) from sklearn.ensemble import GradientBoostingClassifier diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py index 6b0a6ad8a28bb..7de4f2e434de0 100644 --- a/sklearn/experimental/enable_hist_gradient_boosting.py +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -26,7 +26,11 @@ from .. import ensemble -ensemble.HistGradientBoostingClassifier = HistGradientBoostingClassifier -ensemble.HistGradientBoostingRegressor = HistGradientBoostingRegressor +# use settattr to avoid mypy errors when monkeypatching +setattr(ensemble, "HistGradientBoostingClassifier", + HistGradientBoostingClassifier) +setattr(ensemble, "HistGradientBoostingRegressor", + HistGradientBoostingRegressor) + ensemble.__all__ += ['HistGradientBoostingClassifier', 'HistGradientBoostingRegressor'] diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py index 99d18a289aa99..eebe816980b0f 100644 --- a/sklearn/experimental/enable_iterative_imputer.py +++ b/sklearn/experimental/enable_iterative_imputer.py @@ -15,5 +15,6 @@ from ..impute._iterative import IterativeImputer from .. 
import impute -impute.IterativeImputer = IterativeImputer +# use settattr to avoid mypy errors when monkeypatching +setattr(impute, 'IterativeImputer', IterativeImputer) impute.__all__ += ['IterativeImputer'] diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index bf3cbfc9a9b98..63a39c3c15d4a 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -148,6 +148,8 @@ 'joel.nothman@gmail.com') __version__ = '2.4.0' +from typing import Optional + import re import sys import csv @@ -318,7 +320,7 @@ def _parse_values(s): # EXCEPTIONS ================================================================== class ArffException(Exception): - message = None + message : Optional[str] = None def __init__(self): self.line = -1 diff --git a/sklearn/externals/joblib/__init__.py b/sklearn/externals/joblib/__init__.py deleted file mode 100644 index 4fcf030c28853..0000000000000 --- a/sklearn/externals/joblib/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Import necessary to preserve backward compatibility of pickles -import sys -import warnings - -from joblib import * - - -msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be removed " - "in 0.23. Please import this functionality directly from joblib, " - "which can be installed with: pip install joblib. If this warning is " - "raised when loading pickled models, you may need to re-serialize " - "those models with scikit-learn 0.21+.") - -if not hasattr(sys, "_is_pytest_session"): - warnings.warn(msg, category=FutureWarning) diff --git a/sklearn/externals/joblib/numpy_pickle.py b/sklearn/externals/joblib/numpy_pickle.py deleted file mode 100644 index e79a0e1c5c056..0000000000000 --- a/sklearn/externals/joblib/numpy_pickle.py +++ /dev/null @@ -1,3 +0,0 @@ -# Import necessary to preserve backward compatibility of pickles - -from joblib.numpy_pickle import * diff --git a/sklearn/externals/setup.py b/sklearn/externals/setup.py deleted file mode 100644 index 936f0327226d6..0000000000000 --- a/sklearn/externals/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -# -*- coding: utf-8 -*- - - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - config = Configuration('externals', parent_package, top_path) - config.add_subpackage('joblib') - - return config diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 4954329728d5e..ebc584b6271a9 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1165,7 +1165,7 @@ def fit(self, raw_documents, y=None): return self def fit_transform(self, raw_documents, y=None): - """Learn the vocabulary dictionary and return term-document matrix. + """Learn the vocabulary dictionary and return document-term matrix. This is equivalent to fit followed by transform, but more efficiently implemented. @@ -1333,7 +1333,7 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): If ``smooth_idf=True`` (the default), the constant "1" is added to the numerator and denominator of the idf as if an extra document was seen containing every term in the collection exactly once, which prevents - zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1. + zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1. 
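+ For instance, taking log as the natural logarithm, a term present in + df = 2 out of n = 4 documents gets idf = log(5 / 3) + 1, i.e. roughly 1.51.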
Furthermore, the formulas used to compute tf and idf depend on parameter settings that correspond to the SMART notation used in IR @@ -1816,7 +1816,7 @@ def fit(self, raw_documents, y=None): return self def fit_transform(self, raw_documents, y=None): - """Learn vocabulary and idf, return term-document matrix. + """Learn vocabulary and idf, return document-term matrix. This is equivalent to fit followed by transform, but more efficiently implemented. diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index dd72bddc58eb5..76326a8617da5 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -10,6 +10,7 @@ from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method +from ..utils.validation import _deprecate_positional_args def _get_feature_importances(estimator, norm_order=1): @@ -116,9 +117,8 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): estimator is of dimension 2. max_features : int or None, optional - The maximum number of features selected scoring above ``threshold``. - To disable ``threshold`` and only select based on ``max_features``, - set ``threshold=-np.inf``. + The maximum number of features to select. + To only select based on ``max_features``, set ``threshold=-np.inf``. .. versionadded:: 0.20 @@ -158,7 +158,8 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): [-0.48], [ 1.48]]) """ - def __init__(self, estimator, threshold=None, prefit=False, + @_deprecate_positional_args + def __init__(self, estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None): self.estimator = estimator self.threshold = threshold diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 69e3cc4de9e6c..7e7aada0d70b3 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -9,10 +9,11 @@ import numpy as np from joblib import Parallel, delayed, effective_n_jobs -from ..utils import check_X_y, safe_sqr +from ..utils import safe_sqr from ..utils.metaestimators import if_delegate_has_method from ..utils.metaestimators import _safe_split from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator from ..base import MetaEstimatorMixin from ..base import clone @@ -95,7 +96,7 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): >>> from sklearn.svm import SVR >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) >>> estimator = SVR(kernel="linear") - >>> selector = RFE(estimator, 5, step=1) + >>> selector = RFE(estimator, n_features_to_select=5, step=1) >>> selector = selector.fit(X, y) >>> selector.support_ array([ True, True, True, True, True, False, False, False, False, @@ -119,7 +120,8 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. """ - def __init__(self, estimator, n_features_to_select=None, step=1, + @_deprecate_positional_args + def __init__(self, estimator, *, n_features_to_select=None, step=1, verbose=0): self.estimator = estimator self.n_features_to_select = n_features_to_select @@ -464,7 +466,8 @@ class RFECV(RFE): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. 
""" - def __init__(self, estimator, step=1, min_features_to_select=1, cv=None, + @_deprecate_positional_args + def __init__(self, estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None): self.estimator = estimator self.step = step diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 221e46f2a505e..7ca0ce4a36715 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -17,6 +17,7 @@ safe_mask) from ..utils.extmath import safe_sparse_dot, row_norms from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._base import SelectorMixin @@ -419,9 +420,9 @@ class SelectPercentile(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, percentile=10): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, percentile=10): + super().__init__(score_func=score_func) self.percentile = percentile def _check_params(self, X, y): @@ -503,9 +504,9 @@ class SelectKBest(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, k=10): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, k=10): + super().__init__(score_func=score_func) self.k = k def _check_params(self, X, y): @@ -582,9 +583,9 @@ class SelectFpr(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, alpha=5e-2): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) self.alpha = alpha def _get_support_mask(self): @@ -648,9 +649,9 @@ class SelectFdr(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, alpha=5e-2): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) self.alpha = alpha def _get_support_mask(self): @@ -711,9 +712,9 @@ class SelectFwe(_BaseFilter): SelectFdr: Select features based on an estimated false discovery rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. 
""" - - def __init__(self, score_func=f_classif, alpha=5e-2): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) self.alpha = alpha def _get_support_mask(self): @@ -761,7 +762,7 @@ class GenericUnivariateSelect(_BaseFilter): >>> X, y = load_breast_cancer(return_X_y=True) >>> X.shape (569, 30) - >>> transformer = GenericUnivariateSelect(chi2, 'k_best', param=20) + >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20) >>> X_new = transformer.fit_transform(X, y) >>> X_new.shape (569, 20) @@ -786,8 +787,9 @@ class GenericUnivariateSelect(_BaseFilter): 'fdr': SelectFdr, 'fwe': SelectFwe} - def __init__(self, score_func=f_classif, mode='percentile', param=1e-5): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, mode='percentile', param=1e-5): + super().__init__(score_func=score_func) self.mode = mode self.param = param diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 6438e6b80dc0a..b0bd41ba41abd 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -4,7 +4,6 @@ import numpy as np from ..base import BaseEstimator from ._base import SelectorMixin -from ..utils import check_array from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index abb11fdc7b8da..27938c5e27819 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -552,7 +552,7 @@ def test_nans(): X = [[0, 1, 0], [0, -1, -1], [0, .5, .5]] y = [1, 0, 1] - for select in (SelectKBest(f_classif, 2), + for select in (SelectKBest(f_classif, k=2), SelectPercentile(f_classif, percentile=67)): ignore_warnings(select.fit)(X, y) assert_array_equal(select.get_support(indices=True), np.array([1, 2])) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index ed8ed2a007a22..e70838c6d251a 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -14,7 +14,7 @@ from ..base import BaseEstimator, ClassifierMixin, clone from .kernels \ import RBF, CompoundKernel, ConstantKernel as C -from ..utils.validation import check_X_y, check_is_fitted, check_array +from ..utils.validation import check_is_fitted, check_array from ..utils import check_random_state from ..utils.optimize import _check_optimize_result from ..preprocessing import LabelEncoder diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 1b48efb39f26d..caf94ce41c1b4 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -15,7 +15,7 @@ from ..base import MultiOutputMixin from .kernels import RBF, ConstantKernel as C from ..utils import check_random_state -from ..utils.validation import check_X_y, check_array +from ..utils.validation import check_array from ..utils.optimize import _check_optimize_result @@ -486,7 +486,7 @@ def log_marginal_likelihood(self, theta=None, eval_gradient=False, # constructing the full matrix tmp.dot(K_gradient) since only # its diagonal is required log_likelihood_gradient_dims = \ - 0.5 * np.einsum("ijl,ijk->kl", tmp, K_gradient) + 0.5 * np.einsum("ijl,jik->kl", tmp, K_gradient) 
log_likelihood_gradient = log_likelihood_gradient_dims.sum(-1) if eval_gradient: diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py index 4e435d44fbdbf..940035ae58589 100644 --- a/sklearn/impute/__init__.py +++ b/sklearn/impute/__init__.py @@ -1,8 +1,14 @@ """Transformers for missing value imputation""" +import typing from ._base import MissingIndicator, SimpleImputer from ._knn import KNNImputer +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._iterative import IterativeImputer # noqa + __all__ = [ 'MissingIndicator', 'SimpleImputer', diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index bc98778d5c5d8..608f8f54ee162 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -16,7 +16,6 @@ from ..utils.validation import FLOAT_DTYPES from ..utils._mask import _get_mask from ..utils import is_scalar_nan -from ..utils import check_array def _check_inputs_dtype(X, missing_values): diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 58a35d157c7a4..88eff8503d510 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -1,10 +1,8 @@ from time import time -from distutils.version import LooseVersion from collections import namedtuple import warnings -import scipy from scipy import stats import numpy as np @@ -329,19 +327,10 @@ def _impute_one_feature(self, a = (self._min_value[feat_idx] - mus) / sigmas b = (self._max_value[feat_idx] - mus) / sigmas - if scipy.__version__ < LooseVersion('0.18'): - # bug with vector-valued `a` in old scipy - imputed_values[inrange_mask] = [ - stats.truncnorm(a=a_, b=b_, - loc=loc_, scale=scale_).rvs( - random_state=self.random_state_) - for a_, b_, loc_, scale_ - in zip(a, b, mus, sigmas)] - else: - truncated_normal = stats.truncnorm(a=a, b=b, - loc=mus, scale=sigmas) - imputed_values[inrange_mask] = truncated_normal.rvs( - random_state=self.random_state_) + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_) else: imputed_values = estimator.predict(X_test) imputed_values = np.clip(imputed_values, diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index ff4d9d6738977..8efafd8a7eef4 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -100,6 +100,21 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, ---------- .. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. https://doi.org/10.1023/A:1010933404324 + + Examples + -------- + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.inspection import permutation_importance + >>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9], + ... [0, 9, 9],[0, 9, 9],[0, 9, 9]] + >>> y = [1, 1, 1, 0, 0, 0] + >>> clf = LogisticRegression().fit(X, y) + >>> result = permutation_importance(clf, X, y, n_repeats=10, + ... random_state=0) + >>> result.importances_mean + array([0.4666..., 0. , 0. ]) + >>> result.importances_std + array([0.2211..., 0. , 0. 
]) """ if not hasattr(X, "iloc"): X = check_array(X, force_all_finite='allow-nan', dtype=None) diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 5dee2750ad37a..f39c604cac77b 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -9,7 +9,7 @@ from joblib import Parallel, delayed from .. import partial_dependence -from ...base import is_classifier, is_regressor +from ...base import is_regressor from ...utils import check_array from ...utils import check_matplotlib_support # noqa from ...utils import _safe_indexing diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 896044ae9cc6e..96e82b7b6a318 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -252,12 +252,10 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): unique_X, unique_y, unique_sample_weight = _make_unique( X, y, sample_weight) - # Store _X_ and _y_ to maintain backward compat during the deprecation - # period of X_ and y_ - self._X_ = X = unique_X - self._y_ = y = isotonic_regression(unique_y, unique_sample_weight, - self.y_min, self.y_max, - increasing=self.increasing_) + X = unique_X + y = isotonic_regression(unique_y, unique_sample_weight, + self.y_min, self.y_max, + increasing=self.increasing_) # Handle the left and right bounds on X self.X_min_, self.X_max_ = np.min(X), np.max(X) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 6ae62ce245a56..47b7813ae8aaa 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -40,10 +40,10 @@ class RBFSampler(TransformerMixin, BaseEstimator): Equals the dimensionality of the computed feature space. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pseudo-random number generator to control the generation of the random + weights and random offset when fitting the training data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- @@ -154,10 +154,10 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): Equals the dimensionality of the computed feature space. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pseudo-random number generator to control the generation of the random + weights and random offset when fitting the training data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Examples -------- @@ -482,10 +482,11 @@ class Nystroem(TransformerMixin, BaseEstimator): How many data points will be used to construct the mapping. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. 
+ Pseudo-random number generator to control the uniform sampling without + replacement of n_components of the training data to construct the basis + kernel. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 21c43979c3b1e..d08c706caefc4 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -9,7 +9,6 @@ from .base import BaseEstimator, RegressorMixin, MultiOutputMixin from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel -from .utils import check_X_y from .utils.validation import check_is_fitted, _check_sample_weight diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index d280f9d0f0d81..8e91767b9ff53 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -26,7 +26,7 @@ from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin) -from ..utils import check_array, check_X_y +from ..utils import check_array from ..utils.validation import FLOAT_DTYPES from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index c67fc54f43157..397461e73d8be 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -12,7 +12,6 @@ from ._base import LinearModel, _rescale_data from ..base import RegressorMixin from ..utils.extmath import fast_logdet -from ..utils import check_X_y from ..utils.fixes import pinvh from ..utils.validation import _check_sample_weight diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 9281d03710455..46e924abbc1d0 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -17,7 +17,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin from ._base import _preprocess_data -from ..utils import check_array, check_X_y +from ..utils import check_array from ..utils.validation import check_random_state from ..model_selection import check_cv from ..utils.extmath import safe_sparse_dot @@ -25,7 +25,8 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import column_or_1d -from . import _cd_fast as cd_fast +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from . 
import _cd_fast as cd_fast # type: ignore def _set_order(X, y, order='C'): diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 1d3a3fcc73421..d9046d3a1ee9b 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,7 +7,6 @@ from ..base import BaseEstimator, RegressorMixin from ._base import LinearModel -from ..utils import check_X_y from ..utils import axis0_safe_slice from ..utils.validation import _check_sample_weight from ..utils.extmath import safe_sparse_dot diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 9f0f62471376a..a3781cf981710 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -19,7 +19,8 @@ from ._base import LinearModel from ..base import RegressorMixin, MultiOutputMixin -from ..utils import arrayfuncs, as_float_array, check_X_y +# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' +from ..utils import arrayfuncs, as_float_array # type: ignore from ..model_selection import check_cv from ..exceptions import ConvergenceWarning diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9e84e56ee0284..874dc743f4cc2 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -15,7 +15,7 @@ import numpy as np from scipy import optimize, sparse -from scipy.special import expit +from scipy.special import expit, logsumexp from joblib import Parallel, delayed, effective_n_jobs from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator @@ -27,11 +27,8 @@ from ..utils.extmath import (log_logistic, safe_sparse_dot, softmax, squared_norm) from ..utils.extmath import row_norms -from ..utils.fixes import logsumexp from ..utils.optimize import _newton_cg, _check_optimize_result -from ..utils.validation import check_X_y from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils import deprecated from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..model_selection import check_cv @@ -732,10 +729,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0 = w0.ravel() target = Y_multi if solver == 'lbfgs': - func = lambda x, *args: _multinomial_loss_grad(x, *args)[0:2] + def func(x, *args): return _multinomial_loss_grad(x, *args)[0:2] elif solver == 'newton-cg': - func = lambda x, *args: _multinomial_loss(x, *args)[0] - grad = lambda x, *args: _multinomial_loss_grad(x, *args)[1] + def func(x, *args): return _multinomial_loss(x, *args)[0] + def grad(x, *args): return _multinomial_loss_grad(x, *args)[1] hess = _multinomial_grad_hess warm_start_sag = {'coef': w0.T} else: @@ -744,7 +741,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, func = _logistic_loss_and_grad elif solver == 'newton-cg': func = _logistic_loss - grad = lambda x, *args: _logistic_loss_and_grad(x, *args)[1] + def grad(x, *args): return _logistic_loss_and_grad(x, *args)[1] hess = _logistic_grad_hess warm_start_sag = {'coef': np.expand_dims(w0, axis=1)} @@ -1309,8 +1306,8 @@ def fit(self, X, y, sample_weight=None): if self.penalty == 'elasticnet': if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be between 0 and 1;" - " got (l1_ratio=%r)" % self.l1_ratio) + raise ValueError("l1_ratio must be between 0 and 1;" + " got (l1_ratio=%r)" % self.l1_ratio) elif self.l1_ratio is not 
None: warnings.warn("l1_ratio parameter is only used when penalty is " "'elasticnet'. Got " diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 54b751423c933..0d572dd17c6d7 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -15,7 +15,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin -from ..utils import as_float_array, check_array, check_X_y +from ..utils import as_float_array, check_array from ..model_selection import check_cv premature = """ Orthogonal matching pursuit ended prematurely due to linear diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index c40f641df4b5e..9c3f703ac478e 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -22,7 +22,6 @@ from ..base import RegressorMixin, MultiOutputMixin, is_classifier from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms -from ..utils import check_X_y from ..utils import check_array from ..utils import check_consistent_length from ..utils import compute_sample_weight diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index bca1928ecf481..bf1e77e3e355b 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -287,25 +287,31 @@ def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, self, X[validation_mask], y[validation_mask], sample_weight[validation_mask], classes=classes) - @deprecated("Attribute standard_coef_ was deprecated " + # mypy error: Decorated property not supported + @deprecated("Attribute standard_coef_ was deprecated " # type: ignore "in version 0.23 and will be removed in 0.25.") @property def standard_coef_(self): return self._standard_coef - @deprecated("Attribute standard_intercept_ was deprecated " - "in version 0.23 and will be removed in 0.25.") + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute standard_intercept_ was deprecated " + "in version 0.23 and will be removed in 0.25." + ) @property def standard_intercept_(self): return self._standard_intercept - @deprecated("Attribute average_coef_ was deprecated " + # mypy error: Decorated property not supported + @deprecated("Attribute average_coef_ was deprecated " # type: ignore "in version 0.23 and will be removed in 0.25.") @property def average_coef_(self): return self._average_coef - @deprecated("Attribute average_intercept_ was deprecated " + # mypy error: Decorated property not supported + @deprecated("Attribute average_intercept_ was deprecated " # type: ignore "in version 0.23 and will be removed in 0.25.") @property def average_intercept_(self): @@ -720,13 +726,13 @@ def fit(self, X, y, coef_init=None, intercept_init=None, class SGDClassifier(BaseSGDClassifier): - """Linear classifiers (SVM, logistic regression, a.o.) with SGD training. + """Linear classifiers (SVM, logistic regression, etc.) with SGD training. This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch - (online/out-of-core) learning, see the partial_fit method. + (online/out-of-core) learning via the `partial_fit` method. 
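The online/out-of-core learning mentioned above goes through `partial_fit`. A minimal sketch (not part of the patch; batch sizes and data are made up) of streaming mini-batches into the classifier:

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
clf = SGDClassifier(loss="hinge")
classes = np.array([0, 1])  # all classes must be declared on the first partial_fit call
for _ in range(5):          # each iteration stands in for a newly streamed mini-batch
    X_batch = rng.randn(32, 3)
    y_batch = rng.randint(0, 2, size=32)
    clf.partial_fit(X_batch, y_batch, classes=classes)
print(clf.predict(rng.randn(2, 3)))

Each call makes one pass over the given batch, so the full dataset never has to fit in memory.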
For best results using the default learning rate schedule, the data should have zero mean and unit variance. @@ -760,7 +766,11 @@ class SGDClassifier(BaseSGDClassifier): 'squared_hinge' is like hinge but is quadratically penalized. 'perceptron' is the linear loss used by the perceptron algorithm. The other losses are designed for regression but can be useful in - classification as well; see SGDRegressor for a description. + classification as well; see + :class:`~sklearn.linear_model.SGDRegressor` for a description. + + More details about the losses formulas can be found in the + :ref:`User Guide `. penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' @@ -769,17 +779,19 @@ class SGDClassifier(BaseSGDClassifier): not achievable with 'l2'. alpha : float, default=0.0001 - Constant that multiplies the regularization term. Defaults to 0.0001. - Also used to compute learning_rate when set to 'optimal'. + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. + Also used to compute the learning rate when set to `learning_rate` is + set to 'optimal'. l1_ratio : float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. - Defaults to 0.15. + Only used if `penalty` is 'elasticnet'. fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the - data is assumed to be already centered. Defaults to True. + data is assumed to be already centered. max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). @@ -789,7 +801,7 @@ class SGDClassifier(BaseSGDClassifier): .. versionadded:: 0.19 tol : float, default=1e-3 - The stopping criterion. If it is not None, the iterations will stop + The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. @@ -824,18 +836,14 @@ class SGDClassifier(BaseSGDClassifier): learning_rate : str, default='optimal' The learning rate schedule: - 'constant': - eta = eta0 - 'optimal': [default] - eta = 1.0 / (alpha * (t + t0)) - where t0 is chosen by a heuristic proposed by Leon Bottou. - 'invscaling': - eta = eta0 / pow(t, power_t) - 'adaptive': - eta = eta0, as long as the training keeps decreasing. - Each time n_iter_no_change consecutive epochs fail to decrease the - training loss by tol or fail to increase validation score by tol if - early_stopping is True, the current learning rate is divided by 5. + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. eta0 : double, default=0.0 The initial learning rate for the 'constant', 'invscaling' or @@ -849,15 +857,15 @@ class SGDClassifier(BaseSGDClassifier): Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set aside a stratified fraction of training data as validation and terminate - training when validation score is not improving by at least tol for - n_iter_no_change consecutive epochs. 
+ training when validation score returned by the `score` method is not + improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. - Only used if early_stopping is True. + Only used if `early_stopping` is True. .. versionadded:: 0.20 @@ -890,11 +898,11 @@ class SGDClassifier(BaseSGDClassifier): existing counter. average : bool or int, default=False - When set to True, computes the averaged SGD weights and stores the - result in the ``coef_`` attribute. If set to an int greater than 1, - averaging will begin once the total number of samples seen reaches - average. So ``average=10`` will begin averaging after seeing 10 - samples. + When set to True, computes the averaged SGD weights accross all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. Attributes ---------- @@ -906,7 +914,7 @@ class SGDClassifier(BaseSGDClassifier): Constants in decision function. n_iter_ : int - The actual number of iterations to reach the stopping criterion. + The actual number of iterations before reaching the stopping criterion. For multiclass fits, it is the maximum over every binary fit. loss_function_ : concrete ``LossFunction`` @@ -928,13 +936,17 @@ class SGDClassifier(BaseSGDClassifier): Examples -------- >>> import numpy as np - >>> from sklearn import linear_model + >>> from sklearn.linear_model import SGDClassifier + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) >>> Y = np.array([1, 1, 2, 2]) - >>> clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3) + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> clf = make_pipeline(StandardScaler(), + ... SGDClassifier(max_iter=1000, tol=1e-3)) >>> clf.fit(X, Y) - SGDClassifier() - + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdclassifier', SGDClassifier())]) >>> print(clf.predict([[-0.8, -1]])) [1] """ @@ -1361,6 +1373,9 @@ class SGDRegressor(BaseSGDRegressor): 'squared_epsilon_insensitive' is the same but becomes squared loss past a tolerance of epsilon. + More details about the losses formulas can be found in the + :ref:`User Guide `. + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and @@ -1368,12 +1383,15 @@ class SGDRegressor(BaseSGDRegressor): not achievable with 'l2'. alpha : float, default=0.0001 - Constant that multiplies the regularization term. - Also used to compute learning_rate when set to 'optimal'. + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. + Also used to compute the learning rate when set to `learning_rate` is + set to 'optimal'. l1_ratio : float, default=0.15 The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. fit_intercept : bool, default=True Whether the intercept should be estimated or not. If False, the @@ -1387,7 +1405,7 @@ class SGDRegressor(BaseSGDRegressor): .. 
versionadded:: 0.19 tol : float, default=1e-3 - The stopping criterion. If it is not None, the iterations will stop + The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. @@ -1415,18 +1433,14 @@ class SGDRegressor(BaseSGDRegressor): learning_rate : string, default='invscaling' The learning rate schedule: - 'constant': - eta = eta0 - 'optimal': - eta = 1.0 / (alpha * (t + t0)) - where t0 is chosen by a heuristic proposed by Leon Bottou. - 'invscaling': [default] - eta = eta0 / pow(t, power_t) - 'adaptive': - eta = eta0, as long as the training keeps decreasing. - Each time n_iter_no_change consecutive epochs fail to decrease the - training loss by tol or fail to increase validation score by tol if - early_stopping is True, the current learning rate is divided by 5. + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. eta0 : double, default=0.01 The initial learning rate for the 'constant', 'invscaling' or @@ -1439,15 +1453,16 @@ class SGDRegressor(BaseSGDRegressor): Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set aside a fraction of training data as validation and terminate - training when validation score is not improving by at least tol for - n_iter_no_change consecutive epochs. + training when validation score returned by the `score` method is not + improving by at least `tol` for `n_iter_no_change` consecutive + epochs. .. versionadded:: 0.20 validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. - Only used if early_stopping is True. + Only used if `early_stopping` is True. .. versionadded:: 0.20 @@ -1470,11 +1485,11 @@ class SGDRegressor(BaseSGDRegressor): existing counter. average : bool or int, default=False - When set to True, computes the averaged SGD weights and stores the - result in the ``coef_`` attribute. If set to an int greater than 1, - averaging will begin once the total number of samples seen reaches - average. So ``average=10`` will begin averaging after seeing 10 - samples. + When set to True, computes the averaged SGD weights accross all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. Attributes ---------- @@ -1500,7 +1515,7 @@ class SGDRegressor(BaseSGDRegressor): in version 0.23 and will be removed in 0.25. n_iter_ : int - The actual number of iterations to reach the stopping criterion. + The actual number of iterations before reaching the stopping criterion. t_ : int Number of weight updates performed during training. 
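The schedule names listed above reduce to simple per-step formulas for the learning rate. A rough sketch (not part of the patch; eta0, power_t, alpha and t0 are placeholder values, and the real t0 for 'optimal' is chosen internally by Bottou's heuristic):

import numpy as np

eta0, power_t, alpha, t0 = 0.01, 0.25, 0.0001, 1.0
t = np.arange(1, 6, dtype=float)

eta_constant = np.full_like(t, eta0)           # 'constant':   eta = eta0
eta_invscaling = eta0 / np.power(t, power_t)   # 'invscaling': eta = eta0 / t**power_t
eta_optimal = 1.0 / (alpha * (t + t0))         # 'optimal':    eta = 1 / (alpha * (t + t0))
# 'adaptive' keeps eta = eta0 and divides it by 5 whenever the training loss or
# validation score stops improving for n_iter_no_change consecutive epochs.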
@@ -1509,14 +1524,19 @@ class SGDRegressor(BaseSGDRegressor): Examples -------- >>> import numpy as np - >>> from sklearn import linear_model + >>> from sklearn.linear_model import SGDRegressor + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> n_samples, n_features = 10, 5 >>> rng = np.random.RandomState(0) >>> y = rng.randn(n_samples) >>> X = rng.randn(n_samples, n_features) - >>> clf = linear_model.SGDRegressor(max_iter=1000, tol=1e-3) - >>> clf.fit(X, y) - SGDRegressor() + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> reg = make_pipeline(StandardScaler(), + ... SGDRegressor(max_iter=1000, tol=1e-3)) + >>> reg.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdregressor', SGDRegressor())]) See also -------- diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index a29cc26cdc0a3..16f0adae12c9c 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -20,7 +20,6 @@ from ._base import LinearModel from ..base import RegressorMixin from ..utils import check_random_state -from ..utils import check_X_y from ..exceptions import ConvergenceWarning _EPSILON = np.finfo(np.double).eps diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index c962edccc953a..0d30c4dd13022 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -212,16 +212,30 @@ def test_linear_regression_pd_sparse_dataframe_warning(): # restrict the pd versions < '0.24.0' as they have a bug in is_sparse func if LooseVersion(pd.__version__) < '0.24.0': pytest.skip("pandas 0.24+ required.") - df = pd.DataFrame() - for col in range(4): + + # Warning is raised only when some of the columns is sparse + df = pd.DataFrame({'0': np.random.randn(10)}) + for col in range(1, 4): arr = np.random.randn(10) arr[:8] = 0 - df[str(col)] = pd.arrays.SparseArray(arr, fill_value=0) + # all columns but the first column is sparse + if col != 0: + arr = pd.arrays.SparseArray(arr, fill_value=0) + df[str(col)] = arr + msg = "pandas.DataFrame with sparse columns found." 
with pytest.warns(UserWarning, match=msg): reg = LinearRegression() reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) + # does not warn when the whole dataframe is sparse + df['0'] = pd.arrays.SparseArray(df['0'], fill_value=0) + assert hasattr(df, "sparse") + + with pytest.warns(None) as record: + reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) + assert not record + def test_preprocess_data(): n_samples = 200 diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index cb70db88d3d41..156cd4b57dbc8 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -3,7 +3,6 @@ import numpy as np from scipy import optimize, sparse -import pytest from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal diff --git a/sklearn/linear_model/tests/test_perceptron.py b/sklearn/linear_model/tests/test_perceptron.py index 6cdd538ca9247..1fe74164f17f4 100644 --- a/sklearn/linear_model/tests/test_perceptron.py +++ b/sklearn/linear_model/tests/test_perceptron.py @@ -1,6 +1,5 @@ import numpy as np import scipy.sparse as sp -import pytest from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raises diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index f52e4f0852d5f..1f7d3c2569bab 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -1,4 +1,3 @@ -import pytest import numpy as np from scipy import sparse diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py index 6bb156c64715b..4a66d6d576add 100644 --- a/sklearn/linear_model/tests/test_sag.py +++ b/sklearn/linear_model/tests/test_sag.py @@ -7,6 +7,7 @@ import pytest import numpy as np import scipy.sparse as sp +from scipy.special import logsumexp from sklearn.linear_model._sag import get_auto_step_size from sklearn.linear_model._sag_fast import _multinomial_grad_loss_all_samples @@ -14,7 +15,6 @@ from sklearn.linear_model._base import make_dataset from sklearn.linear_model._logistic import _multinomial_loss_grad -from sklearn.utils.fixes import logsumexp from sklearn.utils.extmath import row_norms from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 8a7fc3f85f425..f26db5cc2028d 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -167,8 +167,11 @@ def _fit_transform(self, X): self.embedding_ = self.kernel_pca_.fit_transform(G) - @deprecated("Attribute `training_data_` was deprecated in version 0.22 and" - " will be removed in 0.24.") + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute `training_data_` was deprecated in version 0.22 and" + " will be removed in 0.24." 
+ ) @property def training_data_(self): check_is_fitted(self) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index d0c9e4e509a73..53558f6051283 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -16,14 +16,14 @@ from scipy.sparse import csr_matrix, issparse from ..neighbors import NearestNeighbors from ..base import BaseEstimator -from ..utils import check_array from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . import _utils -from . import _barnes_hut_tsne +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from . import _barnes_hut_tsne # type: ignore MACHINE_EPSILON = np.finfo(np.double).eps diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 9486bbd4a96f5..5e38e1afff592 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -21,7 +21,8 @@ from sklearn.manifold._t_sne import _gradient_descent from sklearn.manifold._t_sne import trustworthiness from sklearn.manifold import TSNE -from sklearn.manifold import _barnes_hut_tsne +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from sklearn.manifold import _barnes_hut_tsne # type: ignore from sklearn.manifold._utils import _binary_search_perplexity from sklearn.datasets import make_blobs from scipy.optimize import check_grad diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 96d99adfe7386..8916b523fc273 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -137,7 +137,8 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, Parameters ---------- estimator : estimator instance - Trained classifier. + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. X : {array-like, sparse matrix} of shape (n_samples, n_features) Input values. diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index a83fbe5acc60a..bfec9276f83be 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -106,7 +106,8 @@ def plot_precision_recall_curve(estimator, X, y, Parameters ---------- estimator : estimator instance - Trained classifier. + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. X : {array-like, sparse matrix} of shape (n_samples, n_features) Input values. diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index fb76691ff37d1..d786ac6659d41 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -115,7 +115,8 @@ def plot_roc_curve(estimator, X, y, sample_weight=None, Parameters ---------- estimator : estimator instance - Trained classifier. + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. X : {array-like, sparse matrix} of shape (n_samples, n_features) Input values. 
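The three plotting helpers above now document that a fitted Pipeline ending in a classifier is accepted directly. A short sketch of that usage (not part of the patch; assumes matplotlib is installed for the display object):

from sklearn.datasets import make_classification
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = make_pipeline(StandardScaler(), SVC()).fit(X_train, y_train)
disp = plot_roc_curve(clf, X_test, y_test)  # the Pipeline's last step is used as the classifier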
diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index dde32dd3f25ce..973c45a908bf1 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -19,10 +19,11 @@ import numpy as np from scipy import sparse as sp +from scipy.special import comb from ._expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array, check_consistent_length -from ...utils.fixes import comb, _astype_copy_false +from ...utils.fixes import _astype_copy_false def _comb2(n): diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index a0eaa5e84240a..8841df701c69f 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -16,7 +16,6 @@ from ..pairwise import pairwise_distances_chunked from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder -from ...utils import deprecated def check_number_of_labels(n_labels, n_samples): diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index f169a9242daf0..354b6c94a7548 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -5,7 +5,6 @@ from sklearn import datasets from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.metrics.cluster import silhouette_score from sklearn.metrics.cluster import silhouette_samples from sklearn.metrics import pairwise_distances diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 4542b8e2a2964..a66ff9525c28c 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -737,8 +737,9 @@ def _test_precision_recall_curve(y_true, probas_pred): assert_array_almost_equal(precision_recall_auc, 0.859, 3) assert_array_almost_equal(precision_recall_auc, average_precision_score(y_true, probas_pred)) + # `_average_precision` is not very precise in case of 0.5 ties: be tolerant assert_almost_equal(_average_precision(y_true, probas_pred), - precision_recall_auc, decimal=3) + precision_recall_auc, decimal=2) assert p.size == r.size assert p.size == thresholds.size + 1 # Smoke test in the case of proba having only one value diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 5c09d67f6e63d..ee6c81a149b3b 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -9,6 +9,7 @@ from time import time import numpy as np +from scipy.special import logsumexp from .. 
import cluster from ..base import BaseEstimator @@ -16,7 +17,6 @@ from ..exceptions import ConvergenceWarning from ..utils import check_array, check_random_state from ..utils.validation import check_is_fitted -from ..utils.fixes import logsumexp def _check_shape(param, param_shape, name): diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index d69b7d1958183..c68fa260faee3 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -15,7 +15,6 @@ from ._gaussian_mixture import _estimate_gaussian_parameters from ._gaussian_mixture import _estimate_log_gaussian_prob from ..utils import check_array -from ..utils.validation import check_is_fitted def _log_dirichlet_norm(dirichlet_concentration): diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 1c563984ba00b..277f65f929eac 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -10,7 +10,6 @@ from ._base import BaseMixture, _check_shape from ..utils import check_array -from ..utils.validation import check_is_fitted from ..utils.extmath import row_norms diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 55e770d701858..3e5b85ed73a02 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -21,6 +21,7 @@ import warnings import numpy as np +from numpy.ma import MaskedArray from scipy.stats import rankdata from ..base import BaseEstimator, is_classifier, clone @@ -31,7 +32,6 @@ from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state -from ..utils.fixes import MaskedArray from ..utils.random import sample_without_replacement from ..utils.validation import indexable, check_is_fitted, _check_fit_params from ..utils.metaestimators import if_delegate_has_method @@ -385,9 +385,10 @@ def _check_param_grid(param_grid): if (isinstance(v, str) or not isinstance(v, (np.ndarray, Sequence))): - raise ValueError("Parameter values for parameter ({0}) need " - "to be a sequence(but not a string) or" - " np.ndarray.".format(name)) + raise ValueError("Parameter grid for parameter ({0}) needs to" + " be a list or numpy array, but got ({1})." + " Single values need to be wrapped in a list" + " with one element.".format(name, type(v))) if len(v) == 0: raise ValueError("Parameter values for parameter ({0}) need " diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 0b769aefe120c..e728533c3b5cf 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -18,13 +18,13 @@ from inspect import signature import numpy as np +from scipy.special import comb from ..utils import indexable, check_random_state, _safe_indexing from ..utils import _approximate_mode from ..utils.validation import _num_samples, column_or_1d from ..utils.validation import check_array from ..utils.multiclass import type_of_target -from ..utils.fixes import comb from ..base import _pprint __all__ = ['BaseCrossValidator', @@ -2144,7 +2144,8 @@ def train_test_split(*arrays, **options): # Tell nose that train_test_split is not a test. # (Needed for external libraries that may use nose.) -train_test_split.__test__ = False +# Use setattr to avoid mypy errors when monkeypatching. 
+setattr(train_test_split, '__test__', False) def _build_repr(self): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 49d4b156e0686..1673040f96bc6 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -428,12 +428,14 @@ def test_grid_search_when_param_grid_includes_range(): def test_grid_search_bad_param_grid(): - param_dict = {"C": 1.0} + param_dict = {"C": 1} clf = SVC(gamma='auto') assert_raise_message( ValueError, - "Parameter values for parameter (C) need to be a sequence" - "(but not a string) or np.ndarray.", + "Parameter grid for parameter (C) needs to" + " be a list or numpy array, but got ()." + " Single values need to be wrapped in a list" + " with one element.", GridSearchCV, clf, param_dict) param_dict = {"C": []} @@ -447,8 +449,10 @@ def test_grid_search_bad_param_grid(): clf = SVC(gamma='auto') assert_raise_message( ValueError, - "Parameter values for parameter (C) need to be a sequence" - "(but not a string) or np.ndarray.", + "Parameter grid for parameter (C) needs to" + " be a list or numpy array, but got ()." + " Single values need to be wrapped in a list" + " with one element.", GridSearchCV, clf, param_dict) param_dict = {"C": np.ones((3, 2))} diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 875e113f8dc36..0205eb8901699 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -4,6 +4,7 @@ import numpy as np from scipy.sparse import coo_matrix, csc_matrix, csr_matrix from scipy import stats +from scipy.special import comb from itertools import combinations from itertools import combinations_with_replacement from itertools import permutations @@ -46,8 +47,6 @@ from sklearn.datasets import load_digits from sklearn.datasets import make_classification -from sklearn.utils.fixes import comb - from sklearn.svm import SVC X = np.ones(10) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 82edd85472880..2f8976a86c8b8 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -233,6 +233,17 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): ---------- estimators_ : list of ``n_output`` estimators Estimators used for predictions. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import load_linnerud + >>> from sklearn.multioutput import MultiOutputRegressor + >>> from sklearn.linear_model import Ridge + >>> X, y = load_linnerud(return_X_y=True) + >>> clf = MultiOutputRegressor(Ridge(random_state=123)).fit(X, y) + >>> clf.predict(X[[0]]) + array([[176..., 35..., 57...]]) """ def __init__(self, estimator, n_jobs=None): diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index bcd9da1cb72fc..6ef3895ffdb60 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -21,6 +21,7 @@ import numpy as np +from scipy.special import logsumexp from .base import BaseEstimator, ClassifierMixin from .preprocessing import binarize @@ -28,7 +29,6 @@ from .preprocessing import label_binarize from .utils import check_X_y, check_array, deprecated from .utils.extmath import safe_sparse_dot -from .utils.fixes import logsumexp from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative, column_or_1d from .utils.validation import _check_sample_weight diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 3404a9768f36a..91a97e2810baa 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -8,7 +8,7 @@ from scipy.special import gammainc from ..base import BaseEstimator from ..utils import check_array, check_random_state -from ..utils.validation import _check_sample_weight +from ..utils.validation import _check_sample_weight, check_is_fitted from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE @@ -184,6 +184,7 @@ def score_samples(self, X): probability densities, so values will be low for high-dimensional data. """ + check_is_fitted(self) # The returned density is normalized to the number of points. # For it to be a probability, we must scale it. For this reason # we'll also scale atol. @@ -241,6 +242,7 @@ def sample(self, n_samples=1, random_state=None): X : array_like, shape (n_samples, n_features) List of samples. 
""" + check_is_fitted(self) # TODO: implement sampling for other valid kernel shapes if self.kernel not in ['gaussian', 'tophat']: raise NotImplementedError() diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index b9d2de01c958d..cd87d594281da 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -22,8 +22,7 @@ from ..decomposition import PCA from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state -from ..utils.validation import (check_is_fitted, check_array, check_X_y, - check_scalar) +from ..utils.validation import check_is_fitted, check_array, check_scalar from ..exceptions import ConvergenceWarning diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 48712c1fcfb44..0fdcd597353f5 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -15,7 +15,7 @@ from ..base import BaseEstimator, ClassifierMixin from ..metrics.pairwise import pairwise_distances from ..preprocessing import LabelEncoder -from ..utils.validation import check_array, check_X_y, check_is_fitted +from ..utils.validation import check_array, check_is_fitted from ..utils.sparsefuncs import csc_median_axis_0 from ..utils.multiclass import check_classification_targets diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 6687cfa475ce8..e17e8e575f728 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -9,6 +9,7 @@ from sklearn.datasets import make_blobs from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler +from sklearn.exceptions import NotFittedError import joblib @@ -235,3 +236,15 @@ def test_pickling(tmpdir, sample_weight): scores_pickled = kde.score_samples(X) assert_allclose(scores, scores_pickled) + + +@pytest.mark.parametrize('method', ['score_samples', 'sample']) +def test_check_is_fitted(method): + # Check that predict raises an exception in an unfitted estimator. + # Unfitted estimators should raise a NotFittedError. 
+ rng = np.random.RandomState(0) + X = rng.randn(10, 2) + kde = KernelDensity() + + with pytest.raises(NotFittedError): + getattr(kde, method)(X) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 6eb42bb455c3a..3ec30336c23c1 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -22,7 +22,7 @@ from ..utils import gen_batches, check_random_state from ..utils import shuffle from ..utils import _safe_indexing -from ..utils import check_array, check_X_y, column_or_1d +from ..utils import check_array, column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot from ..utils.validation import check_is_fitted diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 64d2de70df531..c1bbdbd629ff8 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -84,47 +84,21 @@ class Pipeline(_BaseComposition): Examples -------- - >>> from sklearn import svm + >>> from sklearn.svm import SVC + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.datasets import make_classification - >>> from sklearn.feature_selection import SelectKBest - >>> from sklearn.feature_selection import f_regression + >>> from sklearn.model_selection import train_test_split >>> from sklearn.pipeline import Pipeline - >>> # generate some data to play with - >>> X, y = make_classification( - ... n_informative=5, n_redundant=0, random_state=42) - >>> # ANOVA SVM-C - >>> anova_filter = SelectKBest(f_regression, k=5) - >>> clf = svm.SVC(kernel='linear') - >>> anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)]) - >>> # You can set the parameters using the names issued - >>> # For instance, fit using a k of 10 in the SelectKBest - >>> # and a parameter 'C' of the svm - >>> anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y) - Pipeline(steps=[('anova', SelectKBest(...)), ('svc', SVC(...))]) - >>> prediction = anova_svm.predict(X) - >>> anova_svm.score(X, y) - 0.83 - >>> # getting the selected features chosen by anova_filter - >>> anova_svm['anova'].get_support() - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, - False, False]) - >>> # Another way to get selected features chosen by anova_filter - >>> anova_svm.named_steps.anova.get_support() - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, - False, False]) - >>> # Indexing can also be used to extract a sub-pipeline. - >>> sub_pipeline = anova_svm[:1] - >>> sub_pipeline - Pipeline(steps=[('anova', SelectKBest(...))]) - >>> coef = anova_svm[-1].coef_ - >>> anova_svm['svc'] is anova_svm[-1] - True - >>> coef.shape - (1, 10) - >>> sub_pipeline.inverse_transform(coef).shape - (1, 20) + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... 
random_state=0) + >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())]) + >>> # The pipeline can be used as any other estimator + >>> # and avoids leaking the test set into the train set + >>> pipe.fit(X_train, y_train) + Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())]) + >>> pipe.score(X_test, y_test) + 0.88 """ # BaseEstimator interface @@ -258,17 +232,7 @@ def _log_message(self, step_idx): len(self.steps), name) - # Estimator interface - - def _fit(self, X, y=None, **fit_params): - # shallow copy of steps - this should really be steps_ - self.steps = list(self.steps) - self._validate_steps() - # Setup the memory - memory = check_memory(self.memory) - - fit_transform_one_cached = memory.cache(_fit_transform_one) - + def _check_fit_params(self, **fit_params): fit_params_steps = {name: {} for name, step in self.steps if step is not None} for pname, pval in fit_params.items(): @@ -281,6 +245,19 @@ def _fit(self, X, y=None, **fit_params): "=sample_weight)`.".format(pname)) step, param = pname.split('__', 1) fit_params_steps[step][param] = pval + return fit_params_steps + + # Estimator interface + + def _fit(self, X, y=None, **fit_params_steps): + # shallow copy of steps - this should really be steps_ + self.steps = list(self.steps) + self._validate_steps() + # Setup the memory + memory = check_memory(self.memory) + + fit_transform_one_cached = memory.cache(_fit_transform_one) + for (step_idx, name, transformer) in self._iter(with_final=False, @@ -318,9 +295,7 @@ def _fit(self, X, y=None, **fit_params): # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) - if self._final_estimator == 'passthrough': - return X, {} - return X, fit_params_steps[self.steps[-1][0]] + return X def fit(self, X, y=None, **fit_params): """Fit the model @@ -348,11 +323,14 @@ def fit(self, X, y=None, **fit_params): self : Pipeline This estimator """ - Xt, fit_params = self._fit(X, y, **fit_params) + fit_params_steps = self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): if self._final_estimator != 'passthrough': - self._final_estimator.fit(Xt, y, **fit_params) + fit_params_last_step = fit_params_steps[self.steps[-1][0]] + self._final_estimator.fit(Xt, y, **fit_params_last_step) + return self def fit_transform(self, X, y=None, **fit_params): @@ -382,16 +360,20 @@ def fit_transform(self, X, y=None, **fit_params): Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples """ + fit_params_steps = self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) + last_step = self._final_estimator - Xt, fit_params = self._fit(X, y, **fit_params) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): if last_step == 'passthrough': return Xt + fit_params_last_step = fit_params_steps[self.steps[-1][0]] if hasattr(last_step, 'fit_transform'): - return last_step.fit_transform(Xt, y, **fit_params) + return last_step.fit_transform(Xt, y, **fit_params_last_step) else: - return last_step.fit(Xt, y, **fit_params).transform(Xt) + return last_step.fit(Xt, y, + **fit_params_last_step).transform(Xt) @if_delegate_has_method(delegate='_final_estimator') def predict(self, X, **predict_params): @@ -447,10 +429,14 @@ def fit_predict(self, X, y=None, **fit_params): ------- y_pred : array-like """ - Xt, fit_params = self._fit(X, y, **fit_params) + fit_params_steps = 
self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) + + fit_params_last_step = fit_params_steps[self.steps[-1][0]] with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): - y_pred = self.steps[-1][-1].fit_predict(Xt, y, **fit_params) + y_pred = self.steps[-1][-1].fit_predict(Xt, y, + **fit_params_last_step) return y_pred @if_delegate_has_method(delegate='_final_estimator') diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 72ad6bacd43b4..c95351db9d985 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -210,11 +210,6 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): where min, max = feature_range. - The transformation is calculated as:: - - X_scaled = scale * X + min - X.min(axis=0) * scale - where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) - This transformation is often used as an alternative to zero mean, unit variance scaling. @@ -1718,7 +1713,8 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): elif norm == 'l2': inplace_csr_row_normalize_l2(X) elif norm == 'max': - _, norms = min_max_axis(X, 1) + mins, maxes = min_max_axis(X, 1) + norms = np.maximum(abs(mins), maxes) norms_elementwise = norms.repeat(np.diff(X.indptr)) mask = norms_elementwise != 0 X.data[mask] /= norms_elementwise[mask] @@ -1728,7 +1724,7 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): elif norm == 'l2': norms = row_norms(X) elif norm == 'max': - norms = np.max(X, axis=1) + norms = np.max(abs(X), axis=1) norms = _handle_zeros_in_scale(norms, copy=False) X /= norms[:, np.newaxis] @@ -1746,7 +1742,7 @@ class Normalizer(TransformerMixin, BaseEstimator): Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so - that its norm (l1 or l2) equals one. + that its norm (l1, l2 or inf) equals one. This transformer is able to work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you want to avoid the burden of @@ -1763,7 +1759,9 @@ class Normalizer(TransformerMixin, BaseEstimator): Parameters ---------- norm : 'l1', 'l2', or 'max', optional ('l2' by default) - The norm to use to normalize each non zero sample. + The norm to use to normalize each non zero sample. If norm='max' + is used, values will be rescaled by the maximum of the absolute + values. copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b9f4217f39af4..8583629669480 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,7 +10,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array -from ..utils.fixes import _argmax from ..utils.validation import check_is_fitted from ._label import _encode, _encode_check_unknown @@ -223,6 +222,10 @@ class OneHotEncoder(_BaseEncoder): support when `min_frequency` or `max_categories` is set to combine infrequent categories. + However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. 
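A minimal sketch of the `drop` behaviour this patch targets, reusing the toy data
from the drop-reset test further below (column 0 with `'Male'/'Female'` is the only
binary column, so `drop='if_binary'` should drop a category there and leave the
three-category column untouched, recording `None` in `drop_idx_` for it)::

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # Toy data: column 0 is binary, column 1 has three categories.
    X = np.array([['Male', 1],
                  ['Female', 3],
                  ['Female', 2]], dtype=object)

    ohe = OneHotEncoder(drop='if_binary', sparse=False).fit(X)

    # Expected under this patch: category index 0 ('Female') is dropped for the
    # binary column; None marks the non-binary column as "nothing dropped".
    print(ohe.drop_idx_)           # [0 None]
    print(ohe.transform(X).shape)  # (3, 4): 1 column + 3 columns

The exact outputs shown in the comments are assumptions based on the tests added in
this patch, not documented guarantees.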
@@ -296,13 +299,13 @@ class OneHotEncoder(_BaseEncoder): (if any). drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to - be dropped for each feature. - ``drop_idx_[i] = -1`` if no category is to be dropped from the feature - with index ``i``, e.g. when `drop='if_binary'` and the feature isn't - binary - - ``drop_idx_ = None`` if all the transformed features will be retained. + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. infrequent_indices_ : list of shape (n_features,) Defined only when `min_frequency` or `max_categories` is set to a @@ -426,10 +429,10 @@ def _compute_drop_idx(self): return None elif isinstance(self.drop, str): if self.drop == 'first': - return np.zeros(len(self.categories_), dtype=np.int_) + return np.zeros(len(self.categories_), dtype=np.object) elif self.drop == 'if_binary': - return np.array([0 if len(cats) == 2 else -1 - for cats in self.categories_], dtype=np.int_) + return np.array([0 if len(cats) == 2 else None + for cats in self.categories_], dtype=np.object) else: msg = ( "Wrong input for parameter `drop`. Expected " @@ -464,7 +467,8 @@ def _compute_drop_idx(self): raise ValueError(msg) return np.array([np.where(cat_list == val)[0][0] for (val, cat_list) in - zip(self.drop, self.categories_)], dtype=np.int_) + zip(self.drop, self.categories_)], + dtype=np.object) @property def _infrequent_enabled(self): @@ -638,8 +642,8 @@ def _compute_transformed_categories(self, i): """ cats = self.categories_[i] - if self.drop is not None: - if self.drop_idx_[i] == -1: + if self.drop_idx_ is not None: + if self.drop_idx_[i] is None: return cats return np.delete(cats, self.drop_idx_[i]) @@ -664,10 +668,10 @@ def _compute_transformed_categories(self, i): @property def _n_transformed_features(self): """Number of transformed features.""" - if self.drop is not None: + if self.drop_idx_ is not None: output = [] for drop_idx, cats in zip(self.drop_idx_, self.categories_): - if drop_idx == -1: + if drop_idx is None: output.append(len(cats)) else: output.append(len(cats) - 1) @@ -771,14 +775,14 @@ def transform(self, X): n_samples, n_features = X_int.shape - if self.drop is not None: + if self.drop_idx_ is not None: to_drop = self.drop_idx_.copy() # We remove all the dropped categories from mask, and decrement all # categories that occur after them to avoid an empty column. keep_cells = X_int != to_drop for i, cats in enumerate(self.categories_): # drop='if_binary' but feature isn't binary - if to_drop[i] == -1: + if to_drop[i] is None: # set to cardinality to not drop from X_int to_drop[i] = len(cats) @@ -867,7 +871,7 @@ def inverse_transform(self, X): continue sub = X[:, j:j + n_categories] # for sparse X argmax returns 2D matrix, ensure 1D array - labels = np.asarray(_argmax(sub, axis=1)).flatten() + labels = np.asarray(sub.argmax(axis=1)).flatten() X_tr[:, i] = cats[labels] if (self.handle_unknown == 'ignore' or @@ -878,9 +882,9 @@ def inverse_transform(self, X): if unknown.any(): found_unknown[i] = unknown # drop will either be None or handle_unknown will be error. 
If - # self.drop is not None, then we can safely assume that all of + # self.drop_idx_ is not None, then we can safely assume that all of # the nulls in each column are the dropped value - elif self.drop is not None: + elif self.drop_idx_ is not None: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]] @@ -928,7 +932,8 @@ def get_feature_names(self, input_features=None): feature_names = [] for i in range(len(cats)): - names = [input_features[i] + '_' + str(t) for t in cats[i]] + names = [ + input_features[i] + '_' + str(t) for t in cats[i]] feature_names.extend(names) return np.array(feature_names, dtype=object) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 85ce3a1f845c1..9cf365ebb3cdf 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -1,7 +1,6 @@ import warnings from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array from ..utils.validation import _allclose_dense_sparse diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index d7789a00ca741..27495177f34d4 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -147,7 +147,10 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, res = _encode_python(values, uniques, encode, return_counts=return_counts) except TypeError: - raise TypeError("argument must be a string or number") + types = sorted(t.__qualname__ + for t in set(type(v) for v in values)) + raise TypeError("Encoders require their input to be uniformly " + f"strings or numbers. Got {types}") return res else: return _encode_numpy(values, uniques, encode, diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 95721a0508091..7999df083631c 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -1947,7 +1947,7 @@ def test_normalizer_max(): X_norm2 = toarray(X_norm2) for X_norm in (X_norm1, X_norm2): - row_maxs = X_norm.max(axis=1) + row_maxs = abs(X_norm).max(axis=1) for i in range(3): assert_almost_equal(row_maxs[i], 1.0) assert_almost_equal(row_maxs[3], 0.0) @@ -1966,6 +1966,27 @@ def test_normalizer_max(): assert_almost_equal(la.norm(X_norm[3]), 0.0) +def test_normalizer_max_sign(): + # check that we normalize by a positive number even for negative data + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + # set the row number 3 to zero + X_dense[3, :] = 0.0 + # check for mixed data where the value with + # largest magnitude is negative + X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 + X_all_neg = -np.abs(X_dense) + X_all_neg_sparse = sparse.csr_matrix(X_all_neg) + + for X in (X_dense, X_all_neg, X_all_neg_sparse): + normalizer = Normalizer(norm='max') + X_norm = normalizer.transform(X) + assert X_norm is not X + X_norm = toarray(X_norm) + assert_array_equal( + np.sign(X_norm), np.sign(toarray(X))) + + def test_normalize(): # Test normalize function # Only tests functionality not used by the tests for Normalizer. 
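To illustrate the `norm='max'` fix exercised by the tests above: each row is now
divided by its largest absolute value, so the sign pattern is preserved and a row
whose extreme entry is negative still ends up with unit infinity norm. A minimal
sketch with made-up data::

    import numpy as np
    from sklearn.preprocessing import normalize

    # The largest-magnitude entry of this row is negative.
    X = np.array([[-4.0, 2.0, 0.0]])

    # Divides the row by max(|x|) = 4, keeping the signs intact.
    print(normalize(X, norm='max'))   # [[-1.   0.5  0. ]]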
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 3fce9e09c8094..7949b09918a36 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -277,6 +277,23 @@ def test_one_hot_encoder_inverse_if_binary(): assert_array_equal(ohe.inverse_transform(X_tr), X) +# check that resetting drop option without refitting does not throw an error +@pytest.mark.parametrize('drop', ['if_binary', 'first', None]) +@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None]) +def test_one_hot_encoder_drop_reset(drop, reset_drop): + X = np.array([['Male', 1], + ['Female', 3], + ['Female', 2]], dtype=object) + ohe = OneHotEncoder(drop=drop, sparse=False) + ohe.fit(X) + X_tr = ohe.transform(X) + feature_names = ohe.get_feature_names() + ohe.set_params(drop=reset_drop) + assert_array_equal(ohe.inverse_transform(X_tr), X) + assert_allclose(ohe.transform(X), X_tr) + assert_array_equal(ohe.get_feature_names(), feature_names) + + @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @pytest.mark.parametrize("X", [ [1, 2], @@ -401,8 +418,9 @@ def test_one_hot_encoder_pandas(): @pytest.mark.parametrize("drop, expected_names", [('first', ['x0_c', 'x2_b']), + ('if_binary', ['x0_c', 'x1_2', 'x2_b']), (['c', 2, 'b'], ['x0_b', 'x2_a'])], - ids=['first', 'manual']) + ids=['first', 'binary', 'manual']) def test_one_hot_encoder_feature_names_drop(drop, expected_names): X = [['c', 2, 'a'], ['b', 2, 'b']] @@ -422,7 +440,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 0., 0., 1.], [0., 1., 0., 0.], [0., 0., 1., 1.]]) - expected_drop_idx = np.array([-1, 0]) + expected_drop_idx = np.array([None, 0]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -436,7 +454,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 1.], [0., 1.], [0., 1.]]) - expected_drop_idx = np.array([0, -1]) + expected_drop_idx = np.array([0, None]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -677,15 +695,21 @@ def test_categories(density, drop): for drop_cat, drop_idx, cat_list in zip(drop, ohe_test.drop_idx_, ohe_test.categories_): - assert cat_list[drop_idx] == drop_cat + assert cat_list[int(drop_idx)] == drop_cat assert isinstance(ohe_test.drop_idx_, np.ndarray) - assert ohe_test.drop_idx_.dtype == np.int_ + assert ohe_test.drop_idx_.dtype == np.object @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) def test_encoders_has_categorical_tags(Encoder): assert 'categorical' in Encoder()._get_tags()['X_types'] +@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) +def test_encoders_does_not_support_none_values(Encoder): + values = [["a"], [None]] + with pytest.raises(TypeError, match="Encoders require their input to be " + "uniformly strings or numbers."): + Encoder().fit(values) def test_ohe_infrequent_infrequent_is_a_cat(): # category with 'infrequent' is a frequent category, ohe will name mangle diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index a84a9950aa3ac..d46dacbe754e4 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -67,7 +67,7 @@ from ..neighbors import NearestNeighbors from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_X_y, check_is_fitted, 
check_array +from ..utils.validation import check_is_fitted, check_array from ..exceptions import ConvergenceWarning diff --git a/sklearn/setup.py b/sklearn/setup.py index e759cdabc88ee..d90c198ac0d7b 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -55,13 +55,13 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('ensemble/_hist_gradient_boosting/tests') config.add_subpackage('_loss/') config.add_subpackage('_loss/tests') + config.add_subpackage('externals') # submodules which have their own setup.py config.add_subpackage('cluster') config.add_subpackage('datasets') config.add_subpackage('decomposition') config.add_subpackage('ensemble') - config.add_subpackage('externals') config.add_subpackage('feature_extraction') config.add_subpackage('manifold') config.add_subpackage('metrics') diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 662a4ffa24678..6cecefb693ec8 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -3,14 +3,16 @@ import warnings from abc import ABCMeta, abstractmethod -from . import _libsvm as libsvm -from .import _liblinear as liblinear -from . import _libsvm_sparse as libsvm_sparse +# mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' +# (and same for other imports) +from . import _libsvm as libsvm # type: ignore +from .import _liblinear as liblinear # type: ignore +from . import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function from ..utils import check_array, check_random_state -from ..utils import column_or_1d, check_X_y +from ..utils import column_or_1d from ..utils import compute_class_weight from ..utils.extmath import safe_sparse_dot from ..utils.validation import check_is_fitted, _check_large_sparse @@ -110,7 +112,8 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_samples) Training vectors, where n_samples is the number of samples and n_features is the number of features. For kernel="precomputed", the expected shape of X is @@ -144,6 +147,13 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) + if hasattr(self, 'decision_function_shape'): + if self.decision_function_shape not in ('ovr', 'ovo'): + raise ValueError( + f"decision_function_shape must be either 'ovr' or 'ovo', " + f"got {self.decision_function_shape}." 
+ ) + if callable(self.kernel): check_consistent_length(X, y) else: diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index fbaa6e97ec616..46086729af35c 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1,11 +1,9 @@ -import warnings import numpy as np from ._base import _fit_liblinear, BaseSVC, BaseLibSVM from ..base import BaseEstimator, RegressorMixin, OutlierMixin from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel -from ..utils import check_X_y from ..utils.validation import _num_samples from ..utils.multiclass import check_classification_targets from ..utils.deprecation import deprecated @@ -161,15 +159,21 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, Examples -------- >>> from sklearn.svm import LinearSVC + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_features=4, random_state=0) - >>> clf = LinearSVC(random_state=0, tol=1e-5) + >>> clf = make_pipeline(StandardScaler(), + ... LinearSVC(random_state=0, tol=1e-5)) >>> clf.fit(X, y) - LinearSVC(random_state=0, tol=1e-05) - >>> print(clf.coef_) - [[0.085... 0.394... 0.498... 0.375...]] - >>> print(clf.intercept_) - [0.284...] + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvc', LinearSVC(random_state=0, tol=1e-05))]) + + >>> print(clf.named_steps['linearsvc'].coef_) + [[0.141... 0.526... 0.679... 0.493...]] + + >>> print(clf.named_steps['linearsvc'].intercept_) + [0.1693...] >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ @@ -324,17 +328,23 @@ class LinearSVR(RegressorMixin, LinearModel): Examples -------- >>> from sklearn.svm import LinearSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.datasets import make_regression >>> X, y = make_regression(n_features=4, random_state=0) - >>> regr = LinearSVR(random_state=0, tol=1e-5) + >>> regr = make_pipeline(StandardScaler(), + ... LinearSVR(random_state=0, tol=1e-5)) >>> regr.fit(X, y) - LinearSVR(random_state=0, tol=1e-05) - >>> print(regr.coef_) - [16.35... 26.91... 42.30... 60.47...] - >>> print(regr.intercept_) - [-4.29...] + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvr', LinearSVR(random_state=0, tol=1e-05))]) + + >>> print(regr.named_steps['linearsvr'].coef_) + [18.582... 27.023... 44.357... 64.522...] + >>> print(regr.named_steps['linearsvr'].intercept_) + [-4...] >>> print(regr.predict([[0, 0, 0, 0]])) - [-4.29...] + [-2.384...] + See also -------- @@ -463,6 +473,7 @@ class SVC(BaseSVC): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. probability : bool, default=False Whether to enable probability estimates. This must be enabled prior @@ -497,7 +508,8 @@ class SVC(BaseSVC): (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one - ('ovo') is always used as multi-class strategy. + ('ovo') is always used as multi-class strategy. The parameter is + ignored for binary classification. .. versionchanged:: 0.19 decision_function_shape is 'ovr' by default. @@ -519,7 +531,7 @@ class SVC(BaseSVC): random_state : int or RandomState instance, default=None Controls the pseudo random number generation for shuffling the data for - probability estimates. + probability estimates. 
Ignored when `probability` is False. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -535,11 +547,13 @@ class SVC(BaseSVC): Number of support vectors for each class. dual_coef_ : ndarray of shape (n_class-1, n_SV) - Coefficients of the support vector in the decision function. + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. For multiclass, coefficient for all 1-vs-1 classifiers. The layout of the coefficients in the multiclass case is somewhat - non-trivial. See the section about multi-class classification in the - SVM section of the User Guide for details. + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. coef_ : ndarray of shape (n_class * (n_class-1) / 2, n_features) Weights assigned to the features (coefficients in the primal @@ -578,12 +592,16 @@ class SVC(BaseSVC): Examples -------- >>> import numpy as np + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) >>> y = np.array([1, 1, 2, 2]) >>> from sklearn.svm import SVC - >>> clf = SVC(gamma='auto') + >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto')) >>> clf.fit(X, y) - SVC(gamma='auto') + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svc', SVC(gamma='auto'))]) + >>> print(clf.predict([[-0.8, -1]])) [1] @@ -639,9 +657,9 @@ class NuSVC(BaseSVC): Parameters ---------- nu : float, default=0.5 - An upper bound on the fraction of training errors and a lower - bound of the fraction of support vectors. Should be in the - interval (0, 1]. + An upper bound on the fraction of margin errors (see :ref:`User Guide + `) and a lower bound of the fraction of support vectors. + Should be in the interval (0, 1]. kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. @@ -670,6 +688,7 @@ class NuSVC(BaseSVC): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. probability : bool, default=False Whether to enable probability estimates. This must be enabled prior @@ -702,7 +721,9 @@ class NuSVC(BaseSVC): Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape - (n_samples, n_classes * (n_classes - 1) / 2). + (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one + ('ovo') is always used as multi-class strategy. The parameter is + ignored for binary classification. .. versionchanged:: 0.19 decision_function_shape is 'ovr' by default. @@ -724,7 +745,7 @@ class NuSVC(BaseSVC): random_state : int or RandomState instance, default=None Controls the pseudo random number generation for shuffling the data for - probability estimates. + probability estimates. Ignored when `probability` is False. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -740,11 +761,13 @@ class NuSVC(BaseSVC): Number of support vectors for each class. dual_coef_ : ndarray of shape (n_class-1, n_SV) - Coefficients of the support vector in the decision function. + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. For multiclass, coefficient for all 1-vs-1 classifiers. 
The layout of the coefficients in the multiclass case is somewhat - non-trivial. See the section about multi-class classification in - the SVM section of the User Guide for details. + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. coef_ : ndarray of shape (n_class * (n_class-1) / 2, n_features) Weights assigned to the features (coefficients in the primal @@ -785,10 +808,12 @@ class NuSVC(BaseSVC): >>> import numpy as np >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.svm import NuSVC - >>> clf = NuSVC() + >>> clf = make_pipeline(StandardScaler(), NuSVC()) >>> clf.fit(X, y) - NuSVC() + Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())]) >>> print(clf.predict([[-0.8, -1]])) [1] @@ -895,6 +920,7 @@ class SVR(RegressorMixin, BaseLibSVM): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. cache_size : float, default=200 Specify the size of the kernel cache (in MB). @@ -934,14 +960,18 @@ class SVR(RegressorMixin, BaseLibSVM): Examples -------- >>> from sklearn.svm import SVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> import numpy as np >>> n_samples, n_features = 10, 5 >>> rng = np.random.RandomState(0) >>> y = rng.randn(n_samples) >>> X = rng.randn(n_samples, n_features) - >>> regr = SVR(C=1.0, epsilon=0.2) + >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)) >>> regr.fit(X, y) - SVR(epsilon=0.2) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svr', SVR(epsilon=0.2))]) + See also -------- @@ -972,14 +1002,16 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None) - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property def probA_(self): return self._probA - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property @@ -1035,6 +1067,7 @@ class NuSVR(RegressorMixin, BaseLibSVM): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. tol : float, default=1e-3 Tolerance for stopping criterion. @@ -1074,14 +1107,17 @@ class NuSVR(RegressorMixin, BaseLibSVM): Examples -------- >>> from sklearn.svm import NuSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> import numpy as np >>> n_samples, n_features = 10, 5 >>> np.random.seed(0) >>> y = np.random.randn(n_samples) >>> X = np.random.randn(n_samples, n_features) - >>> regr = NuSVR(C=1.0, nu=0.1) + >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1)) >>> regr.fit(X, y) - NuSVR(nu=0.1) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('nusvr', NuSVR(nu=0.1))]) See also -------- @@ -1159,6 +1195,7 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. cache_size : float, default=200 Specify the size of the kernel cache (in MB). 
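The reworded `nu` docstring above describes `nu` as an upper bound on the fraction
of margin errors and a lower bound on the fraction of support vectors. A rough,
hedged check of that lower bound on synthetic data (the dataset and parameters are
illustrative choices, and the bound only holds up to numerical tolerance)::

    from sklearn.datasets import make_blobs
    from sklearn.svm import NuSVC

    # Balanced binary problem, so nu=0.5 is feasible.
    X, y = make_blobs(n_samples=100, centers=2, random_state=0)
    clf = NuSVC(nu=0.5).fit(X, y)

    # Fraction of training samples kept as support vectors; with nu=0.5 this
    # should come out at roughly 0.5 or above.
    frac_sv = clf.support_vectors_.shape[0] / X.shape[0]
    print(frac_sv)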
@@ -1306,14 +1343,16 @@ def predict(self, X): y = super().predict(X) return np.asarray(y, dtype=np.intp) - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property def probA_(self): return self._probA - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py index 3ab495d7441cd..989e6c7d6a316 100644 --- a/sklearn/svm/setup.py +++ b/sklearn/svm/setup.py @@ -16,22 +16,27 @@ def configuration(parent_package='', top_path=None): config.add_library('libsvm-skl', sources=[join('src', 'libsvm', 'libsvm_template.cpp')], depends=[join('src', 'libsvm', 'svm.cpp'), - join('src', 'libsvm', 'svm.h')], + join('src', 'libsvm', 'svm.h'), + join('src', 'newrand', 'newrand.h')], # Force C++ linking in case gcc is picked up instead # of g++ under windows with some versions of MinGW extra_link_args=['-lstdc++'], + # Use C++11 to use the random number generator fix + extra_compiler_args=['-std=c++11'], ) libsvm_sources = ['_libsvm.pyx'] libsvm_depends = [join('src', 'libsvm', 'libsvm_helper.c'), join('src', 'libsvm', 'libsvm_template.cpp'), join('src', 'libsvm', 'svm.cpp'), - join('src', 'libsvm', 'svm.h')] + join('src', 'libsvm', 'svm.h'), + join('src', 'newrand', 'newrand.h')] config.add_extension('_libsvm', sources=libsvm_sources, include_dirs=[numpy.get_include(), - join('src', 'libsvm')], + join('src', 'libsvm'), + join('src', 'newrand')], libraries=['libsvm-skl'], depends=libsvm_depends, ) @@ -41,16 +46,30 @@ def configuration(parent_package='', top_path=None): if os.name == 'posix': libraries.append('m') - liblinear_sources = ['_liblinear.pyx', - join('src', 'liblinear', '*.cpp')] + # precompile liblinear to use C++11 flag + config.add_library('liblinear-skl', + sources=[join('src', 'liblinear', 'linear.cpp'), + join('src', 'liblinear', 'tron.cpp')], + depends=[join('src', 'liblinear', 'linear.h'), + join('src', 'liblinear', 'tron.h'), + join('src', 'newrand', 'newrand.h')], + # Force C++ linking in case gcc is picked up instead + # of g++ under windows with some versions of MinGW + extra_link_args=['-lstdc++'], + # Use C++11 to use the random number generator fix + extra_compiler_args=['-std=c++11'], + ) + liblinear_sources = ['_liblinear.pyx'] liblinear_depends = [join('src', 'liblinear', '*.h'), + join('src', 'newrand', 'newrand.h'), join('src', 'liblinear', 'liblinear_helper.c')] config.add_extension('_liblinear', sources=liblinear_sources, - libraries=libraries, + libraries=['liblinear-skl'] + libraries, include_dirs=[join('.', 'src', 'liblinear'), + join('.', 'src', 'newrand'), join('..', 'utils'), numpy.get_include()], depends=liblinear_depends, @@ -64,8 +83,10 @@ def configuration(parent_package='', top_path=None): config.add_extension('_libsvm_sparse', libraries=['libsvm-skl'], sources=libsvm_sparse_sources, include_dirs=[numpy.get_include(), - join("src", "libsvm")], + join("src", "libsvm"), + join("src", "newrand")], depends=[join("src", "libsvm", "svm.h"), + join('src', 'newrand', 'newrand.h'), join("src", "libsvm", "libsvm_sparse_helper.c")]) diff --git a/sklearn/svm/src/liblinear/liblinear_helper.c b/sklearn/svm/src/liblinear/liblinear_helper.c index 86d88e7da9273..7433a0086f682 100644 --- a/sklearn/svm/src/liblinear/liblinear_helper.c +++ 
b/sklearn/svm/src/liblinear/liblinear_helper.c @@ -182,7 +182,7 @@ struct parameter *set_parameter(int solver_type, double eps, double C, if (param == NULL) return NULL; - srand(seed); + set_seed(seed); param->solver_type = solver_type; param->eps = eps; param->C = C; diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp index d9bdfb69c413d..cc603b435f655 100644 --- a/sklearn/svm/src/liblinear/linear.cpp +++ b/sklearn/svm/src/liblinear/linear.cpp @@ -1,8 +1,8 @@ -/* +/* Modified 2011: - Make labels sorted in group_classes, Dan Yamins. - + Modified 2012: - Changes roles of +1 and -1 to match scikit API, Andreas Mueller @@ -22,6 +22,13 @@ Modified 2015: - Patched liblinear for sample_weights - Manoj Kumar See https://github.com/scikit-learn/scikit-learn/pull/5274 + + Modified 2020: + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie + See + */ #include @@ -32,6 +39,10 @@ #include #include "linear.h" #include "tron.h" +#include +#include +#include "../newrand/newrand.h" + typedef signed char schar; template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } #ifndef min @@ -456,19 +467,19 @@ void l2r_l2_svr_fun::grad(double *w, double *g) g[i] = w[i] + 2*g[i]; } -// A coordinate descent algorithm for +// A coordinate descent algorithm for // multi-class support vector machines by Crammer and Singer // // min_{\alpha} 0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i alpha^m_i // s.t. \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i -// +// // where e^m_i = 0 if y_i = m, // e^m_i = 1 if y_i != m, -// C^m_i = C if m = y_i, -// C^m_i = 0 if m != y_i, -// and w_m(\alpha) = \sum_i \alpha^m_i x_i +// C^m_i = C if m = y_i, +// C^m_i = 0 if m != y_i, +// and w_m(\alpha) = \sum_i \alpha^m_i x_i // -// Given: +// Given: // x, y, C // eps is the stopping tolerance // @@ -579,7 +590,7 @@ int Solver_MCSVM_CS::Solve(double *w) double eps_shrink = max(10.0*eps, 1.0); // stopping tolerance for shrinking bool start_from_all = true; - // Initial alpha can be set here. Note that + // Initial alpha can be set here. Note that // sum_m alpha[i*nr_class+m] = 0, for all i=1,...,l-1 // alpha[i*nr_class+m] <= C[GETI(i)] if prob->y[i] == m // alpha[i*nr_class+m] <= 0 if prob->y[i] != m @@ -615,7 +626,7 @@ int Solver_MCSVM_CS::Solve(double *w) double stopping = -INF; for(i=0;iy[i]; while(this_label != label[j]) { - j++; + j++; } data_label[i] = j; @@ -2594,7 +2605,7 @@ void cross_validation(const problem *prob, const parameter *param, int nr_fold, for(i=0;i The changes made with respect to upstream are detailed in the heading of svm.cpp diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index 9321340acaaed..c9a5df10c4924 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -48,6 +48,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Make labels sorted in svm_group_classes, Fabian Pedregosa. + Modified 2020: + + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, + see + */ #include @@ -57,7 +64,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include +#include #include "svm.h" +#include "../newrand/newrand.h" #ifndef _LIBSVM_CPP typedef float Qfloat; @@ -2093,7 +2103,7 @@ static void svm_binary_svc_probability( for(i=0;il;i++) perm[i]=i; for(i=0;il;i++) { - int j = i+rand()%(prob->l-i); + int j = i+bounded_rand_int(prob->l-i); swap(perm[i],perm[j]); } for(i=0;irandom_seed >= 0) { - srand(param->random_seed); + set_seed(param->random_seed); } if(param->svm_type == ONE_CLASS || @@ -2628,7 +2638,7 @@ void PREFIX(cross_validation)(const PREFIX(problem) *prob, const svm_parameter * int nr_class; if(param->random_seed >= 0) { - srand(param->random_seed); + set_seed(param->random_seed); } // stratified cv may not give leave-one-out rate @@ -2650,7 +2660,7 @@ void PREFIX(cross_validation)(const PREFIX(problem) *prob, const svm_parameter * for (c=0; c + + */ +#ifndef _NEWRAND_H +#define _NEWRAND_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Scikit-Learn-specific random number generator replacing `rand()` originally +// used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux, +// with increased speed +// - (1) Init a `mt_rand` object +#if INT_MAX == 0x7FFFFFFF +std::mt19937 mt_rand(std::mt19937::default_seed); +#elif INT_MAX == 0x7FFFFFFFFFFFFFFF +std::mt19937_64 mt_rand(std::mt19937::default_seed); +#else +info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX); +exit(1); +#endif + +// - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed. +void set_seed(unsigned custom_seed) { + mt_rand.seed(custom_seed); +} + +// - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. +inline int bounded_rand_int(int orig_range) { + // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive + // random number and use modulo to make it fit in the range + // return abs( (int)mt_rand()) % orig_range; + + // "Better way": tweaked Lemire post-processor + // from http://www.pcg-random.org/posts/bounded-rands.html + // TODO how could we make this casting safer, raising an error if lost information? 
+ uint32_t range = uint32_t(orig_range); + uint32_t x = mt_rand(); + uint64_t m = uint64_t(x) * uint64_t(range); + uint32_t l = uint32_t(m); + if (l < range) { + uint32_t t = -range; + if (t >= range) { + t -= range; + if (t >= range) + t %= range; + } + while (l < t) { + x = mt_rand(); + m = uint64_t(x) * uint64_t(range); + l = uint32_t(m); + } + } + return m >> 32; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _NEWRAND_H */ diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index fb811940c2971..e6342a2846e3e 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -28,7 +28,8 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.exceptions import NotFittedError, UndefinedMetricWarning from sklearn.multiclass import OneVsRestClassifier -from sklearn.svm import _libsvm +# mypy error: Module 'sklearn.svm' has no attribute '_libsvm' +from sklearn.svm import _libsvm # type: ignore # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -361,12 +362,13 @@ def test_decision_function(): assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) -def test_decision_function_shape(): - # check that decision_function_shape='ovr' gives +@pytest.mark.parametrize('SVM', (svm.SVC, svm.NuSVC)) +def test_decision_function_shape(SVM): + # check that decision_function_shape='ovr' or 'ovo' gives # correct shape and is consistent with predict - clf = svm.SVC(kernel='linear', C=0.1, - decision_function_shape='ovr').fit(iris.data, iris.target) + clf = SVM(kernel='linear', + decision_function_shape='ovr').fit(iris.data, iris.target) dec = clf.decision_function(iris.data) assert dec.shape == (len(iris.data), 3) assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1)) @@ -375,18 +377,21 @@ def test_decision_function_shape(): X, y = make_blobs(n_samples=80, centers=5, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - clf = svm.SVC(kernel='linear', C=0.1, - decision_function_shape='ovr').fit(X_train, y_train) + clf = SVM(kernel='linear', + decision_function_shape='ovr').fit(X_train, y_train) dec = clf.decision_function(X_test) assert dec.shape == (len(X_test), 5) assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1)) # check shape of ovo_decition_function=True - clf = svm.SVC(kernel='linear', C=0.1, - decision_function_shape='ovo').fit(X_train, y_train) + clf = SVM(kernel='linear', + decision_function_shape='ovo').fit(X_train, y_train) dec = clf.decision_function(X_train) assert dec.shape == (len(X_train), 10) + with pytest.raises(ValueError, match="must be either 'ovr' or 'ovo'"): + SVM(decision_function_shape='bad').fit(X_train, y_train) + def test_svr_predict(): # Test SVR's decision_function diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index d769bb630bd03..af98c1bc50a74 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -20,7 +20,7 @@ from sklearn.utils import all_estimators from sklearn.utils._testing import ignore_warnings from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.estimator_checks import check_estimator, _safe_tags +from sklearn.utils.estimator_checks import check_estimator import sklearn from sklearn.base import BiclusterMixin @@ -31,6 +31,7 @@ from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest from sklearn.utils.estimator_checks import ( + _mark_xfail_checks, _construct_instance, _set_checking_parameters, _set_check_estimator_ids, @@ -47,6 +48,24 @@ 
def test_all_estimator_no_base_class(): assert not name.lower().startswith('base'), msg +def test_estimator_cls_parameterize_with_checks(): + # Non-regression test for #16707 to ensure that parametrize_with_checks + # works with estimator classes + param_checks = parametrize_with_checks([LogisticRegression]) + # Using the generator does not raise + list(param_checks.args[1]) + + +def test_mark_xfail_checks_with_unconsructable_estimator(): + class MyEstimator: + def __init__(self): + raise ValueError("This is bad") + + estimator, check = _mark_xfail_checks(MyEstimator, 42, None) + assert estimator == MyEstimator + assert check == 42 + + @pytest.mark.parametrize( 'name, Estimator', all_estimators() diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index dcd4009a47a2d..029ba8471ed1f 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -4,10 +4,8 @@ from scipy import linalg -from sklearn.exceptions import ChangedBehaviorWarning from sklearn.utils import check_random_state -from sklearn.utils._testing import (assert_array_equal, assert_no_warnings, - assert_warns_message) +from sklearn.utils._testing import assert_array_equal, assert_no_warnings from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 55af69ca6c10e..ca2549f2ea4c1 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -32,8 +32,10 @@ with warnings.catch_warnings(): warnings.simplefilter('ignore', FutureWarning) PUBLIC_MODULES = set([ - pckg[1] for pckg in walk_packages(prefix='sklearn.', - path=sklearn.__path__) + pckg[1] for pckg in walk_packages( + prefix='sklearn.', + # mypy error: Module has no attribute "__path__" + path=sklearn.__path__) # type: ignore # mypy issue #1422 if not ("._" in pckg[1] or ".tests." 
in pckg[1]) ]) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 33eb5da939725..03ada399d2af2 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -1,5 +1,3 @@ -import pytest - import numpy as np import scipy.sparse as sp diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index 740480d643f76..033bb84279d54 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -1,5 +1,6 @@ import functools +from typing import List, Any import numpy as np import scipy.sparse as sp @@ -23,12 +24,12 @@ from sklearn.utils._testing import assert_warns from sklearn.exceptions import DataDimensionalityWarning -all_sparse_random_matrix = [_sparse_random_matrix] -all_dense_random_matrix = [_gaussian_random_matrix] +all_sparse_random_matrix: List[Any] = [_sparse_random_matrix] +all_dense_random_matrix: List[Any] = [_gaussian_random_matrix] all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix -all_SparseRandomProjection = [SparseRandomProjection] -all_DenseRandomProjection = [GaussianRandomProjection] +all_SparseRandomProjection: List[Any] = [SparseRandomProjection] +all_DenseRandomProjection: List[Any] = [GaussianRandomProjection] all_RandomProjection = set(all_SparseRandomProjection + all_DenseRandomProjection) diff --git a/sklearn/tests/test_site_joblib.py b/sklearn/tests/test_site_joblib.py deleted file mode 100644 index 07125e9562408..0000000000000 --- a/sklearn/tests/test_site_joblib.py +++ /dev/null @@ -1,16 +0,0 @@ - - -def test_old_pickle(tmpdir): - import joblib - - # Check that a pickle that references sklearn.external.joblib can load - f = tmpdir.join('foo.pkl') - f.write(b'\x80\x02csklearn.externals.joblib.numpy_pickle\nNumpyArrayWrappe' - b'r\nq\x00)\x81q\x01}q\x02(U\x05dtypeq\x03cnumpy\ndtype\nq\x04U' - b'\x02i8q\x05K\x00K\x01\x87q\x06Rq\x07(K\x03U\x01>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.ensemble import BaggingClassifier + >>> from sklearn.tree import ExtraTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> extra_tree = ExtraTreeClassifier(random_state=0) + >>> cls = BaggingClassifier(extra_tree, random_state=0).fit( + ... X_train, y_train) + >>> cls.score(X_test, y_test) + 0.8947... 
""" def __init__(self, criterion="gini", diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1149ceb8678d9..071e7efd49177 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -25,7 +25,6 @@ from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import TempMemmap from sklearn.utils.validation import check_random_state diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4b69365339389..aac6e292a198a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -51,6 +51,7 @@ "check_symmetric", "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", + "DataConversionWarning" ] IS_PYPY = platform.python_implementation() == 'PyPy' diff --git a/sklearn/utils/_pprint.py b/sklearn/utils/_pprint.py index 91e4abd8f7f49..6c27a8c503856 100644 --- a/sklearn/utils/_pprint.py +++ b/sklearn/utils/_pprint.py @@ -324,7 +324,8 @@ def _pprint_key_val_tuple(self, object, stream, indent, allowance, context, # Note: need to copy _dispatch to prevent instances of the builtin # PrettyPrinter class to call methods of _EstimatorPrettyPrinter (see issue # 12906) - _dispatch = pprint.PrettyPrinter._dispatch.copy() + # mypy error: "Type[PrettyPrinter]" has no attribute "_dispatch" + _dispatch = pprint.PrettyPrinter._dispatch.copy() # type: ignore _dispatch[BaseEstimator.__repr__] = _pprint_estimator _dispatch[KeyValTuple.__repr__] = _pprint_key_val_tuple diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index eb6e381f02840..eb4febea3abd8 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -183,7 +183,7 @@ def assert_warns_message(warning_class, message, func, *args, **kw): if callable(message): # add support for certain tests check_in_message = message else: - check_in_message = lambda msg: message in msg + def check_in_message(msg): return message in msg if check_in_message(msg): message_found = True diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py index 7780cac7b52fb..e71aa57400ac1 100644 --- a/sklearn/utils/deprecation.py +++ b/sklearn/utils/deprecation.py @@ -1,6 +1,5 @@ import warnings import functools -import sys __all__ = ["deprecated"] diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2cfb06c7994db..34a0e25c7fcaa 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -360,8 +360,17 @@ def _generate_class_checks(Estimator): def _mark_xfail_checks(estimator, check, pytest): """Mark estimator check pairs with xfail""" + if isinstance(estimator, type): + # try to construct estimator to get tags, if it is unable to then + # return the estimator class + try: + xfail_checks = _safe_tags(_construct_instance(estimator), + '_xfail_test') + except Exception: + return estimator, check + else: + xfail_checks = _safe_tags(estimator, '_xfail_test') - xfail_checks = _safe_tags(estimator, '_xfail_test') if not xfail_checks: return estimator, check diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 03e220eab29ae..622c102fbbd0b 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -34,143 +34,21 @@ def _parse_version(version_string): sp_version = _parse_version(scipy.__version__) -try: # SciPy >= 0.19 - from scipy.special import comb, 
logsumexp -except ImportError: - from scipy.misc import comb, logsumexp # noqa - if sp_version >= (1, 4): from scipy.sparse.linalg import lobpcg else: # Backport of lobpcg functionality from scipy 1.4.0, can be removed # once support for sp_version < (1, 4) is dropped - from ..externals._lobpcg import lobpcg # noqa + # mypy error: Name 'lobpcg' already defined (possibly by an import) + from ..externals._lobpcg import lobpcg # type: ignore # noqa if sp_version >= (1, 3): # Preserves earlier default choice of pinvh cutoff `cond` value. # Can be removed once issue #14055 is fully addressed. from ..externals._scipy_linalg import pinvh else: - from scipy.linalg import pinvh # noqa - -if sp_version >= (0, 19): - def _argmax(arr_or_spmatrix, axis=None): - return arr_or_spmatrix.argmax(axis=axis) -else: - # Backport of argmax functionality from scipy 0.19.1, can be removed - # once support for scipy 0.18 and below is dropped - - def _find_missing_index(ind, n): - for k, a in enumerate(ind): - if k != a: - return k - - k += 1 - if k < n: - return k - else: - return -1 - - def _arg_min_or_max_axis(self, axis, op, compare): - if self.shape[axis] == 0: - raise ValueError("Can't apply the operation along a zero-sized " - "dimension.") - - if axis < 0: - axis += 2 - - zero = self.dtype.type(0) - - mat = self.tocsc() if axis == 0 else self.tocsr() - mat.sum_duplicates() - - ret_size, line_size = mat._swap(mat.shape) - ret = np.zeros(ret_size, dtype=int) - - nz_lines, = np.nonzero(np.diff(mat.indptr)) - for i in nz_lines: - p, q = mat.indptr[i:i + 2] - data = mat.data[p:q] - indices = mat.indices[p:q] - am = op(data) - m = data[am] - if compare(m, zero) or q - p == line_size: - ret[i] = indices[am] - else: - zero_ind = _find_missing_index(indices, line_size) - if m == zero: - ret[i] = min(am, zero_ind) - else: - ret[i] = zero_ind - - if axis == 1: - ret = ret.reshape(-1, 1) - - return np.asmatrix(ret) - - def _arg_min_or_max(self, axis, out, op, compare): - if out is not None: - raise ValueError("Sparse matrices do not support " - "an 'out' parameter.") - - # validateaxis(axis) - - if axis is None: - if 0 in self.shape: - raise ValueError("Can't apply the operation to " - "an empty matrix.") - - if self.nnz == 0: - return 0 - else: - zero = self.dtype.type(0) - mat = self.tocoo() - mat.sum_duplicates() - am = op(mat.data) - m = mat.data[am] - - if compare(m, zero): - return mat.row[am] * mat.shape[1] + mat.col[am] - else: - size = np.product(mat.shape) - if size == mat.nnz: - return am - else: - ind = mat.row * mat.shape[1] + mat.col - zero_ind = _find_missing_index(ind, size) - if m == zero: - return min(zero_ind, am) - else: - return zero_ind - - return _arg_min_or_max_axis(self, axis, op, compare) - - def _sparse_argmax(self, axis=None, out=None): - return _arg_min_or_max(self, axis, out, np.argmax, np.greater) - - def _argmax(arr_or_matrix, axis=None): - if sp.issparse(arr_or_matrix): - return _sparse_argmax(arr_or_matrix, axis=axis) - else: - return arr_or_matrix.argmax(axis=axis) - - -if np_version < (1, 12): - class MaskedArray(np.ma.MaskedArray): - # Before numpy 1.12, np.ma.MaskedArray object is not picklable - # This fix is needed to make our model_selection.GridSearchCV - # picklable as the ``cv_results_`` param uses MaskedArray - def __getstate__(self): - """Return the internal state of the masked array, for pickling - purposes. 
- - """ - cf = 'CF'[self.flags.fnc] - data_state = super(np.ma.MaskedArray, self).__reduce__()[2] - return data_state + (np.ma.getmaskarray(self).tostring(cf), - self._fill_value) -else: - from numpy.ma import MaskedArray # noqa + # mypy error: Name 'pinvh' already defined (possibly by an import) + from scipy.linalg import pinvh # type: ignore # noqa def _object_dtype_isnan(X): diff --git a/sklearn/utils/metaestimators.py b/sklearn/utils/metaestimators.py index e091bd0f7cbf8..877d576592726 100644 --- a/sklearn/utils/metaestimators.py +++ b/sklearn/utils/metaestimators.py @@ -2,6 +2,7 @@ # Author: Joel Nothman # Andreas Mueller # License: BSD +from typing import List, Any from abc import ABCMeta, abstractmethod from operator import attrgetter @@ -17,6 +18,8 @@ class _BaseComposition(BaseEstimator, metaclass=ABCMeta): """Handles parameter management for classifiers composed of named estimators. """ + steps: List[Any] + @abstractmethod def __init__(self): pass diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 748666884e60e..a7f4911791467 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -35,7 +35,7 @@ from sklearn.svm import SVC from sklearn.neighbors import KNeighborsRegressor from sklearn.tree import DecisionTreeClassifier -from sklearn.utils.validation import check_X_y, check_array +from sklearn.utils.validation import check_array from sklearn.utils import all_estimators diff --git a/sklearn/utils/tests/test_fixes.py b/sklearn/utils/tests/test_fixes.py index 21ddaf7d3ec5c..a2fa702f19c4a 100644 --- a/sklearn/utils/tests/test_fixes.py +++ b/sklearn/utils/tests/test_fixes.py @@ -4,7 +4,6 @@ # License: BSD 3 clause import math -import pickle import numpy as np import pytest @@ -12,22 +11,11 @@ from sklearn.utils._testing import assert_array_equal -from sklearn.utils.fixes import MaskedArray from sklearn.utils.fixes import _joblib_parallel_args from sklearn.utils.fixes import _object_dtype_isnan from sklearn.utils.fixes import loguniform -def test_masked_array_obj_dtype_pickleable(): - marr = MaskedArray([1, None, 'a'], dtype=object) - - for mask in (True, False, [0, 1, 0]): - marr.mask = mask - marr_pickled = pickle.loads(pickle.dumps(marr)) - assert_array_equal(marr.data, marr_pickled.data) - assert_array_equal(marr.mask, marr_pickled.mask) - - @pytest.mark.parametrize('joblib_version', ('0.11', '0.12.0')) def test_joblib_parallel_args(monkeypatch, joblib_version): import joblib diff --git a/sklearn/utils/tests/test_random.py b/sklearn/utils/tests/test_random.py index 7d2437471aabb..c9ff69ec8d8b8 100644 --- a/sklearn/utils/tests/test_random.py +++ b/sklearn/utils/tests/test_random.py @@ -1,9 +1,9 @@ import numpy as np import pytest import scipy.sparse as sp +from scipy.special import comb from numpy.testing import assert_array_almost_equal -from sklearn.utils.fixes import comb from sklearn.utils.random import _random_choice_csc, sample_without_replacement from sklearn.utils._random import _our_rand_r_py diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 6748dbcad9951..5f6df9685a25c 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1153,3 +1153,22 @@ def test_check_fit_params(indices): result['sparse-col'], _safe_indexing(fit_params['sparse-col'], indices_) ) + + +@pytest.mark.parametrize('sp_format', [True, 'csr', 'csc', 'coo', 'bsr']) +def 
test_check_sparse_pandas_sp_format(sp_format): + # check_array converts pandas dataframe with only sparse arrays into + # sparse matrix + pd = pytest.importorskip("pandas") + sp_mat = _sparse_random_matrix(10, 3) + + sdf = pd.DataFrame.sparse.from_spmatrix(sp_mat) + result = check_array(sdf, accept_sparse=sp_format) + + if sp_format is True: + # by default pandas converts to coo when accept_sparse is True + sp_format = 'coo' + + assert sp.issparse(result) + assert result.format == sp_format + assert_allclose_dense_sparse(sp_mat, result) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 08952d6cbcd16..4bb50c3deb5e7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -451,10 +451,12 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): - # throw warning if pandas dataframe is sparse + # throw warning if columns are sparse. If all columns are sparse, then + # array.sparse exists and sparsity will be perserved (later). with suppress(ImportError): from pandas.api.types import is_sparse - if array.dtypes.apply(is_sparse).any(): + if (not hasattr(array, 'sparse') and + array.dtypes.apply(is_sparse).any()): warnings.warn( "pandas.DataFrame with sparse columns found." "It will be converted to a dense numpy array." @@ -498,6 +500,11 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, estimator_name = "Estimator" context = " by %s" % estimator_name if estimator is not None else "" + # When all dataframe columns are sparse, convert to a sparse array + if hasattr(array, 'sparse') and array.ndim > 1: + # DataFrame.sparse only supports `to_coo` + array = array.sparse.to_coo() + if sp.issparse(array): _ensure_no_complex_data(array) array = _ensure_sparse_format(array, accept_sparse=accept_sparse, From 8cf73fa951308f3c2a999018d51d8a745e6c385d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 13 Apr 2020 19:26:08 -0400 Subject: [PATCH 21/92] WIP --- sklearn/metrics/_ranking.py | 9 +- sklearn/preprocessing/_encoders.py | 10 +- sklearn/preprocessing/_label.py | 163 ++++++++++++----------------- 3 files changed, 76 insertions(+), 106 deletions(-) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 3fcca5f119b12..19adf618f30ff 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -33,7 +33,7 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize -from ..preprocessing._label import _encode +from ..preprocessing._label import _uniques, _encode from ._base import _average_binary_score, _average_multiclass_ovo_score @@ -457,7 +457,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, if labels is not None: labels = column_or_1d(labels) - classes = _encode(labels)["uniques"] + classes = _uniques(labels)["uniques"] if len(classes) != len(labels): raise ValueError("Parameter 'labels' must be unique") if not np.array_equal(classes, labels): @@ -471,7 +471,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, raise ValueError( "'y_true' contains labels not in parameter 'labels'") else: - classes = _encode(y_true)["uniques"] + classes = _uniques(y_true)["uniques"] if len(classes) != y_score.shape[1]: raise ValueError( "Number of classes in y_true not equal to the number of " @@ -482,8 +482,7 @@ def _multiclass_roc_auc_score(y_true, y_score, 
labels, raise ValueError("sample_weight is not supported " "for multiclass one-vs-one ROC AUC, " "'sample_weight' must be None in this case.") - y_true_encoded = (_encode(y_true, uniques=classes, encode=True) - ["encoded"]) + y_true_encoded = _encode(y_true, classes) # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score(_binary_roc_auc_score, y_true_encoded, diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8583629669480..1018a6556487b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -12,7 +12,7 @@ from ..utils import check_array from ..utils.validation import check_is_fitted -from ._label import _encode, _encode_check_unknown +from ._label import _encode, _encode_check_unknown, _uniques __all__ = [ @@ -90,7 +90,7 @@ def _fit(self, X, handle_unknown='error', process_counts=None): result = None if self.categories == 'auto': - result = _encode(Xi, return_counts=return_counts) + result = _uniques(Xi, return_counts=return_counts) cats = result["uniques"] else: cats = np.array(self.categories[i], dtype=Xi.dtype) @@ -106,9 +106,9 @@ def _fit(self, X, handle_unknown='error', process_counts=None): raise ValueError(msg) self.categories_.append(cats) - if return_counts: - if result is None: - result = _encode(Xi, cats, return_counts=True) + if return_counts and result is None: + # result = _encode(Xi, cats, return_counts=True) + category_counts.append(result["counts"]) if return_counts: diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 27495177f34d4..5a8e73d175369 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -34,69 +34,8 @@ ] -def _encode_numpy(values, uniques=None, encode=False, check_unknown=True, - return_counts=False): - # only used in _encode below, see docstring there for details - if uniques is None: - unique_result = np.unique(values, return_inverse=encode, - return_counts=return_counts) - if encode and return_counts: - return {'uniques': unique_result[0], - 'encoded': unique_result[1], - 'counts': unique_result[2]} - elif encode: - return {'uniques': unique_result[0], - 'encoded': unique_result[1]} - elif return_counts: - return {'uniques': unique_result[0], - 'counts': unique_result[1]} - else: - return {'uniques': unique_result} - - output = {'uniques': uniques} - if encode: - if check_unknown: - diff = _encode_check_unknown(values, uniques) - if diff: - raise ValueError("y contains previously unseen labels: %s" - % str(diff)) - output['encoded'] = np.searchsorted(uniques, values) - - if return_counts: - _, counts = np.unique(values, return_counts=True) - output['counts'] = counts - - return output - - -def _encode_python(values, uniques=None, encode=False, return_counts=False): - # only used in _encode below, see docstring there for details - output = {} - if uniques is None: - uniques = sorted(set(values)) - uniques = np.array(uniques, dtype=values.dtype) - - if return_counts: - uniques_dict = Counter(values) - counts = np.array([uniques_dict[item] for item in uniques], - dtype=np.int) - output['counts'] = counts - - output['uniques'] = uniques - if encode: - table = {val: i for i, val in enumerate(uniques)} - try: - encoded = np.array([table[v] for v in values]) - except KeyError as e: - raise ValueError("y contains previously unseen labels: %s" - % str(e)) - output['encoded'] = encoded - return output - - -def _encode(values, uniques=None, encode=False, check_unknown=True, - return_counts=False): - 
"""Helper function to factorize (find uniques) and encode values. +def _encode(values, uniques, check_unknown=True): + """Helper function encode values. Uses pure python method for object dtype, and numpy method for all other dtypes. @@ -109,53 +48,85 @@ def _encode(values, uniques=None, encode=False, check_unknown=True, ---------- values : array Values to factorize or encode. - uniques : array, optional - If passed, uniques are not determined from passed values (this + uniques : array + Uniques are not determined from passed values (this can be because the user specified categories, or because they already have been determined in fit). - encode : bool, default False - If True, also encode the values into integer codes based on `uniques`. check_unknown : bool, default True If True, check for values in ``values`` that are not in ``unique`` and raise an error. This is ignored for object dtype, and treated as True in this case. This parameter is useful for _BaseEncoder._transform() to avoid calling _encode_check_unknown() twice. - return_counts: bool, default=False - Returns the counts of the unique items in values. If uniques of object - dtype is passed in, the order of the counts will match the - order of the uniques. All other dtypes will return counts that assume - that uniques is ordered. Returns ------- - output : - Dictionary with attributes: + encoded : ndarray + Encoded values + """ + if values.dtype == object: + table = {val: i for i, val in enumerate(uniques)} + try: + return np.array([table[v] for v in values]) + except KeyError as e: + raise ValueError(f"y contains previously unseen labels: {str(e)}") + else: + if check_unknown: + diff = _encode_check_unknown(values, uniques) + if diff: + raise ValueError(f"y contains previously unseen labels: " + f"{str(diff)}") + return np.searchsorted(uniques, values) - uniques : - If ``encode=False``. The unique values are sorted if the `uniques` - parameter was None (and thus inferred from the data). - encoded : - If ``encode=True``. +def _uniques_python(values, return_counts): + # only used in _uniques below, see docstring there for details + try: + uniques = sorted(set(values)) + uniques = np.array(uniques, dtype=values.dtype) + except TypeError: + types = sorted(t.__qualname__ + for t in set(type(v) for v in values)) + raise TypeError("Encoders require their input to be uniformly " + f"strings or numbers. Got {types}") - counts : - If ``return_counts``. + output = {"uniques": uniques} + if return_counts: + uniques_dict = Counter(values) + counts = np.array([uniques_dict[item] for item in uniques], + dtype=np.int) + output['counts'] = counts + return output + + +def _uniques_numpy(values, return_counts): + # only used in _uniques below, see docstring there for details + # thin wrapper around np.unique + if return_counts: + uniques, counts = np.unique(values, return_counts=return_counts) + return {"uniques": uniques, "counts": counts} + return {"uniques": np.unique(values)} + + +def _uniques(values, return_counts=False): + """Helper function to factorize (find uniques) + + Uses pure python method for object dtype, and numpy method for + all other dtypes. + + Parameters + ---------- + dict : dic + values : ndarray + Values to factorize. + + counts : ndarray + Counts corresponding to `values`. 
""" if values.dtype == object: - try: - res = _encode_python(values, uniques, encode, - return_counts=return_counts) - except TypeError: - types = sorted(t.__qualname__ - for t in set(type(v) for v in values)) - raise TypeError("Encoders require their input to be uniformly " - f"strings or numbers. Got {types}") - return res - else: - return _encode_numpy(values, uniques, encode, - check_unknown=check_unknown, - return_counts=return_counts) + return _uniques_python(values, return_counts) + else: # numerical + return _uniques_numpy(values, return_counts) def _encode_check_unknown(values, uniques, return_mask=False): @@ -273,7 +244,7 @@ def fit(self, y): self : returns an instance of self. """ y = column_or_1d(y, warn=True) - self.classes_ = _encode(y)["uniques"] + self.classes_ = _uniques(y)["uniques"] return self def fit_transform(self, y): @@ -289,7 +260,7 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ y = column_or_1d(y, warn=True) - result = _encode(y, encode=True) + result = _uniques(y, encode=True) self.classes_ = result["uniques"] return result["encoded"] From ecf9e7b8d981685bbc27fe089daa58a164f3847c Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 14 Apr 2020 12:18:16 -0400 Subject: [PATCH 22/92] ENH Address comments --- sklearn/metrics/_ranking.py | 6 +-- sklearn/preprocessing/_encoders.py | 66 +++++++++++++++-------- sklearn/preprocessing/_label.py | 58 +++++++++++--------- sklearn/preprocessing/tests/test_label.py | 49 ++++++++--------- 4 files changed, 100 insertions(+), 79 deletions(-) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 19adf618f30ff..5afb59c379252 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -33,7 +33,7 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize -from ..preprocessing._label import _uniques, _encode +from ..preprocessing._label import _unique, _encode from ._base import _average_binary_score, _average_multiclass_ovo_score @@ -457,7 +457,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, if labels is not None: labels = column_or_1d(labels) - classes = _uniques(labels)["uniques"] + classes = _unique(labels) if len(classes) != len(labels): raise ValueError("Parameter 'labels' must be unique") if not np.array_equal(classes, labels): @@ -471,7 +471,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, raise ValueError( "'y_true' contains labels not in parameter 'labels'") else: - classes = _uniques(y_true)["uniques"] + classes = _unique(y_true) if len(classes) != y_score.shape[1]: raise ValueError( "Number of classes in y_true not equal to the number of " diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1018a6556487b..7aaed8464cd16 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -4,6 +4,7 @@ import numbers import warnings +from collections import Counter import numpy as np from scipy import sparse @@ -12,7 +13,7 @@ from ..utils import check_array from ..utils.validation import check_is_fitted -from ._label import _encode, _encode_check_unknown, _uniques +from ._label import _encode, _encode_check_unknown, _unique __all__ = [ @@ -21,6 +22,26 @@ ] +def _get_counts(values, uniques): + """Get the number of times each of the values comes up `values` + + For object dtypes, the counts returned will use the order passed in by + `uniques`. 
+ For numerica dtypes, `uniques` is assumed to be ordered, such that it can + be used with `np.searchsorted`. + """ + if values.dtype == object: + uniques_dict = Counter(values) + counts = np.array([uniques_dict[item] for item in uniques], + dtype=np.int) + return counts + + # numerical + uniq_values, counts = np.unique(values, return_counts=True) + indices_in_uniq = np.searchsorted(uniq_values, uniques) + return counts[indices_in_uniq] + + class _BaseEncoder(TransformerMixin, BaseEstimator): """ Base class for encoders that includes the code to categorize and @@ -88,10 +109,13 @@ def _fit(self, X, handle_unknown='error', process_counts=None): for i in range(n_features): Xi = X_list[i] - result = None if self.categories == 'auto': - result = _uniques(Xi, return_counts=return_counts) - cats = result["uniques"] + result = _unique(Xi, return_counts=return_counts) + if return_counts: + cats, counts = result + category_counts.append(counts) + else: + cats = result else: cats = np.array(self.categories[i], dtype=Xi.dtype) if Xi.dtype != object: @@ -104,12 +128,10 @@ def _fit(self, X, handle_unknown='error', process_counts=None): msg = ("Found unknown categories {0} in column {1}" " during fit".format(diff, i)) raise ValueError(msg) - self.categories_.append(cats) - - if return_counts and result is None: - # result = _encode(Xi, cats, return_counts=True) + if return_counts: + category_counts.append(_get_counts(Xi, cats)) - category_counts.append(result["counts"]) + self.categories_.append(cats) if return_counts: process_counts(category_counts, n_samples) @@ -165,8 +187,7 @@ def _transform(self, X, handle_unknown='error', # We use check_unknown=False, since _encode_check_unknown was # already called above. - encoded = _encode(Xi, self.categories_[i], encode=True, - check_unknown=False)["encoded"] + encoded = _encode(Xi, self.categories_[i], check_unknown=False) X_int[:, i] = encoded return X_int, X_mask @@ -665,8 +686,7 @@ def _compute_transformed_categories(self, i): return np.r_[cats[frequent_indices], np.array([infrequent_cat], dtype=object)] - @property - def _n_transformed_features(self): + def _get_n_transformed_features(self): """Number of transformed features.""" if self.drop_idx_ is not None: output = [] @@ -691,8 +711,7 @@ def _n_transformed_features(self): return output - @property - def _transformed_categories(self): + def _get_transformed_categories(self): """Transformed categories.""" return [self._compute_transformed_categories(i) for i in range(len(self.categories_))] @@ -790,7 +809,7 @@ def transform(self, X): X_int[X_int > to_drop] -= 1 X_mask &= keep_cells - n_values = self._n_transformed_features + n_values = self._get_n_transformed_features() mask = X_mask.ravel() feature_indices = np.cumsum([0] + n_values) @@ -837,17 +856,18 @@ def inverse_transform(self, X): n_samples, _ = X.shape n_features = len(self.categories_) - n_transformed_features = sum(self._n_transformed_features) + transformed_features = self._get_transformed_categories() + n_features_out = sum(cats.shape[0] for cats in transformed_features) # validate shape of passed X msg = ("Shape of the passed X data is not correct. 
Expected {0} " "columns, got {1}.") - if X.shape[1] != n_transformed_features: - raise ValueError(msg.format(n_transformed_features, X.shape[1])) + if X.shape[1] != n_features_out: + raise ValueError(msg.format(n_features_out, X.shape[1])) # create resulting array of appropriate dtype dt = np.find_common_type([cat.dtype - for cat in self._transformed_categories], []) + for cat in transformed_features], []) X_tr = np.empty((n_samples, n_features), dtype=dt) j = 0 @@ -859,8 +879,8 @@ def inverse_transform(self, X): infrequent_indices = [None] * n_features for i in range(n_features): - n_categories = self._n_transformed_features[i] - cats = self._transformed_categories[i] + cats = transformed_features[i] + n_categories = cats.shape[0] # Only happens if there was a column with a unique # category. In this case we just fill the column with this @@ -922,7 +942,7 @@ def get_feature_names(self, input_features=None): Array of feature names. """ check_is_fitted(self) - cats = self._transformed_categories + cats = self._get_transformed_categories() if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] elif len(input_features) != len(cats): diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 5a8e73d175369..abfe129847bb2 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -79,8 +79,8 @@ def _encode(values, uniques, check_unknown=True): return np.searchsorted(uniques, values) -def _uniques_python(values, return_counts): - # only used in _uniques below, see docstring there for details +def _unique_python(values, return_inverse, return_counts): + # Only used in _uniques below, see docstring there for details try: uniques = sorted(set(values)) uniques = np.array(uniques, dtype=values.dtype) @@ -90,43 +90,50 @@ def _uniques_python(values, return_counts): raise TypeError("Encoders require their input to be uniformly " f"strings or numbers. Got {types}") - output = {"uniques": uniques} + ret = (uniques, ) + + if return_inverse: + table = {val: i for i, val in enumerate(uniques)} + inverse = np.array([table[v] for v in values], dtype=values.dtype) + ret += (inverse, ) + if return_counts: uniques_dict = Counter(values) counts = np.array([uniques_dict[item] for item in uniques], dtype=np.int) - output['counts'] = counts - return output + ret += (counts, ) + if len(ret) == 1: + ret = ret[0] -def _uniques_numpy(values, return_counts): - # only used in _uniques below, see docstring there for details - # thin wrapper around np.unique - if return_counts: - uniques, counts = np.unique(values, return_counts=return_counts) - return {"uniques": uniques, "counts": counts} - return {"uniques": np.unique(values)} + return ret -def _uniques(values, return_counts=False): - """Helper function to factorize (find uniques) +def _unique(values, return_inverse=False, return_counts=False): + """Helper function to find uniques with support for python objects. Uses pure python method for object dtype, and numpy method for all other dtypes. Parameters ---------- - dict : dic - values : ndarray - Values to factorize. + unique : ndarray + The sorted uniique values + + unique_inverse : ndarray + The indicies to reconstruct the original array from the unique array. + Only provided if `return_inverse` is True. - counts : ndarray - Counts corresponding to `values`. + unique_counts : ndarray + The number of times each of the unique values comes up in the originial + array. Only provided if `return_counts` is True. 
""" if values.dtype == object: - return _uniques_python(values, return_counts) + return _unique_python(values, return_inverse=return_inverse, + return_counts=return_counts) else: # numerical - return _uniques_numpy(values, return_counts) + return np.unique(values, return_inverse=return_inverse, + return_counts=return_counts) def _encode_check_unknown(values, uniques, return_mask=False): @@ -244,7 +251,7 @@ def fit(self, y): self : returns an instance of self. """ y = column_or_1d(y, warn=True) - self.classes_ = _uniques(y)["uniques"] + self.classes_ = _unique(y) return self def fit_transform(self, y): @@ -260,9 +267,8 @@ def fit_transform(self, y): y : array-like of shape [n_samples] """ y = column_or_1d(y, warn=True) - result = _uniques(y, encode=True) - self.classes_ = result["uniques"] - return result["encoded"] + self.classes_, encoded = _unique(y, return_inverse=True) + return encoded def transform(self, y): """Transform labels to normalized encoding. @@ -282,7 +288,7 @@ def transform(self, y): if _num_samples(y) == 0: return np.array([]) - return _encode(y, uniques=self.classes_, encode=True)["encoded"] + return _encode(y, uniques=self.classes_) def inverse_transform(self, y): """Transform labels back to original encoding. diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 53db2af607fd9..fd847c2daa39c 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -23,7 +23,7 @@ from sklearn.preprocessing._label import _inverse_binarize_thresholding from sklearn.preprocessing._label import _inverse_binarize_multiclass -from sklearn.preprocessing._label import _encode +from sklearn.preprocessing._label import _unique, _encode from sklearn import datasets @@ -626,39 +626,34 @@ def test_inverse_binarize_multiclass(): np.array(['a', 'b', 'c']))], ids=['int64', 'object', 'str']) def test_encode_util(values, expected): - uniques = _encode(values)['uniques'] + uniques = _unique(values) assert_array_equal(uniques, expected) - result = _encode(values, encode=True) - assert_array_equal(result['uniques'], expected) - assert_array_equal(result['encoded'], np.array([1, 0, 2, 0, 2])) + result, encoded = _unique(values, return_inverse=True) + assert_array_equal(result, expected) + assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) - result = _encode(values, uniques, encode=True) - assert_array_equal(result['uniques'], expected) - assert_array_equal(result['encoded'], np.array([1, 0, 2, 0, 2])) + encoded = _encode(values, uniques) + assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) - result = _encode(values, return_counts=True) - assert_array_equal(result['uniques'], expected) - assert_array_equal(result['counts'], np.array([2, 1, 2])) + result, counts = _unique(values, return_counts=True) + assert_array_equal(result, expected) + assert_array_equal(counts, np.array([2, 1, 2])) - result = _encode(values, encode=True, return_counts=True) - assert_array_equal(result['uniques'], expected) - assert_array_equal(result['counts'], np.array([2, 1, 2])) - assert_array_equal(result['encoded'], np.array([1, 0, 2, 0, 2])) - - result = _encode(values, uniques, return_counts=True) - assert_array_equal(result['uniques'], expected) - assert_array_equal(result['counts'], np.array([2, 1, 2])) + result, encoded, counts = _unique(values, return_inverse=True, + return_counts=True) + assert_array_equal(result, expected) + assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) + assert_array_equal(counts, np.array([2, 
1, 2]))
 
 
 def test_encode_util_uniques_unordered():
-    # Make sure the returned counts are ordered based on the order of uniques
+    # Make sure the returned values are ordered based on the order of uniques
+    # make sure the encoded values match the order of the passed-in uniques
     values = np.array(['b'] * 21 + ['c'] * 5 + ['a'] * 11, dtype=object)
-    result = _encode(values, np.array(['a', 'c', 'b']), return_counts=True)
-
-    assert_array_equal(result['uniques'], np.array(['a', 'c', 'b']))
-    assert_array_equal(result['counts'], [11, 5, 21])
+    result = _encode(values, np.array(['a', 'c', 'b']))
+    assert_array_equal(result, np.array([2] * 21 + [1] * 5 + [0] * 11))
 
 
 def test_encode_check_unknown():
@@ -669,14 +664,14 @@ def test_encode_check_unknown():
     # Default is True, raise error
     with pytest.raises(ValueError,
                        match='y contains previously unseen labels'):
-        _encode(values, uniques, encode=True, check_unknown=True)
+        _encode(values, uniques, check_unknown=True)
 
     # dont raise error if False
-    _encode(values, uniques, encode=True, check_unknown=False)
+    _encode(values, uniques, check_unknown=False)
 
     # parameter is ignored for object dtype
     uniques = np.array(['a', 'b', 'c'], dtype=object)
     values = np.array(['a', 'b', 'c', 'd'], dtype=object)
     with pytest.raises(ValueError,
                        match='y contains previously unseen labels'):
-        _encode(values, uniques, encode=True, check_unknown=False)
+        _encode(values, uniques, check_unknown=False)
From 9a40eb79178cabf5814e5ab927488949dbcff21f Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Tue, 14 Apr 2020 13:11:02 -0400
Subject: [PATCH 23/92] STY Fix

---
 sklearn/preprocessing/_label.py              | 2 +-
 sklearn/preprocessing/tests/test_encoders.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py
index abfe129847bb2..42db04636b43a 100644
--- a/sklearn/preprocessing/_label.py
+++ b/sklearn/preprocessing/_label.py
@@ -94,7 +94,7 @@ def _unique_python(values, return_inverse, return_counts):
 
     if return_inverse:
         table = {val: i for i, val in enumerate(uniques)}
-        inverse = np.array([table[v] for v in values], dtype=values.dtype)
+        inverse = np.array([table[v] for v in values])
         ret += (inverse, )
 
     if return_counts:
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 7949b09918a36..01dea52bfa544 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -704,6 +704,7 @@ def test_categories(density, drop):
 def test_encoders_has_categorical_tags(Encoder):
     assert 'categorical' in Encoder()._get_tags()['X_types']
 
+
 @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
 def test_encoders_does_not_support_none_values(Encoder):
     values = [["a"], [None]]
@@ -711,6 +712,7 @@ def test_encoders_does_not_support_none_values(Encoder):
                          "uniformly strings or numbers."):
         Encoder().fit(values)
 
+
 def test_ohe_infrequent_infrequent_is_a_cat():
     # category with 'infrequent' is a frequent category, ohe will name mangle
     # this into 'infrequent_sklearn'
From eb8b501abb757af99751f1ff7a7e3a4e69813421 Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Tue, 14 Apr 2020 15:01:54 -0400
Subject: [PATCH 24/92] ENH Use function call instead of property

---
 sklearn/preprocessing/_encoders.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 09d37e694e9e3..ce9f6a1209c8b 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ 
-422,7 +422,7 @@ def _validate_keywords(self): "zero.") # validates infrequent category features - if self.drop is not None and self._infrequent_enabled: + if self.drop is not None and self._infrequent_enabled(): raise ValueError("infrequent categories are not supported when " "drop is specified") @@ -431,7 +431,7 @@ def _validate_keywords(self): warnings.warn("handle_unknown='ignore' is deprecated in favor " "of 'auto' in version 0.23 and will be removed in " "version 0.25", FutureWarning) - if self._infrequent_enabled: + if self._infrequent_enabled(): raise ValueError("infrequent categories are only supported " "when handle_unknown is 'error' or 'auto'") @@ -495,7 +495,6 @@ def _compute_drop_idx(self): zip(self.drop, self.categories_)], dtype=np.object) - @property def _infrequent_enabled(self): """Infrequent category is enabled.""" return (self.max_categories is not None and self.max_categories > 1 or @@ -607,7 +606,7 @@ def _map_to_infrequent_categories(self, X_int): X_int: ndarray of shape (n_samples, n_features) integer encoded categories """ - if not self._infrequent_enabled: + if not self._infrequent_enabled(): return for i, mapping in enumerate(self._default_to_infrequent_mappings): @@ -673,7 +672,7 @@ def _compute_transformed_categories(self, i): return np.delete(cats, self.drop_idx_[i]) # drop is None - if not self._infrequent_enabled: + if not self._infrequent_enabled(): return cats # infrequent is enabled @@ -704,7 +703,7 @@ def _get_n_transformed_features(self): # drop is None output = [len(cats) for cats in self.categories_] - if not self._infrequent_enabled: + if not self._infrequent_enabled(): return output # infrequent is enabled @@ -740,7 +739,7 @@ def fit(self, X, y=None): self._validate_keywords() process_counts = (self._fit_infrequent_category_mapping - if self._infrequent_enabled else None) + if self._infrequent_enabled() else None) self._fit(X, handle_unknown=self.handle_unknown, process_counts=process_counts) self.drop_idx_ = self._compute_drop_idx() @@ -786,7 +785,7 @@ def transform(self, X): check_is_fitted(self) # validation of X happens in _check_X called by _transform transform_kws = {"handle_unknown": self.handle_unknown} - if self._infrequent_enabled: + if self._infrequent_enabled(): transform_kws.update({ "process_valid_mask": self._process_valid_mask, "get_default_invalid_category": @@ -877,7 +876,7 @@ def inverse_transform(self, X): j = 0 found_unknown = {} - if self._infrequent_enabled: + if self._infrequent_enabled(): infrequent_indices = self.infrequent_indices_ else: infrequent_indices = [None] * n_features From 86607042f499679616ee8d2d94fd20a2e24934e0 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 13 May 2020 10:20:07 -0400 Subject: [PATCH 25/92] ENH Adds counts feature --- sklearn/utils/_encode.py | 29 +++++++++++++++++++++++------ sklearn/utils/tests/test_encode.py | 15 +++++++++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 56c701979a481..a28638cb2dd80 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -1,7 +1,8 @@ +from collections import Counter import numpy as np -def _unique(values, *, return_inverse=False): +def _unique(values, *, return_inverse=False, return_counts=False): """Helper function to find unique values with support for python objects. 
Uses pure python method for object dtype, and numpy method for @@ -23,14 +24,20 @@ def _unique(values, *, return_inverse=False): unique_inverse : ndarray The indices to reconstruct the original array from the unique array. Only provided if `return_inverse` is True. + + unique_counts : ndarray + The number of times each of the unique values comes up in the originial + array. Only provided if `return_counts` is True. """ if values.dtype == object: - return _unique_python(values, return_inverse=return_inverse) + return _unique_python(values, return_inverse=return_inverse, + return_counts=return_counts) # numerical - return np.unique(values, return_inverse=return_inverse) + return np.unique(values, return_inverse=return_inverse, + return_counts=return_counts) -def _unique_python(values, *, return_inverse): +def _unique_python(values, *, return_inverse, return_counts): # Only used in `_uniques`, see docstring there for details try: uniques = sorted(set(values)) @@ -40,13 +47,23 @@ def _unique_python(values, *, return_inverse): for t in set(type(v) for v in values)) raise TypeError("Encoders require their input to be uniformly " f"strings or numbers. Got {types}") + ret = (uniques, ) if return_inverse: table = {val: i for i, val in enumerate(uniques)} inverse = np.array([table[v] for v in values]) - return uniques, inverse + ret += (inverse, ) + + if return_counts: + uniques_dict = Counter(values) + counts = np.array([uniques_dict[item] for item in uniques], + dtype=int) + ret += (counts, ) + + if len(ret) == 1: + ret = ret[0] - return uniques + return ret def _encode(values, *, uniques, check_unknown=True): diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index 9371fa6e88e3e..e5e40a1ad2663 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -19,9 +19,24 @@ def test_encode_util(values, expected): uniques = _unique(values) assert_array_equal(uniques, expected) + + result, encoded = _unique(values, return_inverse=True) + assert_array_equal(result, expected) + assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) + encoded = _encode(values, uniques=uniques) assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) + result, counts = _unique(values, return_counts=True) + assert_array_equal(result, expected) + assert_array_equal(counts, np.array([2, 1, 2])) + + result, encoded, counts = _unique(values, return_inverse=True, + return_counts=True) + assert_array_equal(result, expected) + assert_array_equal(encoded, np.array([1, 0, 2, 0, 2])) + assert_array_equal(counts, np.array([2, 1, 2])) + def test_encode_with_check_unknown(): # test for the check_unknown parameter of _encode() From b8a883f65544daf7ade6ba9472214d4f4fba6794 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 13 May 2020 10:28:48 -0400 Subject: [PATCH 26/92] CLN Rename variables --- sklearn/preprocessing/_encoders.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b7f5c20b43ef5..1ecb70c2cb9a5 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -105,7 +105,7 @@ def _fit(self, X, handle_unknown='error', process_counts=None): self.categories_ = [] return_counts = process_counts is not None - category_counts = [] if return_counts else None + category_counts = [] for i in range(n_features): Xi = X_list[i] @@ -138,8 +138,8 @@ def _fit(self, X, handle_unknown='error', process_counts=None): 
process_counts(category_counts, n_samples) def _transform(self, X, handle_unknown='error', - process_valid_mask=None, - get_default_invalid_category=None): + transform_valid_mask=None, + get_invalid_category=None): X_list, n_samples, n_features = self._check_X(X) X_int = np.zeros((n_samples, n_features), dtype=np.int) @@ -174,15 +174,15 @@ def _transform(self, X, handle_unknown='error', else: Xi = Xi.copy() - if get_default_invalid_category is not None: - invalid_index = get_default_invalid_category(i) + if get_invalid_category is not None: + invalid_index = get_invalid_category(i) else: invalid_index = 0 Xi[~valid_mask] = self.categories_[i][invalid_index] - if process_valid_mask is not None: - valid_mask = process_valid_mask(valid_mask, i) + if transform_valid_mask is not None: + valid_mask = transform_valid_mask(valid_mask, i) X_mask[:, i] = valid_mask @@ -619,7 +619,7 @@ def _map_to_infrequent_categories(self, X_int): continue X_int[:, i] = np.take(mapping, X_int[:, i]) - def _get_default_invalid_category(self, col_idx): + def _get_invalid_category(self, col_idx): """Get default invalid category for column index during `_transform`. This function is pasesd to `_transform` to set the invalid categories. @@ -627,7 +627,7 @@ def _get_default_invalid_category(self, col_idx): infrequent_idx = self.infrequent_indices_[col_idx] return 0 if infrequent_idx is None else infrequent_idx[0] - def _process_valid_mask(self, valid_mask, col_idx): + def _transform_valid_mask(self, valid_mask, col_idx): """Process the valid mask during `_transform` This function is passed to `_transform` to adjust the mask depending @@ -792,9 +792,8 @@ def transform(self, X): transform_kws = {"handle_unknown": self.handle_unknown} if self._infrequent_enabled(): transform_kws.update({ - "process_valid_mask": self._process_valid_mask, - "get_default_invalid_category": - self._get_default_invalid_category + "transform_valid_mask": self._transform_valid_mask, + "get_invalid_category": self._get_invalid_category }) X_int, X_mask = self._transform(X, **transform_kws) From 29005b1c1e2d166d96355c752e940c47c8ed1783 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 13 May 2020 10:59:15 -0400 Subject: [PATCH 27/92] DOC More details --- doc/whats_new/v0.24.rst | 9 +++++++++ sklearn/preprocessing/_encoders.py | 25 +++++++++++++------------ sklearn/preprocessing/_label.py | 1 - 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 8661f8138f003..6f41d72cba6de 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -65,7 +65,16 @@ Changelog samples between the train and test set on each fold. :pr:`13204` by :user:`Kyle Kosic `. +:mod:`sklearn.preprocessing` +............................ +- |MajorFeature| :class:`preprocessing.OneHotEncoder` now supports grouping + infrequent categories into a single infrequent category. This feature is + enabled by setting `handle_unknown='auto'` and specifying how to select + infrequent categories with `min_frequency` or `max_categories`. + :pr:`16018` by `Thomas Fan`_. + + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1ecb70c2cb9a5..77caa23959f0d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -281,10 +281,10 @@ class OneHotEncoder(_BaseEncoder): When this parameter is set to 'auto' and an unknown category is encountered in transform: - 1. 
If there was no infrequent category during training, the - resulting one-hot encoded columns for this feature will be all - zeros. In the inverse transform, an unknown category will be - denoted as `None`. + 1. If infrequent category support was not configured or there were + no infrequent category during training, the resulting one-hot + encoded columns for this feature will be all zeros. In the inverse + transform, an unknown category will be denoted as `None`. 2. If there is an infrequent category during training, the unknown category will be considered infrequent. In the inverse transform, @@ -292,13 +292,13 @@ class OneHotEncoder(_BaseEncoder): 'infrequent' is already a category, 'infrequent_sklearn' will be used instead. - .. versionadded:: 0.23 + .. versionadded:: 0.24 `'auto'` was added to automatically handle unknown categories and infrequent categories. - .. deprecated:: 0.23 + .. deprecated:: 0.24 `'ignore'` is deprecated in favor of `'auto'`. This option will be - removed in 0.25. + removed in 0.26. min_frequency : int or float, default=1 Specifies the categories to be considered infrequent. @@ -309,15 +309,16 @@ class OneHotEncoder(_BaseEncoder): 2. If float, categories with a smaller cardinality than `min_frequency * n_samples` will be considered infrequent. - .. versionadded:: 0.23 + .. versionadded:: 0.24 max_categories : int, default=None Specifies an upper limit to the number of output features for each - input feature when considering infrequent categories. `max_categories` - includes the feature that combines infrequent categories. If `None` - there is no limit to the number of output features. + input feature when considering infrequent categories. Note that + `max_categories` includes the category representing the infrequent + categories along with the frequent categories. If `None`, there is no + limit to the number of output features. - .. versionadded:: 0.23 + .. versionadded:: 0.24 Attributes ---------- diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 58f32c4633d2f..43ab31d5782ec 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -7,7 +7,6 @@ # License: BSD 3 clause from collections import defaultdict -from collections import Counter import itertools import array import warnings From 03c8d4d85d1dd0f862405508fd05a32cf98f1728 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 13 May 2020 12:05:29 -0400 Subject: [PATCH 28/92] CLN Remove unneeded line --- sklearn/preprocessing/_encoders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 77caa23959f0d..2451c5c24a714 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -601,7 +601,6 @@ def _fit_infrequent_category_mapping(self, category_counts, n_samples): default_to_infrequent_mappings.append(mapping) self._default_to_infrequent_mappings = default_to_infrequent_mappings - # self._largest_infreq_indices = largest_infreq_idxs def _map_to_infrequent_categories(self, X_int): """Map categories to infrequent categories. 
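A minimal end-to-end sketch of the infrequent-category behaviour documented in
the patches above (assuming a build of this branch is installed; the expected
values in the comments are illustrative and follow the rule described in the
docstring, with frequent categories keeping their sorted positions and a single
trailing column collecting the infrequent ones):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # 'a' appears fewer than min_frequency=6 times, so it is grouped into the
    # trailing "infrequent" column; 'b' and 'c' keep their own columns.
    X = np.array([['a'] * 5 + ['b'] * 30 + ['c'] * 10], dtype=object).T

    enc = OneHotEncoder(handle_unknown='auto', min_frequency=6, sparse=False)
    enc.fit(X)

    print(enc.infrequent_indices_)  # expected: [array([0])], i.e. 'a' is infrequent
    print(enc.transform([['b']]))   # expected: [[1. 0. 0.]] (columns: b, c, infrequent)
    print(enc.transform([['a']]))   # 'a' maps to the trailing infrequent column
    print(enc.transform([['z']]))   # unknown categories are treated as infrequent too
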
From f669c5498a3718dd71f0034221cb434068bc8a1d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 14 May 2020 21:27:13 -0400 Subject: [PATCH 29/92] CLN Less lines is less complicated --- sklearn/preprocessing/_encoders.py | 82 +++++++----------------------- 1 file changed, 17 insertions(+), 65 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 2451c5c24a714..e18e180441f7b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -137,9 +137,7 @@ def _fit(self, X, handle_unknown='error', process_counts=None): if return_counts: process_counts(category_counts, n_samples) - def _transform(self, X, handle_unknown='error', - transform_valid_mask=None, - get_invalid_category=None): + def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) X_int = np.zeros((n_samples, n_features), dtype=np.int) @@ -174,16 +172,7 @@ def _transform(self, X, handle_unknown='error', else: Xi = Xi.copy() - if get_invalid_category is not None: - invalid_index = get_invalid_category(i) - else: - invalid_index = 0 - - Xi[~valid_mask] = self.categories_[i][invalid_index] - - if transform_valid_mask is not None: - valid_mask = transform_valid_mask(valid_mask, i) - + Xi[~valid_mask] = self.categories_[i][0] X_mask[:, i] = valid_mask # We use check_unknown=False, since _encode_check_unknown was @@ -602,7 +591,7 @@ def _fit_infrequent_category_mapping(self, category_counts, n_samples): self._default_to_infrequent_mappings = default_to_infrequent_mappings - def _map_to_infrequent_categories(self, X_int): + def _map_to_infrequent_categories(self, X_int, X_mask): """Map categories to infrequent categories. This modifies X_int in-place. @@ -614,53 +603,23 @@ def _map_to_infrequent_categories(self, X_int): if not self._infrequent_enabled(): return + n_features = X_int.shape[1] + for col_idx in range(n_features): + infrequent_idx = self.infrequent_indices_[col_idx] + if infrequent_idx is None: + continue + + X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0] + if self.handle_unknown == 'auto': + # unknown values will be mapped to infrequent in the next for + # loop + X_mask[:, col_idx] = True + for i, mapping in enumerate(self._default_to_infrequent_mappings): if mapping is None: continue X_int[:, i] = np.take(mapping, X_int[:, i]) - def _get_invalid_category(self, col_idx): - """Get default invalid category for column index during `_transform`. - - This function is pasesd to `_transform` to set the invalid categories. - """ - infrequent_idx = self.infrequent_indices_[col_idx] - return 0 if infrequent_idx is None else infrequent_idx[0] - - def _transform_valid_mask(self, valid_mask, col_idx): - """Process the valid mask during `_transform` - - This function is passed to `_transform` to adjust the mask depending - on if the infrequent column exists or not. - - Parameters - ---------- - valid_mask : array of shape (n_samples, ) - boolean mask representing if a sample was seen during training - - col_idx : int - column index - - Returns - ------- - valid_mask : array of shape (n_samples,) or None - boolean mask to use for constructing X_mask in `_transform`. 
- """ - if self.handle_unknown != 'auto': - return valid_mask - - # handle_unknown == 'auto' - infrequent_idx = self.infrequent_indices_[col_idx] - - # infrequent column does not exists - # returning the original mask to allow the column to be ignored - if infrequent_idx is None: - return valid_mask - - # infrequent column exists - # the unknown categories will be mapped to the infrequent category - return np.ones_like(valid_mask, dtype=bool) - def _compute_transformed_categories(self, i): """Compute the transformed categories used for column `i`. @@ -789,15 +748,8 @@ def transform(self, X): """ check_is_fitted(self) # validation of X happens in _check_X called by _transform - transform_kws = {"handle_unknown": self.handle_unknown} - if self._infrequent_enabled(): - transform_kws.update({ - "transform_valid_mask": self._transform_valid_mask, - "get_invalid_category": self._get_invalid_category - }) - - X_int, X_mask = self._transform(X, **transform_kws) - self._map_to_infrequent_categories(X_int) + X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + self._map_to_infrequent_categories(X_int, X_mask) n_samples, n_features = X_int.shape From ffe29767cbca60df66a60cad8e2a10841d00eadc Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 14 May 2020 22:32:06 -0400 Subject: [PATCH 30/92] CLN Less diffs --- sklearn/preprocessing/_encoders.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e18e180441f7b..3eaa988e5a913 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -155,6 +155,7 @@ def _transform(self, X, handle_unknown='error'): Xi = X_list[i] diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True) + if not np.all(valid_mask): if handle_unknown == 'error': msg = ("Found unknown categories {0} in column {1}" @@ -164,6 +165,7 @@ def _transform(self, X, handle_unknown='error'): # Set the problematic rows to an acceptable value and # continue `The rows are marked `X_mask` and will be # removed later. + X_mask[:, i] = valid_mask # cast Xi into the largest string type necessary # to handle different lengths of numpy strings if (self.categories_[i].dtype.kind in ('U', 'S') @@ -173,13 +175,10 @@ def _transform(self, X, handle_unknown='error'): Xi = Xi.copy() Xi[~valid_mask] = self.categories_[i][0] - X_mask[:, i] = valid_mask - - # We use check_unknown=False, since _encode_check_unknown was + # We use check_unknown=False, since _check_unknown was # already called above. 
- encoded = _encode(Xi, uniques=self.categories_[i], - check_unknown=False) - X_int[:, i] = encoded + X_int[:, i] = _encode(Xi, uniques=self.categories_[i], + check_unknown=False) return X_int, X_mask From 8979f0b12970a842bb8beec9016984865d4371e2 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sat, 16 May 2020 17:03:57 -0400 Subject: [PATCH 31/92] CLN Improves readiabilty --- sklearn/preprocessing/_encoders.py | 58 +++++++++++++----------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3eaa988e5a913..f8685eae7a825 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -94,7 +94,7 @@ def _get_feature(self, X, feature_idx): # numpy arrays, sparse arrays return X[:, feature_idx] - def _fit(self, X, handle_unknown='error', process_counts=None): + def _fit(self, X, handle_unknown='error', return_counts=False): X_list, n_samples, n_features = self._check_X(X) if self.categories != 'auto': @@ -103,8 +103,6 @@ def _fit(self, X, handle_unknown='error', process_counts=None): " it has to be of shape (n_features,).") self.categories_ = [] - - return_counts = process_counts is not None category_counts = [] for i in range(n_features): @@ -134,8 +132,8 @@ def _fit(self, X, handle_unknown='error', process_counts=None): self.categories_.append(cats) - if return_counts: - process_counts(category_counts, n_samples) + return {'category_counts': category_counts, + 'n_samples': n_samples} def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) @@ -522,14 +520,10 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): infrequent_mask = category_count == 0 if isinstance(self.min_frequency, numbers.Integral): - if self.min_frequency > 1: - category_mask = category_count < self.min_frequency - infrequent_mask |= category_mask + infrequent_mask |= category_count < self.min_frequency else: # float - if 0.0 < self.min_frequency < 1.0: - min_frequency_abs = n_samples * self.min_frequency - category_mask = category_count < min_frequency_abs - infrequent_mask |= category_mask + min_frequency_abs = n_samples * self.min_frequency + infrequent_mask |= category_count < min_frequency_abs if (self.max_categories is not None and self.max_categories > 1 and self.max_categories < category_count.size): @@ -539,13 +533,12 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): infrequent_mask[smallest_levels] = True output = np.flatnonzero(infrequent_mask) - if output.size == category_count.size: raise ValueError("All categories in column {} are infrequent" .format(col_idx)) return output if output.size > 0 else None - def _fit_infrequent_category_mapping(self, category_counts, n_samples): + def _fit_infrequent_category_mapping(self, fit_results): """Fit infrequent categories. 
Defines: @@ -555,12 +548,12 @@ def _fit_infrequent_category_mapping(self, category_counts, n_samples): Parameters ---------- - category_counts : list of ndarrays - list of category counts - - n_samples : int - number of samples + fit_results : dict + return values from `super()._fit()` """ + n_samples = fit_results["n_samples"] + category_counts = fit_results["category_counts"] + self.infrequent_indices_ = [ self._identify_infrequent(category_count, n_samples, col_idx) for col_idx, category_count in enumerate(category_counts)] @@ -568,18 +561,19 @@ def _fit_infrequent_category_mapping(self, category_counts, n_samples): # compute mapping from default mapping to infrequent mapping default_to_infrequent_mappings = [] - for category_count, infreq_idx in zip(category_counts, - self.infrequent_indices_): + for cats, infreq_idx in zip(self.categories_, + self.infrequent_indices_): # no infrequent categories if infreq_idx is None: default_to_infrequent_mappings.append(None) continue + n_cats = len(cats) # infrequent indicies exist - mapping = np.empty_like(category_count, dtype=np.int) - n_cats = mapping.size + mapping = np.empty(n_cats, dtype=int) n_infrequent_cats = infreq_idx.size + # infrequent categories are apped to the last element. n_frequent_cats = n_cats - n_infrequent_cats mapping[infreq_idx] = n_frequent_cats @@ -652,6 +646,11 @@ def _compute_transformed_categories(self, i): return np.r_[cats[frequent_indices], np.array([infrequent_cat], dtype=object)] + def _get_transformed_categories(self): + """Transformed categories.""" + return [self._compute_transformed_categories(i) + for i in range(len(self.categories_))] + def _get_n_transformed_features(self): """Number of transformed features.""" if self.drop_idx_ is not None: @@ -677,11 +676,6 @@ def _get_n_transformed_features(self): return output - def _get_transformed_categories(self): - """Transformed categories.""" - return [self._compute_transformed_categories(i) - for i in range(len(self.categories_))] - def fit(self, X, y=None): """ Fit OneHotEncoder to X. @@ -700,11 +694,9 @@ def fit(self, X, y=None): self """ self._validate_keywords() - - process_counts = (self._fit_infrequent_category_mapping - if self._infrequent_enabled() else None) - self._fit(X, handle_unknown=self.handle_unknown, - process_counts=process_counts) + fit_results = self._fit(X, handle_unknown=self.handle_unknown, + return_counts=self._infrequent_enabled()) + self._fit_infrequent_category_mapping(fit_results) self.drop_idx_ = self._compute_drop_idx() return self From 41a29b0c077491af8afda95fc813534c969561a4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 20 May 2020 14:24:29 -0400 Subject: [PATCH 32/92] BUG Fix --- sklearn/preprocessing/_encoders.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f8685eae7a825..e8e4bb6805b8e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -28,19 +28,18 @@ def _get_counts(values, uniques): For object dtypes, the counts returned will use the order passed in by `uniques`. - For numerica dtypes, `uniques` is assumed to be ordered, such that it can - be used with `np.searchsorted`. 
""" if values.dtype == object: uniques_dict = Counter(values) counts = np.array([uniques_dict[item] for item in uniques], - dtype=np.int) + dtype=int) return counts # numerical uniq_values, counts = np.unique(values, return_counts=True) - indices_in_uniq = np.searchsorted(uniq_values, uniques) - return counts[indices_in_uniq] + indices_in_uniq = np.isin(uniq_values, uniques, assume_unique=True) + counts[~indices_in_uniq] = 0 + return counts class _BaseEncoder(TransformerMixin, BaseEstimator): From a1cff1f3899a0db59c8828940f081e3931bf9bf8 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 26 May 2020 15:37:44 -0400 Subject: [PATCH 33/92] CLN Address comments --- sklearn/preprocessing/_encoders.py | 42 +++++++++++--------- sklearn/preprocessing/tests/test_encoders.py | 8 ++-- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e8e4bb6805b8e..577db25c5e893 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -420,8 +420,8 @@ def _validate_keywords(self): # TODO: Remove when handle_unknown='ignore' is deprecated if self.handle_unknown == 'ignore': warnings.warn("handle_unknown='ignore' is deprecated in favor " - "of 'auto' in version 0.23 and will be removed in " - "version 0.25", FutureWarning) + "of 'auto' in version 0.24 and will be removed in " + "version 0.26", FutureWarning) if self._infrequent_enabled(): raise ValueError("infrequent categories are only supported " "when handle_unknown is 'error' or 'auto'") @@ -433,12 +433,12 @@ def _validate_keywords(self): if not self.min_frequency >= 1: raise ValueError("min_frequency must be an integer at least " "1 or a float in (0.0, 1.0); got the " - "integer {}".format(self.min_frequency)) + f"integer {self.min_frequency}") else: # float if not 0.0 < self.min_frequency < 1.0: raise ValueError("min_frequency must be an integer at least " "1 or a float in (0.0, 1.0); got the " - "float {}".format(self.min_frequency)) + f"float {self.min_frequency}") def _compute_drop_idx(self): if self.drop is None: @@ -488,11 +488,15 @@ def _compute_drop_idx(self): def _infrequent_enabled(self): """Infrequent category is enabled.""" - return (self.max_categories is not None and self.max_categories > 1 or - (isinstance(self.min_frequency, numbers.Integral) - and self.min_frequency > 1) or - (isinstance(self.min_frequency, numbers.Real) - and 0.0 < self.min_frequency < 1.0)) + if self.max_categories is not None and self.max_categories > 1: + return True + if (isinstance(self.min_frequency, numbers.Integral) + and self.min_frequency > 1): + return True + if (isinstance(self.min_frequency, numbers.Real) + and 0.0 < self.min_frequency < 1.0): + return True + return False def _identify_infrequent(self, category_count, n_samples, col_idx): """Compute the infrequent indicies based on max_categories and @@ -501,13 +505,13 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): Parameters ---------- category_count : ndarray of shape (n_cardinality,) - category counts + Category counts. n_samples : int - number of samples + Number of samples. col_idx : int - index of current category only used for the error message + Index of the current category. Only used for the error message. 
Returns ------- @@ -527,14 +531,15 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): if (self.max_categories is not None and self.max_categories > 1 and self.max_categories < category_count.size): # stable sort to preserve original count order - smallest_levels = np.argsort(category_count, kind='mergesort' - )[:-self.max_categories + 1] + smallest_levels = np.argsort( + category_count, kind='mergesort' + )[:-self.max_categories + 1] infrequent_mask[smallest_levels] = True output = np.flatnonzero(infrequent_mask) if output.size == category_count.size: - raise ValueError("All categories in column {} are infrequent" - .format(col_idx)) + raise ValueError(f"All categories in column {col_idx} are " + "infrequent") return output if output.size > 0 else None def _fit_infrequent_category_mapping(self, fit_results): @@ -555,7 +560,8 @@ def _fit_infrequent_category_mapping(self, fit_results): self.infrequent_indices_ = [ self._identify_infrequent(category_count, n_samples, col_idx) - for col_idx, category_count in enumerate(category_counts)] + for col_idx, category_count in enumerate(category_counts) + ] # compute mapping from default mapping to infrequent mapping default_to_infrequent_mappings = [] @@ -590,7 +596,7 @@ def _map_to_infrequent_categories(self, X_int, X_mask): Parameters ---------- X_int: ndarray of shape (n_samples, n_features) - integer encoded categories + Integer encoded categories. """ if not self._infrequent_enabled(): return diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 01dea52bfa544..621a4c53245b5 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -87,7 +87,7 @@ def test_one_hot_encoder_not_fitted(): enc.transform(X) -# TODO: Remove when 'ignore' is deprecated in 0.25 +# TODO: Remove when 'ignore' is deprecated in 0.26 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) def test_one_hot_encoder_handle_unknown_strings(handle_unknown): @@ -220,7 +220,7 @@ def test_one_hot_encoder(X): assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) -# TODO: Remove when 'ignore' is deprecated in 0.25 +# TODO: Remove when 'ignore' is deprecated in 0.26 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) @pytest.mark.parametrize('sparse_', [False, True]) @@ -338,7 +338,7 @@ def test_one_hot_encoder_categories(X, cat_exp, cat_dtype): assert np.issubdtype(res.dtype, cat_dtype) -# TODO: Remove when 'ignore' is deprecated in 0.25 +# TODO: Remove when 'ignore' is deprecated in 0.26 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) @pytest.mark.parametrize("X, X2, cats, cat_dtype", [ @@ -462,7 +462,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): assert_allclose(result, expected) -# TODO: Remove when 'ignore' is deprecated in 0.25 +# TODO: Remove when 'ignore' is deprecated in 0.26 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, np.array([['a', np.nan]], dtype=object).T], From e2224528e435f76be08178d9d1179032b0ea37cd Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 27 May 2020 14:02:32 -0400 Subject: [PATCH 34/92] TST Fix --- sklearn/preprocessing/tests/test_encoders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 621a4c53245b5..7d9806e5a9fd9 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -47,7 +47,7 @@ def test_one_hot_encoder_diff_n_features(): enc.transform(X2) -# TODO: Remove when 'ignore' is deprecated in 0.25 +# TODO: Remove when 'ignore' is deprecated in 0.26 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) def test_one_hot_encoder_handle_unknown(handle_unknown): @@ -1091,7 +1091,7 @@ def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): ohe.fit(X_train) -# TODO: Remove when 'ignore' is deprecated in 0.25 +# TODO: Remove when 'ignore' is deprecated in 0.26 @pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("kwargs, error_msg", [ ({'max_categories': 1}, 'max_categories must be greater than 1'), @@ -1114,12 +1114,12 @@ def test_ohe_infrequent_invalid_parameters_error(kwargs, error_msg): ohe.fit(X_train) -# TODO: Remove in 0.25 when 'ignore' is deprecated +# TODO: Remove in 0.26 when 'ignore' is deprecated def test_ohe_ignore_deprecated(): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T ohe = OneHotEncoder(handle_unknown='ignore') msg = (r"handle_unknown='ignore' is deprecated in favor of 'auto' in " - r"version 0\.23 and will be removed in version 0\.25") + r"version 0\.24 and will be removed in version 0\.26") with pytest.warns(FutureWarning, match=msg): ohe.fit(X_train) From db96b447230e3dfb1fb16817ed115fbf86ebb69a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 17 Jun 2020 12:30:34 -0400 Subject: [PATCH 35/92] CLN Address comments --- doc/modules/preprocessing.rst | 15 +++--- doc/whats_new/v0.24.rst | 6 +-- sklearn/preprocessing/_encoders.py | 74 +++++++++++++++--------------- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index bcbed37eaa229..9ec84f3f8f5ac 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -598,14 +598,13 @@ represented as a dict, not as scalars. Infrequent categories --------------------- -:class:`OneHotEncoder` supports outputing a feature that combines infrequent -categories in the training data. For each input feature that has an infrequent -category a new column is formed to represent it. The parameters to enable the -gathering of infrequent categories are `min_frequency` and `max_categories`. +:class:`OneHotEncoder` supports aggregating infrequent categories into a single +output. The parameters to enable the gathering of infrequent categories are +`min_frequency` and `max_categories`. 1. `min_frequency` is either an integer greater or equal to 1, or a float in the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a -cardinality smaller than `min_frequency * n_samples` will be considered +cardinality smaller than `min_frequency` will be considered infrequent. If `min_frequency` is a float, categories with a cardinality smaller than this fraction of the total number of samples will be considered infrequent. @@ -617,7 +616,7 @@ categories. 
In the following example, the categories, `'dog', 'snake'` are considered infrequent:: - >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + + >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + ... ['snake'] * 3]).T >>> enc = preprocessing.OneHotEncoder(min_frequency=6, ... handle_unknown='auto').fit(X) @@ -641,7 +640,9 @@ feature name:: Infrequent categories can be filtered out using `min_frequency` and `max_categories`. In the following example, we set `max_categories=2` to -limit the number of features in the output:: +limit the number of features in the output. This will result in all but +the `'cat'` category to be considered infrequent, leading to two features, +one for `'cat'` and one for infrequent categories - which are all the others:: >>> enc = preprocessing.OneHotEncoder(min_frequency=6, max_categories=2, ... handle_unknown='auto').fit(X) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index f69f6eb3a01c3..6ffd864c92fec 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -142,9 +142,9 @@ Changelog ............................ - |MajorFeature| :class:`preprocessing.OneHotEncoder` now supports grouping - infrequent categories into a single infrequent category. This feature is - enabled by setting `handle_unknown='auto'` and specifying how to select - infrequent categories with `min_frequency` or `max_categories`. + infrequent categories ito a single feature. Infrequent categories is + enabled by setting `handle_unknown` to `'auto'` or `'error'` and specifying + how to select infrequent categories with `min_frequency` or `max_categories`. :pr:`16018` by `Thomas Fan`_. :mod:`sklearn.tree` diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8294a4f579364..761c5c953e4f7 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -24,22 +24,25 @@ def _get_counts(values, uniques): - """Get the number of times each of the values comes up `values` + """Get the count of each of the `uniques` in `values`. The counts will use + the order passed in by `uniques`. - For object dtypes, the counts returned will use the order passed in by - `uniques`. + For non-object dtypes, `uniques` is assumed to be sorted. 
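A quick sketch of the contract described above, using `_get_counts` as defined in this hunk; the inputs are made up, and the expected outputs mirror the `_get_counts` test cases added later in this series::

    import numpy as np

    # counts follow the order of `uniques`, with 0 for categories absent from `values`
    _get_counts(np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3, 5]))
    # array([10,  4, 15,  0])

    # for object dtype the requested order is preserved as well
    _get_counts(np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object),
                ['c', 'b', 'a'])
    # array([20,  4, 16])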
""" - if values.dtype == object: - uniques_dict = Counter(values) - counts = np.array([uniques_dict[item] for item in uniques], + if values.dtype.kind in 'UO': + counter = Counter(values) + counts = np.array([counter[item] for item in uniques], dtype=int) return counts - # numerical - uniq_values, counts = np.unique(values, return_counts=True) - indices_in_uniq = np.isin(uniq_values, uniques, assume_unique=True) - counts[~indices_in_uniq] = 0 - return counts + unique_values, counts = np.unique(values, return_counts=True) + uniques_in_values = np.isin(uniques, unique_values, assume_unique=True) + unique_valid_indices = np.searchsorted(unique_values, + uniques[uniques_in_values]) + + output = np.zeros_like(uniques) + output[uniques_in_values] = counts[unique_valid_indices] + return output class _BaseEncoder(TransformerMixin, BaseEstimator): @@ -131,8 +134,10 @@ def _fit(self, X, handle_unknown='error', return_counts=False): self.categories_.append(cats) - return {'category_counts': category_counts, - 'n_samples': n_samples} + output = {'n_samples': n_samples} + if return_counts: + output['category_counts'] = category_counts + return output def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) @@ -236,10 +241,6 @@ class OneHotEncoder(_BaseEncoder): representation and can therefore induce a bias in downstream models, for instance for penalized linear classification or regression models. - However, dropping one category breaks the symmetry of the original - representation and can therefore induce a bias in downstream models, - for instance for penalized linear classification or regression models. - - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. @@ -286,7 +287,8 @@ class OneHotEncoder(_BaseEncoder): removed in 0.26. min_frequency : int or float, default=1 - Specifies the categories to be considered infrequent. + Specifies the minimum frequency for a category not to be considered + infrequent. 1. If int, categories with a smaller cardinality will be considered infrequent. @@ -325,7 +327,7 @@ class OneHotEncoder(_BaseEncoder): infrequent_indices_ : list of shape (n_features,) Defined only when `min_frequency` or `max_categories` is set to a non-default value. `infrequent_indices_[i]` is an array of indices - corresponding to `categories_[i]` of the infrequent categories. + mapping from `categories_[i]` to the infrequent categories. `infrequent_indices_[i]` is None if the ith input feature has no infrequent categories. @@ -499,8 +501,7 @@ def _infrequent_enabled(self): return False def _identify_infrequent(self, category_count, n_samples, col_idx): - """Compute the infrequent indicies based on max_categories and - min_frequency. + """Compute the infrequent indices Parameters ---------- @@ -516,19 +517,16 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): Returns ------- output : ndarray of shape (n_infrequent_categories,) or None - If there are infrequent categories, indicies of infrequent + If there are infrequent categories, indices of infrequent categories. Otherwise None. 
""" - # categories with no count are infrequent - infrequent_mask = category_count == 0 - if isinstance(self.min_frequency, numbers.Integral): - infrequent_mask |= category_count < self.min_frequency + infrequent_mask = category_count < self.min_frequency else: # float min_frequency_abs = n_samples * self.min_frequency - infrequent_mask |= category_count < min_frequency_abs + infrequent_mask = category_count < min_frequency_abs - if (self.max_categories is not None and self.max_categories > 1 + if (self.max_categories is not None and self.max_categories < category_count.size): # stable sort to preserve original count order smallest_levels = np.argsort( @@ -545,16 +543,20 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): def _fit_infrequent_category_mapping(self, fit_results): """Fit infrequent categories. - Defines: - 1. infrequent_indices_ to be the categories that are infrequent. - 2. _default_to_infrequent_mappings to be the mapping from the - default mapping provided by _encode to the infrequent categories + Defines the private attribute: `_default_to_infrequent_mappings`. + For feature `i`, `_default_to_infrequent_mappings[i]` defines the + mapping from the integer encoding from `super().transform()` into + infrequent categories. If `_default_to_infrequent_mappings[i]` is + None, there were no infrequent categories in the training set. Parameters ---------- fit_results : dict return values from `super()._fit()` """ + if not self._infrequent_enabled(): + return + n_samples = fit_results["n_samples"] category_counts = fit_results["category_counts"] @@ -574,11 +576,11 @@ def _fit_infrequent_category_mapping(self, fit_results): continue n_cats = len(cats) - # infrequent indicies exist + # infrequent indices exist mapping = np.empty(n_cats, dtype=int) n_infrequent_cats = infreq_idx.size - # infrequent categories are apped to the last element. + # infrequent categories are mapped to the last element. n_frequent_cats = n_cats - n_infrequent_cats mapping[infreq_idx] = n_frequent_cats @@ -590,8 +592,8 @@ def _fit_infrequent_category_mapping(self, fit_results): self._default_to_infrequent_mappings = default_to_infrequent_mappings def _map_to_infrequent_categories(self, X_int, X_mask): - """Map categories to infrequent categories. - This modifies X_int in-place. + """Map categories to infrequent categories. This modifies X_int + in-place. Parameters ---------- From dc7389490c46240adbd0ac4fa6d2a83877187c29 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 17 Jun 2020 13:09:31 -0400 Subject: [PATCH 36/92] CLN Address comments --- sklearn/preprocessing/_encoders.py | 5 +++-- sklearn/preprocessing/tests/test_encoders.py | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 761c5c953e4f7..2d1402486e1ce 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -526,8 +526,9 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): min_frequency_abs = n_samples * self.min_frequency infrequent_mask = category_count < min_frequency_abs + n_current_features = category_count.size - infrequent_mask.sum() + 1 if (self.max_categories is not None - and self.max_categories < category_count.size): + and self.max_categories < n_current_features): # stable sort to preserve original count order smallest_levels = np.argsort( category_count, kind='mergesort' @@ -537,7 +538,7 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): output = np.flatnonzero(infrequent_mask) if output.size == category_count.size: raise ValueError(f"All categories in column {col_idx} are " - "infrequent") + "infrequent, try decreasing min_frequency") return output if output.size > 0 else None def _fit_infrequent_category_mapping(self, fit_results): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 7d9806e5a9fd9..4bfc00683d318 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -749,8 +749,7 @@ def test_ohe_infrequent_infrequent_is_a_cat(): {'max_categories': 2, 'min_frequency': 6}, {'max_categories': 4, 'min_frequency': 12}, ]) -@pytest.mark.parametrize("categories", - ["auto", [['a', 'b', 'c', 'd']]]) +@pytest.mark.parametrize("categories", ["auto", [['a', 'b', 'c', 'd']]]) def test_ohe_infrequent_two_levels(kwargs, categories): # Test that different parameters for combine 'a', 'c', and 'd' into # the infrequent category works as expected From 1a686b512153d20104b0e5e17a056969a3e9d6bb Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 17 Jun 2020 13:39:32 -0400 Subject: [PATCH 37/92] CLN Move docstring to userguide --- doc/modules/preprocessing.rst | 27 +++++++--- sklearn/preprocessing/_encoders.py | 81 ++++++++++++------------------ 2 files changed, 53 insertions(+), 55 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 9ec84f3f8f5ac..f280888acd12d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -603,15 +603,15 @@ output. The parameters to enable the gathering of infrequent categories are `min_frequency` and `max_categories`. 1. `min_frequency` is either an integer greater or equal to 1, or a float in -the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a -cardinality smaller than `min_frequency` will be considered -infrequent. If `min_frequency` is a float, categories with a cardinality smaller -than this fraction of the total number of samples will be considered infrequent. + the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with + a cardinality smaller than `min_frequency` will be considered infrequent. + If `min_frequency` is a float, categories with a cardinality smaller than + this fraction of the total number of samples will be considered infrequent. 2. `max_categories` is either `None` or any integer greater than 1. 
This -parameter sets an upper limit to the number of output features for each input -feature. `max_categories` includes the feature that combines infrequent -categories. + parameter sets an upper limit to the number of output features for each + input feature. `max_categories` includes the feature that combines + infrequent categories. In the following example, the categories, `'dog', 'snake'` are considered infrequent:: @@ -638,6 +638,19 @@ feature name:: >>> enc.get_feature_names() array(['x0_cat', 'x0_rabbit', 'x0_infrequent'], dtype=object) +When this `'handle_unknown'` is set to 'auto' and an unknown category is +encountered in transform: + +1. If infrequent category support was not configured or there were no + infrequent category during training, the resulting one-hot encoded columns + for this feature will be all zeros. In the inverse transform, an unknown + category will be denoted as `None`. + +2. If there is an infrequent category during training, the unknown category + will be considered infrequent. In the inverse transform, 'infrequent' will + be used to represent the infrequent category. If `'infrequent'` is already a + category, `'infrequent_sklearn'` will be used instead. + Infrequent categories can be filtered out using `min_frequency` and `max_categories`. In the following example, we set `max_categories=2` to limit the number of features in the output. This will result in all but diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 2d1402486e1ce..695dade5ad059 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -259,24 +259,11 @@ class OneHotEncoder(_BaseEncoder): handle_unknown : {'error', 'ignore', 'auto'}, default='error' Whether to raise an error or ignore if an unknown categorical feature is present during transform (default is to raise). When this parameter - is set to 'ignore' and an unknown category is encountered during + is set to 'auto' and an unknown category is encountered during transform, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category - will be denoted as None. - - When this parameter is set to 'auto' and an unknown category is - encountered in transform: - - 1. If infrequent category support was not configured or there were - no infrequent category during training, the resulting one-hot - encoded columns for this feature will be all zeros. In the inverse - transform, an unknown category will be denoted as `None`. - - 2. If there is an infrequent category during training, the unknown - category will be considered infrequent. In the inverse transform, - 'infrequent' will be used to represent the infrequent category. If - 'infrequent' is already a category, 'infrequent_sklearn' will be - used instead. + will be denoted as None. Read more in the + :ref:`User Guide F` .. 
versionadded:: 0.24 `'auto'` was added to automatically handle unknown categories @@ -414,20 +401,6 @@ def _validate_keywords(self): "specified, as both would create categories that are all " "zero.") - # validates infrequent category features - if self.drop is not None and self._infrequent_enabled(): - raise ValueError("infrequent categories are not supported when " - "drop is specified") - - # TODO: Remove when handle_unknown='ignore' is deprecated - if self.handle_unknown == 'ignore': - warnings.warn("handle_unknown='ignore' is deprecated in favor " - "of 'auto' in version 0.24 and will be removed in " - "version 0.26", FutureWarning) - if self._infrequent_enabled(): - raise ValueError("infrequent categories are only supported " - "when handle_unknown is 'error' or 'auto'") - if self.max_categories is not None and self.max_categories <= 1: raise ValueError("max_categories must be greater than 1") @@ -442,6 +415,30 @@ def _validate_keywords(self): "1 or a float in (0.0, 1.0); got the " f"float {self.min_frequency}") + self._infrequent_enabled = ( + (self.max_categories is not None and self.max_categories > 1) + or + (isinstance(self.min_frequency, numbers.Integral) and + self.min_frequency > 1) + or + (isinstance(self.min_frequency, numbers.Real) and + self.min_frequency < 1.0) + ) + + # validates infrequent category features + if self.drop is not None and self._infrequent_enabled: + raise ValueError("infrequent categories are not supported when " + "drop is specified") + + # TODO: Remove when handle_unknown='ignore' is deprecated + if self.handle_unknown == 'ignore': + warnings.warn("handle_unknown='ignore' is deprecated in favor " + "of 'auto' in version 0.24 and will be removed in " + "version 0.26", FutureWarning) + if self._infrequent_enabled: + raise ValueError("infrequent categories are only supported " + "when handle_unknown is 'error' or 'auto'") + def _compute_drop_idx(self): if self.drop is None: return None @@ -488,18 +485,6 @@ def _compute_drop_idx(self): zip(self.drop, self.categories_)], dtype=np.object) - def _infrequent_enabled(self): - """Infrequent category is enabled.""" - if self.max_categories is not None and self.max_categories > 1: - return True - if (isinstance(self.min_frequency, numbers.Integral) - and self.min_frequency > 1): - return True - if (isinstance(self.min_frequency, numbers.Real) - and 0.0 < self.min_frequency < 1.0): - return True - return False - def _identify_infrequent(self, category_count, n_samples, col_idx): """Compute the infrequent indices @@ -555,7 +540,7 @@ def _fit_infrequent_category_mapping(self, fit_results): fit_results : dict return values from `super()._fit()` """ - if not self._infrequent_enabled(): + if not self._infrequent_enabled: return n_samples = fit_results["n_samples"] @@ -601,7 +586,7 @@ def _map_to_infrequent_categories(self, X_int, X_mask): X_int: ndarray of shape (n_samples, n_features) Integer encoded categories. 
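Roughly, the infrequent-category handling described in these methods works as in the following sketch; it is an illustrative reconstruction with hypothetical counts rather than the literal implementation, and the resulting indices match the `infrequent_indices_` values asserted in the tests later in this series::

    import numpy as np

    counts = np.array([5, 20, 10, 3])                # categories ['a', 'b', 'c', 'd']

    # min_frequency=6 marks 'a' and 'd' as infrequent
    infreq_idx = np.flatnonzero(counts < 6)          # array([0, 3])
    # with max_categories=3 the stable argsort would mark the two smallest instead
    np.argsort(counts, kind='mergesort')[:-3 + 1]    # array([3, 0])

    # frequent categories keep the low codes, all infrequent ones share the last one
    mapping = np.empty(4, dtype=int)
    mapping[infreq_idx] = 4 - infreq_idx.size            # 2
    frequent = np.setdiff1d(np.arange(4), infreq_idx)    # array([1, 2])
    mapping[frequent] = np.arange(frequent.size)
    mapping                                              # array([2, 0, 1, 2])

    # _map_to_infrequent_categories then remaps each encoded column in place
    X_col = np.array([0, 1, 3, 2])
    mapping[X_col]                                       # array([2, 0, 2, 1])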
""" - if not self._infrequent_enabled(): + if not self._infrequent_enabled: return n_features = X_int.shape[1] @@ -637,7 +622,7 @@ def _compute_transformed_categories(self, i): return np.delete(cats, self.drop_idx_[i]) # drop is None - if not self._infrequent_enabled(): + if not self._infrequent_enabled: return cats # infrequent is enabled @@ -673,7 +658,7 @@ def _get_n_transformed_features(self): # drop is None output = [len(cats) for cats in self.categories_] - if not self._infrequent_enabled(): + if not self._infrequent_enabled: return output # infrequent is enabled @@ -703,7 +688,7 @@ def fit(self, X, y=None): """ self._validate_keywords() fit_results = self._fit(X, handle_unknown=self.handle_unknown, - return_counts=self._infrequent_enabled()) + return_counts=self._infrequent_enabled) self._fit_infrequent_category_mapping(fit_results) self.drop_idx_ = self._compute_drop_idx() return self @@ -831,7 +816,7 @@ def inverse_transform(self, X): j = 0 found_unknown = {} - if self._infrequent_enabled(): + if self._infrequent_enabled: infrequent_indices = self.infrequent_indices_ else: infrequent_indices = [None] * n_features From 853f54da6cca2728f22ed34d797226c2033876f1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 17 Jun 2020 13:42:20 -0400 Subject: [PATCH 38/92] DOC Better wrapping --- sklearn/preprocessing/_encoders.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 695dade5ad059..8b4b9478085d9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -529,11 +529,11 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): def _fit_infrequent_category_mapping(self, fit_results): """Fit infrequent categories. - Defines the private attribute: `_default_to_infrequent_mappings`. - For feature `i`, `_default_to_infrequent_mappings[i]` defines the - mapping from the integer encoding from `super().transform()` into - infrequent categories. If `_default_to_infrequent_mappings[i]` is - None, there were no infrequent categories in the training set. + Defines the private attribute: `_default_to_infrequent_mappings`. For + feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping + from the integer encoding returned by `super().transform()` into + infrequent categories. If `_default_to_infrequent_mappings[i]` is None, + there were no infrequent categories in the training set. Parameters ---------- From 5ad59170530e731d1c9a4e050bc58ffb0f58dcc5 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 18 Jun 2020 17:34:12 -0400 Subject: [PATCH 39/92] TST Adds test to handle_unknown='error' --- sklearn/preprocessing/tests/test_encoders.py | 41 +++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 4bfc00683d318..35f27a9d8ae2d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -819,6 +819,33 @@ def test_ohe_infrequent_three_levels(kwargs): assert_array_equal(['x0_b', 'x0_c', 'x0_infrequent'], feature_names) +def test_ohe_infrequent_handle_unknown_error(): + # Test that different parameters for combine 'a', and 'd' into + # the infrequent category works as expected + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T + ohe = OneHotEncoder(handle_unknown='error', sparse=False, + max_categories=3).fit(X_train) + assert_array_equal(ohe.infrequent_indices_, [[0, 3]]) + + # all categories are known + X_test = [['b'], ['a'], ['c'], ['d']] + expected = np.array([ + [1, 0, 0], + [0, 0, 1], + [0, 1, 0], + [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'bad' is not know and will error + X_test = [['bad']] + msg = r"Found unknown categories \['bad'\] in column 0" + with pytest.raises(ValueError, match=msg): + ohe.transform(X_test) + + @pytest.mark.parametrize("kwargs", [{'max_categories': 3}, {'min_frequency': 4}]) def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): @@ -843,7 +870,7 @@ def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): def test_ohe_infrequent_two_levels_user_cats(): # Test that the order of the categories provided by a user is respected. - # Specifically, the infrequent_indicies_ correspond to the user provided + # Specifically, the infrequent_indices_ correspond to the user provided # categories. X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], dtype=object).T @@ -864,7 +891,7 @@ def test_ohe_infrequent_two_levels_user_cats(): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - # The most frequent infrquent category is used for the inverse transform + # The most frequent infrequent category is used for the inverse transform expected_inv = [[col] for col in ['b'] + ['infrequent'] * 4] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) @@ -894,7 +921,7 @@ def test_ohe_infrequent_three_levels_user_cats(): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - # The most frequent infrquent category is used for the inverse transform + # The most frequent infrequent category is used for the inverse transform expected_inv = [['b'], ['infrequent'], ['c'], ['infrequent'], ['infrequent']] X_inv = ohe.inverse_transform(X_trans) @@ -921,7 +948,7 @@ def test_ohe_infrequent_multiple_categories(): # The most frequent infrequent category becomes the feature name # For the first column, 1 and 2 have the same frequency. 
In this case, - # 1 will be choosen to be the feature name because is smaller lexiconically + # 1 will be chosen to be the feature name because is smaller lexiconically feature_names = ohe.get_feature_names() assert_array_equal(['x0_0', 'x0_3', 'x0_infrequent', 'x1_0', 'x1_5', 'x1_infrequent', @@ -1025,12 +1052,6 @@ def test_ohe_infrequent_multiple_categories_dtypes(): ['f', 'infrequent']], dtype=object) assert_array_equal(expected_inv, X_inv) - # error for unknown categories - ohe = OneHotEncoder(categories='auto', max_categories=3, - handle_unknown='error').fit(X) - with pytest.raises(ValueError, match="Found unknown categories"): - ohe.transform(X_test) - # only infrequent or known categories X_test = pd.DataFrame( {'str': ['c', 'b'], From 7414e26fbb2a075f3aff848234290859eacd4fbc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 18 Jun 2020 17:36:38 -0400 Subject: [PATCH 40/92] ENH Spelling error in docstring --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8b4b9478085d9..9a3aa78f98c48 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -263,7 +263,7 @@ class OneHotEncoder(_BaseEncoder): transform, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category will be denoted as None. Read more in the - :ref:`User Guide F` + :ref:`User Guide ` .. versionadded:: 0.24 `'auto'` was added to automatically handle unknown categories From 265d85e9784c0688ae0497affe96253e5b73068c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 31 Oct 2020 20:30:48 -0400 Subject: [PATCH 41/92] BUG Fixes counter with nan values --- sklearn/utils/_encode.py | 48 ++++++++++++++++++++++++++---- sklearn/utils/tests/test_encode.py | 6 ++++ 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 072300bdb2462..37d31069c525d 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -1,4 +1,3 @@ -from collections import Counter from typing import NamedTuple import numpy as np @@ -40,9 +39,15 @@ def _unique(values, *, return_inverse=False, return_counts=False): return_counts=return_counts) if return_inverse: - uniques, inverse = out + if return_counts: + uniques, inverse, counts = out + else: + uniques, inverse = out else: - uniques = out + if return_counts: + uniques, counts = out + else: + uniques = out # np.unique will have duplicate missing values at the end of `uniques` # here we clip the nans and remove it from uniques @@ -52,9 +57,19 @@ def _unique(values, *, return_inverse=False, return_counts=False): if return_inverse: inverse[inverse > nan_idx] = nan_idx + if return_counts: + counts[nan_idx] = np.sum(counts[nan_idx:]) + counts = counts[:nan_idx+1] + + ret = (uniques, ) + if return_inverse: - return uniques, inverse - return uniques + ret += (inverse, ) + + if return_counts: + ret += (counts, ) + + return ret[0] if len(ret) == 1 else ret class MissingValues(NamedTuple): @@ -124,6 +139,27 @@ def __missing__(self, key): raise KeyError(key) +class _NaNCounter(dict): + """Counter that supports nans.""" + def __init__(self, iterable): + for item in iterable: + if is_scalar_nan(item): + if not hasattr(self, 'nan_cnt'): + self.nan_cnt = 0 + self.nan_cnt += 1 + continue + + try: + self[item] += 1 + except KeyError: + self[item] = 1 + + def __missing__(self, key): + if hasattr(self, 'nan_cnt') and 
is_scalar_nan(key): + return self.nan_cnt + raise KeyError(key) + + def _map_to_integer(values, uniques): """Map values based on its position in uniques.""" table = _nandict({val: i for i, val in enumerate(uniques)}) @@ -150,7 +186,7 @@ def _unique_python(values, *, return_inverse, return_counts): ret += (_map_to_integer(values, uniques), ) if return_counts: - uniques_dict = Counter(values) + uniques_dict = _NaNCounter(values) counts = np.array([uniques_dict[item] for item in uniques], dtype=int) ret += (counts, ) diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index 282be9048ba72..2de562c68902d 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -184,6 +184,9 @@ def test_unique_util_missing_values_numeric(): assert_array_equal(uniques, expected_uniques) assert_array_equal(inverse, expected_inverse) + _, counts = _unique(values, return_counts=True) + assert_array_equal(counts, [1, 2, 1, 2]) + encoded = _encode(values, uniques=uniques) assert_array_equal(encoded, expected_inverse) @@ -202,6 +205,9 @@ def test_unique_util_with_all_missing_values(): _, inverse = _unique(values, return_inverse=True) assert_array_equal(inverse, expected_inverse) + _, counts = _unique(values, return_counts=True) + assert_array_equal(counts, [1, 2, 2, 2]) + def test_check_unknown_with_both_missing_values(): # test for both types of missing values for object dtype From 090c5940eb9b6abd2ad12d6aa13b3405f2afc263 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 31 Oct 2020 20:32:12 -0400 Subject: [PATCH 42/92] BUG Removes unneeded test --- sklearn/preprocessing/tests/test_encoders.py | 40 -------------------- 1 file changed, 40 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 78762dba8153e..5e31203cbc727 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -531,38 +531,6 @@ def test_one_hot_encoder_drop_equals_if_binary(): assert_allclose(result, expected) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, - np.array([['a', np.nan]], dtype=object).T], - ids=['numeric', 'object']) -@pytest.mark.parametrize("as_data_frame", [False, True], - ids=['array', 'dataframe']) -@pytest.mark.parametrize("handle_unknown", ['error', 'auto', 'ignore']) -def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown): - if as_data_frame: - pd = pytest.importorskip('pandas') - X = pd.DataFrame(X) - - ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit(X) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit_transform(X) - - if as_data_frame: - X_partial = X.iloc[:1, :] - else: - X_partial = X[:1, :] - - ohe.fit(X_partial) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.transform(X) - - @pytest.mark.parametrize("X", [ [['abc', 2, 55], ['def', 1, 55]], np.array([[10, 2, 55], [20, 1, 55]]), @@ -875,14 +843,6 @@ def test_encoders_has_categorical_tags(Encoder): assert 'categorical' in Encoder()._get_tags()['X_types'] -@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) -def test_encoders_does_not_support_none_values(Encoder): - values = [["a"], [None]] - with pytest.raises(TypeError, match="Encoders require their input to be " - 
"uniformly strings or numbers."): - Encoder().fit(values) - - def test_ohe_infrequent_infrequent_is_a_cat(): # category with 'infrequent' is a frequent category, ohe will name mangle # this into 'infrequent_sklearn' From 8411e3d2ea952afcaadffb59a99e2b3d729f3490 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 3 Nov 2020 19:45:07 -0500 Subject: [PATCH 43/92] BUG Fixes issue --- sklearn/preprocessing/tests/test_encoders.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 5e31203cbc727..3244500c14f63 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1302,12 +1302,15 @@ def test_encoders_unicode_categories(input_dtype, category_dtype, array_type): assert_array_equal(X_trans, expected) +# TODO: Remove when 'ignore' is deprecated in 0.26 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") +@pytest.mark.parametrize("handle_unknown", ['auto', 'ignore']) @pytest.mark.parametrize("missing_value", [np.nan, None]) -def test_ohe_missing_values_get_feature_names(missing_value): +def test_ohe_missing_values_get_feature_names(missing_value, handle_unknown): # encoder with missing values with object dtypes X = np.array([['a', 'b', missing_value, 'a', missing_value]], dtype=object).T - ohe = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X) + ohe = OneHotEncoder(sparse=False, handle_unknown=handle_unknown).fit(X) names = ohe.get_feature_names() assert_array_equal(names, ['x0_a', 'x0_b', f'x0_{missing_value}']) @@ -1330,8 +1333,12 @@ def test_ohe_missing_value_support_pandas(): assert_allclose(Xtr, expected_df_trans) +# TODO: Remove when 'ignore' is deprecated in 0.26 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") +@pytest.mark.parametrize("handle_unknown", ['auto', 'ignore']) @pytest.mark.parametrize('pd_nan_type', ['pd.NA', 'np.nan']) -def test_ohe_missing_value_support_pandas_categorical(pd_nan_type): +def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, + handle_unknown): # checks pandas dataframe with categorical features if pd_nan_type == 'pd.NA': # pd.NA is in pandas 1.0 @@ -1353,7 +1360,7 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type): [1, 0, 0, 0], ]) - ohe = OneHotEncoder(sparse=False, handle_unknown='ignore') + ohe = OneHotEncoder(sparse=False, handle_unknown=handle_unknown) df_trans = ohe.fit_transform(df) assert_allclose(expected_df_trans, df_trans) From ec6e23f4da3edc25ddeaba704de168679d6d77e0 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 29 Dec 2020 14:47:36 -0500 Subject: [PATCH 44/92] ENH Sync with main --- sklearn/preprocessing/_encoders.py | 26 +------ sklearn/utils/_encode.py | 114 ++++++++++++++++++----------- sklearn/utils/tests/test_encode.py | 35 +++++++-- 3 files changed, 102 insertions(+), 73 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0719ba2a20d3a..7c421fda0bac6 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -4,18 +4,16 @@ import numbers import warnings -from collections import Counter import numpy as np from scipy import sparse -import numbers from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args -from ..utils._encode import _encode, _check_unknown, _unique +from ..utils._encode import _encode, _check_unknown, _unique, _get_counts __all__ = [ @@ -24,28 +22,6 @@ ] -def _get_counts(values, uniques): - """Get the count of each of the `uniques` in `values`. The counts will use - the order passed in by `uniques`. - - For non-object dtypes, `uniques` is assumed to be sorted. - """ - if values.dtype.kind in 'UO': - counter = Counter(values) - counts = np.array([counter[item] for item in uniques], - dtype=int) - return counts - - unique_values, counts = np.unique(values, return_counts=True) - uniques_in_values = np.isin(uniques, unique_values, assume_unique=True) - unique_valid_indices = np.searchsorted(unique_values, - uniques[uniques_in_values]) - - output = np.zeros_like(uniques) - output[uniques_in_values] = counts[unique_valid_indices] - return output - - class _BaseEncoder(TransformerMixin, BaseEstimator): """ Base class for encoders that includes the code to categorize and diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 37d31069c525d..13b116f8130fb 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -1,3 +1,5 @@ +from contextlib import suppress +from collections import Counter from typing import NamedTuple import numpy as np @@ -35,19 +37,26 @@ def _unique(values, *, return_inverse=False, return_counts=False): return _unique_python(values, return_inverse=return_inverse, return_counts=return_counts) # numerical - out = np.unique(values, return_inverse=return_inverse, - return_counts=return_counts) + return _unique_np(values, return_inverse=return_inverse, + return_counts=return_counts) + + +def _unique_np(values, return_inverse=False, return_counts=False): + """Helper function to find unique values for numpy arrays that correctly + accounts for nans. 
See `_unique` documentation for details.""" + uniques = np.unique(values, return_inverse=return_inverse, + return_counts=return_counts) + + inverse, counts = None, None + + if return_counts: + *uniques, counts = uniques if return_inverse: - if return_counts: - uniques, inverse, counts = out - else: - uniques, inverse = out - else: - if return_counts: - uniques, counts = out - else: - uniques = out + *uniques, inverse = uniques + + if return_counts or return_inverse: + uniques = uniques[0] # np.unique will have duplicate missing values at the end of `uniques` # here we clip the nans and remove it from uniques @@ -59,7 +68,7 @@ def _unique(values, *, return_inverse=False, return_counts=False): if return_counts: counts[nan_idx] = np.sum(counts[nan_idx:]) - counts = counts[:nan_idx+1] + counts = counts[:nan_idx + 1] ret = (uniques, ) @@ -139,27 +148,6 @@ def __missing__(self, key): raise KeyError(key) -class _NaNCounter(dict): - """Counter that supports nans.""" - def __init__(self, iterable): - for item in iterable: - if is_scalar_nan(item): - if not hasattr(self, 'nan_cnt'): - self.nan_cnt = 0 - self.nan_cnt += 1 - continue - - try: - self[item] += 1 - except KeyError: - self[item] = 1 - - def __missing__(self, key): - if hasattr(self, 'nan_cnt') and is_scalar_nan(key): - return self.nan_cnt - raise KeyError(key) - - def _map_to_integer(values, uniques): """Map values based on its position in uniques.""" table = _nandict({val: i for i, val in enumerate(uniques)}) @@ -186,15 +174,9 @@ def _unique_python(values, *, return_inverse, return_counts): ret += (_map_to_integer(values, uniques), ) if return_counts: - uniques_dict = _NaNCounter(values) - counts = np.array([uniques_dict[item] for item in uniques], - dtype=int) - ret += (counts, ) + ret += (_get_counts(values, uniques), ) - if len(ret) == 1: - ret = ret[0] - - return ret + return ret[0] if len(ret) == 1 else ret def _encode(values, *, uniques, check_unknown=True): @@ -320,3 +302,53 @@ def is_valid(value): if return_mask: return diff, valid_mask return diff + + +class _NaNCounter(Counter): + """Counter with support for nan values.""" + def __init__(self, items): + super().__init__(self._generate_items(items)) + + def _generate_items(self, items): + """Generate items without nans. Stores the nan counts seperately.""" + for item in items: + if not is_scalar_nan(item): + yield item + continue + if not hasattr(self, 'nan_count'): + self.nan_count = 0 + self.nan_count += 1 + + def __missing__(self, key): + if hasattr(self, 'nan_count') and is_scalar_nan(key): + return self.nan_count + raise KeyError(key) + + +def _get_counts(values, uniques): + """Get the count of each of the `uniques` in `values`. The counts will use + the order passed in by `uniques`. + + For non-object dtypes, `uniques` is assumed to be sorted. + """ + if values.dtype.kind in 'OU': + counter = _NaNCounter(values) + output = np.zeros(len(uniques), dtype=np.int64) + for i, item in enumerate(uniques): + with suppress(KeyError): + output[i] = counter[item] + return output + + unique_values, counts = _unique_np(values, return_counts=True) + uniques_in_values = np.isin(uniques, unique_values, assume_unique=True) + + # If there are nans, they will be mapped to the end. 
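    # illustrative note (not from the original patch): the fix-up on the next
    # lines is needed because `np.isin` compares elements with `==`, and
    # `nan == nan` is False; for example, np.isin(np.array([np.nan]),
    # np.array([np.nan])) evaluates to array([False]). Without the fix-up a
    # trailing NaN in `uniques` would be treated as absent and get a count of 0.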
+ if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]): + uniques_in_values[-1] = True + + unique_valid_indices = np.searchsorted(unique_values, + uniques[uniques_in_values]) + + output = np.zeros_like(uniques, dtype=np.int64) + output[uniques_in_values] = counts[unique_valid_indices] + return output diff --git a/sklearn/utils/tests/test_encode.py b/sklearn/utils/tests/test_encode.py index 2de562c68902d..5670449cd3d2d 100644 --- a/sklearn/utils/tests/test_encode.py +++ b/sklearn/utils/tests/test_encode.py @@ -7,17 +7,23 @@ from sklearn.utils._encode import _unique from sklearn.utils._encode import _encode from sklearn.utils._encode import _check_unknown +from sklearn.utils._encode import _get_counts @pytest.mark.parametrize( "values, expected", [(np.array([2, 1, 3, 1, 3], dtype='int64'), np.array([1, 2, 3], dtype='int64')), + (np.array([2, 1, np.nan, 1, np.nan], dtype='float32'), + np.array([1, 2, np.nan], dtype='float32')), (np.array(['b', 'a', 'c', 'a', 'c'], dtype=object), np.array(['a', 'b', 'c'], dtype=object)), + (np.array(['b', 'a', None, 'a', None], dtype=object), + np.array(['a', 'b', None], dtype=object)), (np.array(['b', 'a', 'c', 'a', 'c']), np.array(['a', 'b', 'c']))], - ids=['int64', 'object', 'str']) + ids=['int64', 'float32-nan', 'object', + 'object-None', 'str']) def test_encode_util(values, expected): uniques = _unique(values) assert_array_equal(uniques, expected) @@ -184,9 +190,6 @@ def test_unique_util_missing_values_numeric(): assert_array_equal(uniques, expected_uniques) assert_array_equal(inverse, expected_inverse) - _, counts = _unique(values, return_counts=True) - assert_array_equal(counts, [1, 2, 1, 2]) - encoded = _encode(values, uniques=uniques) assert_array_equal(encoded, expected_inverse) @@ -205,9 +208,6 @@ def test_unique_util_with_all_missing_values(): _, inverse = _unique(values, return_inverse=True) assert_array_equal(inverse, expected_inverse) - _, counts = _unique(values, return_counts=True) - assert_array_equal(counts, [1, 2, 2, 2]) - def test_check_unknown_with_both_missing_values(): # test for both types of missing values for object dtype @@ -227,3 +227,24 @@ def test_check_unknown_with_both_missing_values(): assert np.isnan(diff[1]) assert_array_equal(valid_mask, [False, True, True, True, False, False, False]) + + +@pytest.mark.parametrize("values, uniques, expected_counts", [ + (np.array([1] * 10 + [2] * 4 + [3] * 15), + np.array([1, 2, 3]), [10, 4, 15]), + (np.array([1] * 10 + [2] * 4 + [3] * 15), + np.array([1, 2, 3, 5]), [10, 4, 15, 0]), + (np.array([np.nan] * 10 + [2] * 4 + [3] * 15), + np.array([2, 3, np.nan]), [4, 15, 10]), + (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object), + ['a', 'b', 'c'], [16, 4, 20]), + (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object), + ['c', 'b', 'a'], [20, 4, 16]), + (np.array([np.nan] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object), + ['c', np.nan, 'a'], [20, 4, 16]), + (np.array(['b'] * 4 + ['a'] * 16 + ['c'] * 20, dtype=object), + ['a', 'b', 'c', 'e'], [16, 4, 20, 0]), +]) +def test_get_counts(values, uniques, expected_counts): + counts = _get_counts(values, uniques) + assert_array_equal(counts, expected_counts) From a730bce305f345ccbee4faafb8cddaa5b9777372 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 29 Dec 2020 15:13:35 -0500 Subject: [PATCH 45/92] DOC Correct settings --- sklearn/utils/_encode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 13b116f8130fb..774c40d24f668 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -30,7 +30,7 @@ def _unique(values, *, return_inverse=False, return_counts=False): Only provided if `return_inverse` is True. unique_counts : ndarray - The number of times each of the unique values comes up in the originial + The number of times each of the unique values comes up in the original array. Only provided if `return_counts` is True. """ if values.dtype == object: From 97e9f7a62c50d051d0f55960bea1deb8d955f5ad Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 29 Dec 2020 15:18:42 -0500 Subject: [PATCH 46/92] DOC Adds docstring --- sklearn/utils/_encode.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 774c40d24f668..6e998320e75a8 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -20,6 +20,10 @@ def _unique(values, *, return_inverse=False, return_counts=False): return_inverse : bool, default=False If True, also return the indices of the unique values. + return_count : bool, default=False + If True, also return the number of times each unique item appears in + values. + Returns ------- unique : ndarray From 433ccd71d72b408479251e949ab501c9472323fe Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 30 Jan 2021 15:06:23 -0500 Subject: [PATCH 47/92] DOC Immprove user guide --- doc/modules/preprocessing.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 7d27d40d71e9b..57e686d2dcccb 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -617,8 +617,8 @@ Infrequent categories --------------------- :class:`OneHotEncoder` supports aggregating infrequent categories into a single -output. The parameters to enable the gathering of infrequent categories are -`min_frequency` and `max_categories`. +output for each feature. The parameters to enable the gathering of infrequent +categories are `min_frequency` and `max_categories`. 1. `min_frequency` is either an integer greater or equal to 1, or a float in the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with @@ -636,8 +636,7 @@ infrequent:: >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + ... ['snake'] * 3]).T - >>> enc = preprocessing.OneHotEncoder(min_frequency=6, - ... handle_unknown='auto').fit(X) + >>> enc = preprocessing.OneHotEncoder(min_frequency=6).fit(X) >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]).toarray() array([[0., 0., 1.], [1., 0., 0.], @@ -647,6 +646,8 @@ infrequent:: By setting handle_unknown to `'auto'`, unknown categories will be considered infrequent:: + >>> enc = preprocessing.OneHotEncoder(handle_unknown='auto', + ... min_frequency=6).fit(X) >>> enc.transform([['dragon']]).toarray() array([[0., 0., 1.]]) @@ -659,7 +660,7 @@ feature name:: When this `'handle_unknown'` is set to 'auto' and an unknown category is encountered in transform: -1. If infrequent category support was not configured or there were no +1. If infrequent category support was not configured or there was no infrequent category during training, the resulting one-hot encoded columns for this feature will be all zeros. 
In the inverse transform, an unknown category will be denoted as `None`. @@ -669,14 +670,13 @@ encountered in transform: be used to represent the infrequent category. If `'infrequent'` is already a category, `'infrequent_sklearn'` will be used instead. -Infrequent categories can be filtered out using `min_frequency` and -`max_categories`. In the following example, we set `max_categories=2` to -limit the number of features in the output. This will result in all but -the `'cat'` category to be considered infrequent, leading to two features, -one for `'cat'` and one for infrequent categories - which are all the others:: +Infrequent categories can also be configured using `max_categories`. In the +following example, we set `max_categories=2` to limit the number of features in +the output. This will result in all but the `'cat'` category to be considered +infrequent, leading to two features, one for `'cat'` and one for infrequent +categories - which are all the others:: - >>> enc = preprocessing.OneHotEncoder(min_frequency=6, max_categories=2, - ... handle_unknown='auto').fit(X) + >>> enc = preprocessing.OneHotEncoder(max_categories=2).fit(X) >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]).toarray() array([[0., 1.], [1., 0.], From ecb82dffcb7a334080e7577a867e37ec5799e576 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 30 Jan 2021 15:07:29 -0500 Subject: [PATCH 48/92] DOC Move to 1.0 --- doc/whats_new/v0.24.rst | 6 ------ doc/whats_new/v1.0.rst | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 68133ea23fa0b..f549b31f51aa7 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -777,12 +777,6 @@ Changelog :mod:`sklearn.preprocessing` ............................ -- |MajorFeature| :class:`preprocessing.OneHotEncoder` now supports grouping - infrequent categories ito a single feature. Infrequent categories is - enabled by setting `handle_unknown` to `'auto'` or `'error'` and specifying - how to select infrequent categories with `min_frequency` or `max_categories`. - :pr:`16018` by `Thomas Fan`_. - - |Feature| :class:`preprocessing.OneHotEncoder` now supports missing values by treating them as a category. :pr:`17317` by `Thomas Fan`_. diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 382ff363e0db7..ee7814bf0f9e9 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -133,6 +133,12 @@ Changelog :mod:`sklearn.preprocessing` ............................ +- |Feature| :class:`preprocessing.OneHotEncoder` now supports grouping + infrequent categories ito a single feature. Infrequent categories is + enabled by setting `handle_unknown` to `'auto'` or `'error'` and specifying + how to select infrequent categories with `min_frequency` or `max_categories`. + :pr:`16018` by `Thomas Fan`_. + - |Feature| The new :class:`preprocessing.SplineTransformer` is a feature preprocessing tool for the generation of B-splines, parametrized by the polynomial ``degree`` of the splines, number of knots ``n_knots`` and knot From 35d0544b588a8bb782c59c26a9caa556bd73f81f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sat, 30 Jan 2021 15:10:52 -0500 Subject: [PATCH 49/92] DOC Update docs --- sklearn/preprocessing/tests/test_encoders.py | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 8c7f63b349321..707f1d692e46d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -859,8 +859,8 @@ def test_encoders_has_categorical_tags(Encoder): def test_ohe_infrequent_infrequent_is_a_cat(): - # category with 'infrequent' is a frequent category, ohe will name mangle - # this into 'infrequent_sklearn' + """Test category with 'infrequent' is a frequent category, ohe will name + mangle this into 'infrequent_sklearn'.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['infrequent'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(handle_unknown='auto', sparse=False, @@ -896,8 +896,8 @@ def test_ohe_infrequent_infrequent_is_a_cat(): ]) @pytest.mark.parametrize("categories", ["auto", [['a', 'b', 'c', 'd']]]) def test_ohe_infrequent_two_levels(kwargs, categories): - # Test that different parameters for combine 'a', 'c', and 'd' into - # the infrequent category works as expected + """Test that different parameters for combine 'a', 'c', and 'd' into + the infrequent category works as expected.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(categories=categories, @@ -920,7 +920,6 @@ def test_ohe_infrequent_two_levels(kwargs, categories): X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) - # The most frequent infrequent category becomes the feature name feature_names = ohe.get_feature_names() assert_array_equal(['x0_b', 'x0_infrequent'], feature_names) @@ -935,8 +934,8 @@ def test_ohe_infrequent_two_levels(kwargs, categories): {'max_categories': 4, 'min_frequency': 6}, ]) def test_ohe_infrequent_three_levels(kwargs): - # Test that different parameters for combine 'a', and 'd' into - # the infrequent category works as expected + """Test that different parameters for combing 'a', and 'd' into + the infrequent category works as expected.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(handle_unknown='auto', sparse=False, @@ -959,14 +958,13 @@ def test_ohe_infrequent_three_levels(kwargs): X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) - # The most frequent infrequent category becomes the feature name feature_names = ohe.get_feature_names() assert_array_equal(['x0_b', 'x0_c', 'x0_infrequent'], feature_names) def test_ohe_infrequent_handle_unknown_error(): - # Test that different parameters for combine 'a', and 'd' into - # the infrequent category works as expected + """Test that different parameters for combing 'a', and 'd' into + the infrequent category works as expected.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(handle_unknown='error', sparse=False, @@ -984,7 +982,7 @@ def test_ohe_infrequent_handle_unknown_error(): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - # 'bad' is not know and will error + # 'bad' is not known and will error X_test = [['bad']] msg = r"Found unknown categories \['bad'\] in column 0" with pytest.raises(ValueError, match=msg): @@ -994,7 +992,7 @@ def test_ohe_infrequent_handle_unknown_error(): @pytest.mark.parametrize("kwargs", [{'max_categories': 3}, {'min_frequency': 4}]) def 
test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): - # 'a' is the only frequent category, all other categories are infrequent + #'a' is the only frequent category, all other categories are infrequent X_train = np.array([['a'] * 5 + ['e'] * 30], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], From 274c0908a4c35e17f269b857e0b392612a2a69e5 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 30 Jan 2021 15:11:45 -0500 Subject: [PATCH 50/92] TST Remove test --- sklearn/preprocessing/tests/test_encoders.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 707f1d692e46d..a74dc4aaa903d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1211,24 +1211,6 @@ def test_ohe_infrequent_multiple_categories_dtypes(): assert_array_equal(expected_inv, X_inv) -def test_ohe_infrequent_user_cats_with_many_zero_counts(): - # Only category 'd' is a frequent category. This should result in - # two columns. - - X_train = np.array([['e'] * 3 + ['d']], dtype=object).T - ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b', 'f', 'g']], - max_categories=3, sparse=False, - handle_unknown='auto').fit(X_train) - - X_trans = ohe.transform([['c'], ['d'], ['a'], ['b'], ['e']]) - expected = [[0, 1], - [1, 0], - [0, 1], - [0, 1], - [0, 1]] - assert_array_equal(expected, X_trans) - - @pytest.mark.parametrize("min_frequency", [21]) def test_ohe_infrequent_one_level_errors(min_frequency): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T From abc504eccd60c1a99adf7125f9c82fe72a70dbbd Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 30 Jan 2021 15:14:25 -0500 Subject: [PATCH 51/92] DOC Update docstring --- sklearn/preprocessing/_encoders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b2e8813456202..0fd93f5cd36bd 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -306,12 +306,12 @@ class OneHotEncoder(_BaseEncoder): .. versionchanged:: 0.23 Added the possibility to contain `None` values. - infrequent_indices_ : list of shape (n_features,) + infrequent_indices_ : list of arrays Defined only when `min_frequency` or `max_categories` is set to a non-default value. `infrequent_indices_[i]` is an array of indices - mapping from `categories_[i]` to the infrequent categories. - `infrequent_indices_[i]` is None if the ith input feature has no - infrequent categories. + such that `categories_[i][infrequent_indices_[i]]` are all the + infrequent category labels. If the ith feature has no infrequent + categories `infrequent_indices_[i]` is None. See Also -------- From 484070a945c9fe9c0a24747d6afb8419eebf2535 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 22 Feb 2021 15:43:18 -0500 Subject: [PATCH 52/92] STY Linting --- sklearn/preprocessing/tests/test_encoders.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index a74dc4aaa903d..8dfc7b216f184 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -992,7 +992,8 @@ def test_ohe_infrequent_handle_unknown_error(): @pytest.mark.parametrize("kwargs", [{'max_categories': 3}, {'min_frequency': 4}]) def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): - #'a' is the only frequent category, all other categories are infrequent + """'a' is the only frequent category, all other categories are infrequent. + """ X_train = np.array([['a'] * 5 + ['e'] * 30], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], From c48ada2f7104927c62d3ded76de13f9c9301a6d1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 4 Mar 2021 13:43:29 -0500 Subject: [PATCH 53/92] DOC Address comments --- sklearn/preprocessing/_encoders.py | 106 ++++++++++--------- sklearn/preprocessing/tests/test_encoders.py | 4 +- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d852815ce7814..87c403984aa8b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -262,21 +262,21 @@ class OneHotEncoder(_BaseEncoder): `'auto'` was added to automatically handle unknown categories and infrequent categories. - .. deprecated:: 0.24 + .. deprecated:: 1.0 `'ignore'` is deprecated in favor of `'auto'`. This option will be - removed in 0.26. + removed in 1.2. min_frequency : int or float, default=1 - Specifies the minimum frequency for a category not to be considered - infrequent. + Specifies the minimum frequency below which a category will be + considered infrequent. - 1. If int, categories with a smaller cardinality will be considered - infrequent. + - If `int`, categories with a smaller cardinality will be considered + infrequent. - 2. If float, categories with a smaller cardinality than - `min_frequency * n_samples` will be considered infrequent. + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. - .. versionadded:: 0.24 + .. versionadded:: 1.0 max_categories : int, default=None Specifies an upper limit to the number of output features for each @@ -285,7 +285,7 @@ class OneHotEncoder(_BaseEncoder): categories along with the frequent categories. If `None`, there is no limit to the number of output features. - .. versionadded:: 0.24 + .. 
versionadded:: 1.0 Attributes ---------- @@ -385,7 +385,7 @@ def __init__(self, *, categories='auto', drop=None, sparse=True, def _validate_keywords(self): if self.handle_unknown not in ('error', 'ignore', 'auto'): - msg = (f"handle_unknown should be either 'error', 'ignore', 'auto'" + msg = (f"handle_unknown should be one of 'error', 'ignore', 'auto'" f"got {self.handle_unknown}.") raise ValueError(msg) # If we have both dropped columns and ignored unknown @@ -429,8 +429,8 @@ def _validate_keywords(self): # TODO: Remove when handle_unknown='ignore' is deprecated if self.handle_unknown == 'ignore': warnings.warn("handle_unknown='ignore' is deprecated in favor " - "of 'auto' in version 0.24 and will be removed in " - "version 0.26", FutureWarning) + "of 'auto' in version 1.0 and will be removed in " + "version 1.2", FutureWarning) if self._infrequent_enabled: raise ValueError("infrequent categories are only supported " "when handle_unknown is 'error' or 'auto'") @@ -537,7 +537,7 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): "infrequent, try decreasing min_frequency") return output if output.size > 0 else None - def _fit_infrequent_category_mapping(self, fit_results): + def _fit_infrequent_category_mapping(self, n_samples, category_counts): """Fit infrequent categories. Defines the private attribute: `_default_to_infrequent_mappings`. For @@ -546,35 +546,37 @@ def _fit_infrequent_category_mapping(self, fit_results): infrequent categories. If `_default_to_infrequent_mappings[i]` is None, there were no infrequent categories in the training set. + For example if categories 0, 2 and 4 were frequent, while categories + 1, 3, 5 were infrequent for feature 7, then these categories are mapped + to a single output: + `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])` + Parameters ---------- - fit_results : dict - return values from `super()._fit()` + n_samples : int + Number of samples in training set. + category_counts: list of ndarray + List of counts corresponding where `category_counts[i]` are the + counts for each category in `self.categories_[i]`. """ - if not self._infrequent_enabled: - return - - n_samples = fit_results["n_samples"] - category_counts = fit_results["category_counts"] - self.infrequent_indices_ = [ self._identify_infrequent(category_count, n_samples, col_idx) for col_idx, category_count in enumerate(category_counts) ] # compute mapping from default mapping to infrequent mapping - default_to_infrequent_mappings = [] + self._default_to_infrequent_mappings = [] for cats, infreq_idx in zip(self.categories_, self.infrequent_indices_): # no infrequent categories if infreq_idx is None: - default_to_infrequent_mappings.append(None) + self._default_to_infrequent_mappings.append(None) continue n_cats = len(cats) # infrequent indices exist - mapping = np.empty(n_cats, dtype=int) + mapping = np.empty(n_cats, dtype=np.int64) n_infrequent_cats = infreq_idx.size # infrequent categories are mapped to the last element. @@ -584,34 +586,40 @@ def _fit_infrequent_category_mapping(self, fit_results): frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx) mapping[frequent_indices] = np.arange(n_frequent_cats) - default_to_infrequent_mappings.append(mapping) - - self._default_to_infrequent_mappings = default_to_infrequent_mappings + self._default_to_infrequent_mappings.append(mapping) def _map_to_infrequent_categories(self, X_int, X_mask): """Map categories to infrequent categories. This modifies X_int - in-place. + in-place. 
Values that were invalid based on `X_mask` are mapped to + the infrequent category if there was an infrequent category for that + feature. Parameters ---------- X_int: ndarray of shape (n_samples, n_features) Integer encoded categories. + + X_mask: ndarray of shape (n_samples, n_features) + Bool mask for valid values in `X_int`. """ if not self._infrequent_enabled: return - n_features = X_int.shape[1] - for col_idx in range(n_features): + for col_idx in range(X_int.shape[1]): infrequent_idx = self.infrequent_indices_[col_idx] if infrequent_idx is None: continue X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0] if self.handle_unknown == 'auto': - # unknown values will be mapped to infrequent in the next for - # loop + # All the unknown values are now mapped to the + # infrequent_idx[0], which makes the unknown values valid + # This is needed in `transform` when the encoding is formed + # using `X_mask`. X_mask[:, col_idx] = True + # Remaps encoding in `X_int` where the infrequent categories are + # grouped together. for i, mapping in enumerate(self._default_to_infrequent_mappings): if mapping is None: continue @@ -647,16 +655,11 @@ def _compute_transformed_categories(self, i): infrequent_cat = 'infrequent_sklearn' else: infrequent_cat = 'infrequent' - return np.r_[cats[frequent_indices], - np.array([infrequent_cat], dtype=object)] + return np.concatenate((cats[frequent_indices], + np.array([infrequent_cat], dtype=object))) - def _get_transformed_categories(self): - """Transformed categories.""" - return [self._compute_transformed_categories(i) - for i in range(len(self.categories_))] - - def _get_n_transformed_features(self): - """Number of transformed features.""" + def _compute_n_features_outs(self): + """Compute the n_features_out for each input feature.""" if self.drop_idx_ is not None: output = [] for drop_idx, cats in zip(self.drop_idx_, self.categories_): @@ -672,7 +675,8 @@ def _get_n_transformed_features(self): if not self._infrequent_enabled: return output - # infrequent is enabled + # infrequent is enabled, the number of features out are reduced + # because the infrequent categories are grouped together for i, infreq_idx in enumerate(self.infrequent_indices_): if infreq_idx is None: continue @@ -701,7 +705,9 @@ def fit(self, X, y=None): fit_results = self._fit(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan', return_counts=self._infrequent_enabled) - self._fit_infrequent_category_mapping(fit_results) + if self._infrequent_enabled: + self._fit_infrequent_category_mapping( + fit_results["n_samples"], fit_results["category_counts"]) self.drop_idx_ = self._compute_drop_idx() return self @@ -730,7 +736,9 @@ def fit_transform(self, X, y=None): def transform(self, X): """ - Transform X using one-hot encoding. + Transform X using one-hot encoding. If there are infrequent categories + for a feature, the infrequent categories will be grouped into a single + category. 
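        Illustrative sketch (not part of this patch) of the grouping described
        above, assuming the `min_frequency` parameter added in this series::

            >>> import numpy as np
            >>> from sklearn.preprocessing import OneHotEncoder
            >>> X = np.array([['cat'] * 20 + ['dog'] * 5 + ['snake'] * 3]).T
            >>> enc = OneHotEncoder(min_frequency=6, sparse=False).fit(X)
            >>> # 'dog' and 'snake' fall below min_frequency and share one column
            >>> enc.transform(np.array([['cat'], ['dog'], ['snake']]))
            array([[1., 0.],
                   [0., 1.],
                   [0., 1.]])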
Parameters ---------- @@ -765,7 +773,7 @@ def transform(self, X): X_int[X_int > to_drop] -= 1 X_mask &= keep_cells - n_values = self._get_n_transformed_features() + n_values = self._compute_n_features_outs() mask = X_mask.ravel() feature_indices = np.cumsum([0] + n_values) @@ -812,7 +820,8 @@ def inverse_transform(self, X): n_samples, _ = X.shape n_features = len(self.categories_) - transformed_features = self._get_transformed_categories() + transformed_features = [self._compute_transformed_categories(i) + for i, _ in enumerate(self.categories_)] n_features_out = sum(cats.shape[0] for cats in transformed_features) # validate shape of passed X @@ -905,7 +914,8 @@ def get_feature_names(self, input_features=None): Array of feature names. """ check_is_fitted(self) - cats = self._get_transformed_categories() + cats = [self._compute_transformed_categories(i) + for i, _ in enumerate(self.categories_)] if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] elif len(input_features) != len(cats): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index b664f01ff7220..f75706fb529cd 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -75,7 +75,7 @@ def test_one_hot_encoder_handle_unknown(handle_unknown): # Raise error if handle_unknown is neither ignore or error. oh = OneHotEncoder(handle_unknown='42') - with pytest.raises(ValueError, match='handle_unknown should be either'): + with pytest.raises(ValueError, match='handle_unknown should be one of'): oh.fit(X) @@ -1248,7 +1248,7 @@ def test_ohe_ignore_deprecated(): ohe = OneHotEncoder(handle_unknown='ignore') msg = (r"handle_unknown='ignore' is deprecated in favor of 'auto' in " - r"version 0\.24 and will be removed in version 0\.26") + r"version 1\.0 and will be removed in version 1\.2") with pytest.warns(FutureWarning, match=msg): ohe.fit(X_train) From 1922b326e64e33f51e4ca1d94be78b6c9c5b5cd6 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 4 Mar 2021 14:24:03 -0500 Subject: [PATCH 54/92] ENH Neater code --- sklearn/preprocessing/_encoders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 87c403984aa8b..754ee051b0b2f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -645,17 +645,17 @@ def _compute_transformed_categories(self, i): return cats # infrequent is enabled - infreq_idx = self.infrequent_indices_[i] - if infreq_idx is None: + infreq_map = self._default_to_infrequent_mappings[i] + if infreq_map is None: return cats - frequent_indices = np.setdiff1d(np.arange(len(cats)), infreq_idx) + frequent_mask = infreq_map < infreq_map.max() if cats.dtype.kind in 'US' and 'infrequent' in cats: infrequent_cat = 'infrequent_sklearn' else: infrequent_cat = 'infrequent' - return np.concatenate((cats[frequent_indices], + return np.concatenate((cats[frequent_mask], np.array([infrequent_cat], dtype=object))) def _compute_n_features_outs(self): From 91fa58be9a22bf64c74bbdf547c3645494980c96 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 4 Mar 2021 17:28:48 -0500 Subject: [PATCH 55/92] DOC Update explaination for auto --- doc/modules/preprocessing.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 615b91e175f33..2518564dc7169 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -543,8 +543,9 @@ features, it can often be better to specify ``handle_unknown='auto'`` instead of setting the ``categories`` manually as above. When ``handle_unknown='auto'`` is specified and unknown categories are encountered during transform, no error will be raised but the resulting one-hot encoded -columns for this feature will be all zeros -(``handle_unknown='auto'`` is only supported for one-hot encoding):: +columns for this feature will be all zeros or considered as an infrequent +category if enabled. (``handle_unknown='auto'`` is only supported for one-hot +encoding):: >>> enc = preprocessing.OneHotEncoder(handle_unknown='auto') >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] From a68ce3156e20436e7a4ce7243302f1d47cfbdb33 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 4 Apr 2021 11:50:54 -0400 Subject: [PATCH 56/92] Update sklearn/preprocessing/_encoders.py Co-authored-by: Roman Yurchak --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 754ee051b0b2f..0fb6900106acb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -406,7 +406,7 @@ def _validate_keywords(self): "1 or a float in (0.0, 1.0); got the " f"integer {self.min_frequency}") else: # float - if not 0.0 < self.min_frequency < 1.0: + if not (0.0 < self.min_frequency < 1.0): raise ValueError("min_frequency must be an integer at least " "1 or a float in (0.0, 1.0); got the " f"float {self.min_frequency}") From 3e305ef2155050db9f8f04cb96db06773a030635 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 5 Apr 2021 13:24:34 -0400 Subject: [PATCH 57/92] TST Uses docstring instead of comments --- sklearn/preprocessing/_encoders.py | 2 +- sklearn/preprocessing/tests/test_encoders.py | 42 ++++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8a8a45dcf6b5d..db001a15b7df5 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -753,7 +753,7 @@ def transform(self, X): """ check_is_fitted(self) # validation of X happens in _check_X called by _transform - warn_on_unknown = (self.handle_unknown == "ignore" + warn_on_unknown = (self.handle_unknown in {"ignore", "auto"} and self.drop is not None) X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan', diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 848475f16800d..19fd764c998c4 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -993,9 +993,9 @@ def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): def test_ohe_infrequent_two_levels_user_cats(): - # Test that the order of the categories provided by a user is respected. - # Specifically, the infrequent_indices_ correspond to the user provided - # categories. 
+ """Test that the order of the categories provided by a user is respected. + Specifically, the infrequent_indices_ correspond to the user provided + categories.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], @@ -1022,9 +1022,9 @@ def test_ohe_infrequent_two_levels_user_cats(): def test_ohe_infrequent_three_levels_user_cats(): - # Test that the order of the categories provided by a user is respected. - # In this case 'c' is encoded as the first category and 'b' is encoded - # as the second one + """Test that the order of the categories provided by a user is respected. + In this case 'c' is encoded as the first category and 'b' is encoded + as the second one.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], dtype=object).T @@ -1053,7 +1053,7 @@ def test_ohe_infrequent_three_levels_user_cats(): def test_ohe_infrequent_multiple_categories(): - # Test infrequent categories with feature matrix with 3 features + """Test infrequent categories with feature matrix with 3 features.""" X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 5, 1, 1, 10, 5, 5, 0], @@ -1129,7 +1129,8 @@ def test_ohe_infrequent_multiple_categories(): def test_ohe_infrequent_multiple_categories_dtypes(): - # Test infrequent categories with a pandas dataframe with multiple dtypes + """Test infrequent categories with a pandas dataframe with multiple dtypes. + """ pd = pytest.importorskip("pandas") X = pd.DataFrame( @@ -1194,6 +1195,7 @@ def test_ohe_infrequent_multiple_categories_dtypes(): @pytest.mark.parametrize("min_frequency", [21]) def test_ohe_infrequent_one_level_errors(min_frequency): + """All user provided categories are infrequent.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T ohe = OneHotEncoder(handle_unknown='auto', sparse=False, @@ -1206,7 +1208,7 @@ def test_ohe_infrequent_one_level_errors(min_frequency): @pytest.mark.parametrize("kwargs", [{'min_frequency': 2, 'max_categories': 3}]) def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): - # All user provided categories are infrequent + """All user provided categories are infrequent.""" X_train = np.array([['e'] * 3], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], @@ -1345,11 +1347,14 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, assert np.isnan(ohe.categories_[0][-1]) -def test_ohe_drop_first_handle_unknown_ignore_warns(): - """Check drop='first' and handle_unknown='ignore' during transform.""" +@pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) +def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): + """Check drop='first' and handle_unknown='ignore'/'auto' during transform. 
+ """ X = [['a', 0], ['b', 2], ['b', 1]] - ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore') + ohe = OneHotEncoder(drop='first', sparse=False, + handle_unknown=handle_unknown) X_trans = ohe.fit_transform(X) X_expected = np.array([ @@ -1375,12 +1380,13 @@ def test_ohe_drop_first_handle_unknown_ignore_warns(): assert_array_equal(X_inv, np.array([['a', 0]], dtype=object)) -def test_ohe_drop_if_binary_handle_unknown_ignore_warns(): +@pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) +def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): """Check drop='if_binary' and handle_unknown='ignore' during transform.""" X = [['a', 0], ['b', 2], ['b', 1]] ohe = OneHotEncoder(drop='if_binary', sparse=False, - handle_unknown='ignore') + handle_unknown=handle_unknown) X_trans = ohe.fit_transform(X) X_expected = np.array([ @@ -1406,13 +1412,15 @@ def test_ohe_drop_if_binary_handle_unknown_ignore_warns(): assert_array_equal(X_inv, np.array([['a', None]], dtype=object)) -def test_ohe_drop_first_explicit_categories(): - """Check drop='first' and handle_unknown='ignore' during fit with +@pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) +def test_ohe_drop_first_explicit_categories(handle_unknown): + """Check drop='first' and handle_unknown='ignore'/'auto' during fit with categories passed in.""" X = [['a', 0], ['b', 2], ['b', 1]] - ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore', + ohe = OneHotEncoder(drop='first', sparse=False, + handle_unknown=handle_unknown, categories=[['b', 'a'], [1, 2]]) ohe.fit(X) From fec44b2d5e13b7bd450b571eab8f6658d263138a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 5 Apr 2021 13:31:25 -0400 Subject: [PATCH 58/92] TST Remove call to fit --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 19fd764c998c4..4fa24334b1b9e 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -844,7 +844,7 @@ def test_ohe_infrequent_infrequent_is_a_cat(): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['infrequent'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(handle_unknown='auto', sparse=False, - max_categories=3).fit(X_train) + max_categories=3) ohe.fit(X_train) X_test = [['b'], ['a'], ['infrequent'], ['d']] From e4ad66583ef335f6cbd0e3560bc23528548f25b3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 5 Apr 2021 13:36:07 -0400 Subject: [PATCH 59/92] TST Spelling error --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 4fa24334b1b9e..060164ecd183e 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -943,7 +943,7 @@ def test_ohe_infrequent_three_levels(kwargs): def test_ohe_infrequent_handle_unknown_error(): - """Test that different parameters for combing 'a', and 'd' into + """Test that different parameters for combining 'a', and 'd' into the infrequent category works as expected.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T From 10b8aec2cfce85136e1da005a6474c577cd9c7f2 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 5 Apr 2021 15:10:05 -0400 Subject: [PATCH 60/92] ENH Adds support for drop + infrequent categories --- sklearn/preprocessing/_encoders.py | 55 +++++++++++++------- sklearn/preprocessing/tests/test_encoders.py | 46 +++++++++++++++- 2 files changed, 79 insertions(+), 22 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index db001a15b7df5..be58a90681c04 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -249,6 +249,9 @@ class OneHotEncoder(_BaseEncoder): - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. + If there are infrequent categories and drop selects any of the + infrequent categories, than the whole category is dropped. + .. versionchanged:: 0.23 Added option 'if_binary'. @@ -422,11 +425,6 @@ def _validate_keywords(self): self.min_frequency < 1.0) ) - # validates infrequent category features - if self.drop is not None and self._infrequent_enabled: - raise ValueError("infrequent categories are not supported when " - "drop is specified") - # TODO: Remove when handle_unknown='ignore' is deprecated if self.handle_unknown == 'ignore': warnings.warn("handle_unknown='ignore' is deprecated in favor " @@ -443,8 +441,16 @@ def _compute_drop_idx(self): if self.drop == 'first': return np.zeros(len(self.categories_), dtype=object) elif self.drop == 'if_binary': - return np.array([0 if len(cats) == 2 else None - for cats in self.categories_], dtype=object) + n_features_out_no_drop = [len(cat) for cat in self.categories_] + if self._infrequent_enabled: + for i, infreq_idx in enumerate(self.infrequent_indices_): + if infreq_idx is None: + continue + n_features_out_no_drop[i] -= (infreq_idx.size - 1) + + return np.array([0 if n_features_out == 2 else None + for n_features_out in n_features_out_no_drop], + dtype=object) else: msg = ( "Wrong input for parameter `drop`. 
Expected " @@ -469,12 +475,25 @@ def _compute_drop_idx(self): len(self.drop))) missing_drops = [] drop_indices = [] + + def _convert_to_infrequent_idx(idx, col_idx): + if not self._infrequent_enabled: + return idx + + default_to_infrequent = ( + self._default_to_infrequent_mappings[col_idx] + ) + if default_to_infrequent is None: + return idx + return default_to_infrequent[idx] + for col_idx, (val, cat_list) in enumerate(zip(self.drop, self.categories_)): if not is_scalar_nan(val): drop_idx = np.where(cat_list == val)[0] if drop_idx.size: # found drop idx - drop_indices.append(drop_idx[0]) + drop_indices.append( + _convert_to_infrequent_idx(drop_idx[0], col_idx)) else: missing_drops.append((col_idx, val)) continue @@ -482,7 +501,8 @@ def _compute_drop_idx(self): # val is nan, find nan in categories manually for cat_idx, cat in enumerate(cat_list): if is_scalar_nan(cat): - drop_indices.append(cat_idx) + drop_indices.append( + _convert_to_infrequent_idx(cat_idx, col_idx)) break else: # loop did not break thus drop is missing missing_drops.append((col_idx, val)) @@ -661,18 +681,13 @@ def _compute_transformed_categories(self, i): def _compute_n_features_outs(self): """Compute the n_features_out for each input feature.""" - if self.drop_idx_ is not None: - output = [] - for drop_idx, cats in zip(self.drop_idx_, self.categories_): - if drop_idx is None: - output.append(len(cats)) - else: - output.append(len(cats) - 1) - return output - - # drop is None output = [len(cats) for cats in self.categories_] + if self.drop_idx_ is not None: + for i, drop_idx in enumerate(self.drop_idx_): + if drop_idx is not None: + output[i] -= 1 + if not self._infrequent_enabled: return output @@ -681,7 +696,7 @@ def _compute_n_features_outs(self): for i, infreq_idx in enumerate(self.infrequent_indices_): if infreq_idx is None: continue - output[i] = output[i] - infreq_idx.size + 1 + output[i] -= (infreq_idx.size - 1) return output diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 060164ecd183e..0d5a1e3aea125 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -903,6 +903,19 @@ def test_ohe_infrequent_two_levels(kwargs, categories): feature_names = ohe.get_feature_names() assert_array_equal(['x0_b', 'x0_infrequent'], feature_names) + # dropping the first category which is 'b' + drops = ['if_binary', 'first', ['b']] + X_test = [['b'], ['c']] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[0], [1]], ohe.transform(X_test)) + + # dropping categories that are infrequent will remove the entire category + drops = [['a'], ['c'], ['d']] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[1], [0]], ohe.transform(X_test)) + @pytest.mark.parametrize("kwargs", [ {'max_categories': 3}, @@ -941,6 +954,19 @@ def test_ohe_infrequent_three_levels(kwargs): feature_names = ohe.get_feature_names() assert_array_equal(['x0_b', 'x0_c', 'x0_infrequent'], feature_names) + # dropping the first category which is 'b' + drops = ['first', ['b']] + X_test = [['b'], ['c'], ['d']] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) + + # dropping categories that are infrequent will remove the entire category + drops = [['a'], ['d']] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[1, 0], [0, 1], [0, 0]], ohe.transform(X_test)) + def 
test_ohe_infrequent_handle_unknown_error(): """Test that different parameters for combining 'a', and 'd' into @@ -991,6 +1017,18 @@ def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) + # 'a' is dropped + drops = ['first', 'if_binary', ['a']] + X_test = [['a'], ['c']] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[0], [1]], ohe.transform(X_test)) + + # dropping 'c' means the infrequent category is dropped because + # 'c' in infrequent + ohe.set_params(drop=['c']).fit(X_train) + assert_allclose([[1], [0]], ohe.transform(X_test)) + def test_ohe_infrequent_two_levels_user_cats(): """Test that the order of the categories provided by a user is respected. @@ -1226,8 +1264,6 @@ def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): ({'max_categories': -2}, 'max_categories must be greater than 1'), ({'min_frequency': -1}, 'min_frequency must be an integer at least'), ({'min_frequency': 1.1}, 'min_frequency must be an integer at least'), - ({'max_categories': 2, 'drop': 'first', 'handle_unknown': 'error'}, - "infrequent categories are not supported when drop is specified"), ({'handle_unknown': 'ignore', 'max_categories': 2}, "infrequent categories are only supported when handle_unknown is " "'error' or 'auto'") @@ -1347,6 +1383,8 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, assert np.isnan(ohe.categories_[0][-1]) +# TODO: Remove when 'ignore' is deprecated in 0.26 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): """Check drop='first' and handle_unknown='ignore'/'auto' during transform. @@ -1380,6 +1418,8 @@ def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): assert_array_equal(X_inv, np.array([['a', 0]], dtype=object)) +# TODO: Remove when 'ignore' is deprecated in 0.26 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): """Check drop='if_binary' and handle_unknown='ignore' during transform.""" @@ -1412,6 +1452,8 @@ def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): assert_array_equal(X_inv, np.array([['a', None]], dtype=object)) +# TODO: Remove when 'ignore' is deprecated in 0.26 +@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) def test_ohe_drop_first_explicit_categories(handle_unknown): """Check drop='first' and handle_unknown='ignore'/'auto' during fit with From ef86eb1fee42c6be9012b95acdf1652abe8f36b0 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 5 Apr 2021 16:36:17 -0400 Subject: [PATCH 61/92] ENH Adds infrequent_if_exist option --- doc/modules/preprocessing.rst | 27 ++++--- sklearn/preprocessing/_encoders.py | 69 ++++++++-------- sklearn/preprocessing/tests/test_encoders.py | 85 ++++++-------------- 3 files changed, 74 insertions(+), 107 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 0225c37f8c14e..6a4015a0cdf05 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -539,18 +539,19 @@ dataset:: array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]]) If there is a possibility that the training data might have missing categorical -features, it can often be better to specify ``handle_unknown='auto'`` instead -of setting the ``categories`` manually as above. When -``handle_unknown='auto'`` is specified and unknown categories are encountered -during transform, no error will be raised but the resulting one-hot encoded -columns for this feature will be all zeros or considered as an infrequent -category if enabled. (``handle_unknown='auto'`` is only supported for one-hot +features, it can often be better to specify +`handle_unknown='infrequent_if_exist'` instead of setting the `categories` +manually as above. When `handle_unknown='infrequent_if_exist'` is specified +and unknown categories are encountered during transform, no error will be +raised but the resulting one-hot encoded columns for this feature will be all +zeros or considered as an infrequent category if enabled. +(`handle_unknown='infrequent_if_exist'` is only supported for one-hot encoding):: - >>> enc = preprocessing.OneHotEncoder(handle_unknown='auto') + >>> enc = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist') >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] >>> enc.fit(X) - OneHotEncoder(handle_unknown='auto') + OneHotEncoder(handle_unknown='infrequent_if_exist') >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray() array([[1., 0., 0., 0., 0., 0.]]) @@ -677,10 +678,10 @@ infrequent:: [0., 1., 0.], [0., 0., 1.]]) -By setting handle_unknown to `'auto'`, unknown categories will be considered -infrequent:: +By setting handle_unknown to `'infrequent_if_exist'`, unknown categories will +be considered infrequent:: - >>> enc = preprocessing.OneHotEncoder(handle_unknown='auto', + >>> enc = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist', ... min_frequency=6).fit(X) >>> enc.transform([['dragon']]).toarray() array([[0., 0., 1.]]) @@ -691,8 +692,8 @@ feature name:: >>> enc.get_feature_names() array(['x0_cat', 'x0_rabbit', 'x0_infrequent'], dtype=object) -When this `'handle_unknown'` is set to 'auto' and an unknown category is -encountered in transform: +When this `'handle_unknown'` is set to `'infrequent_if_exist'` and an unknown +category is encountered in transform: 1. If infrequent category support was not configured or there was no infrequent category during training, the resulting one-hot encoded columns diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index be58a90681c04..fb7483fe44bb6 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -261,22 +261,30 @@ class OneHotEncoder(_BaseEncoder): dtype : number type, default=float Desired dtype of output. 
- handle_unknown : {'error', 'ignore', 'auto'}, default='error' - Whether to raise an error or ignore if an unknown categorical feature - is present during transform (default is to raise). When this parameter - is set to 'auto' and an unknown category is encountered during - transform, the resulting one-hot encoded columns for this feature - will be all zeros. In the inverse transform, an unknown category - will be denoted as None. Read more in the - :ref:`User Guide ` + handle_unknown : {'error', 'ignore', 'infrequent_if_exist'}, \ + default='error' + Specifies a methodology for handling unknown categories during + :meth:`transform`. + + - 'error' : Raise an error if an unknown categorical feature + is present during transform. + - 'ignore' : When an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + - 'infrequent_if_exist' : When an unknown category is encountered + during transform, the resulting one-hot encoded columns for this + feature will map to the infrequent category if it exists. In the + inverse transform, an unknown category will be denoted as + 'infrequent' if the category if it exists. + Read more in the + :ref:`User Guide ` + If a infrequent category does not exist, then :meth:`transform` + and :meth:`inverse_transform` will handle as 'ignore'. .. versionadded:: 0.24 - `'auto'` was added to automatically handle unknown categories - and infrequent categories. - - .. deprecated:: 1.0 - `'ignore'` is deprecated in favor of `'auto'`. This option will be - removed in 1.2. + `'infrequent_if_exist'` was added to automatically handle unknown + categories and infrequent categories. min_frequency : int or float, default=1 Specifies the minimum frequency below which a category will be @@ -349,10 +357,10 @@ class OneHotEncoder(_BaseEncoder): One can discard categories not seen during `fit`: - >>> enc = OneHotEncoder(handle_unknown='auto') + >>> enc = OneHotEncoder(handle_unknown='ignore') >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] >>> enc.fit(X) - OneHotEncoder(handle_unknown='auto') + OneHotEncoder(handle_unknown='ignore') >>> enc.categories_ [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() @@ -383,8 +391,8 @@ class OneHotEncoder(_BaseEncoder): """ @_deprecate_positional_args - def __init__(self, *, categories='auto', drop=None, sparse=True, - dtype=np.float64, handle_unknown='error', + def __init__(self, *, categories='auto', drop=None, + sparse=True, dtype=np.float64, handle_unknown='error', min_frequency=1, max_categories=None): self.categories = categories self.sparse = sparse @@ -396,9 +404,10 @@ def __init__(self, *, categories='auto', drop=None, sparse=True, def _validate_keywords(self): - if self.handle_unknown not in ('error', 'ignore', 'auto'): - msg = (f"handle_unknown should be one of 'error', 'ignore', 'auto'" - f"got {self.handle_unknown}.") + if self.handle_unknown not in {'error', 'ignore', + 'infrequent_if_exist'}: + msg = (f"handle_unknown should be one of 'error', 'ignore', " + f"'infrequent_if_exist' got {self.handle_unknown}.") raise ValueError(msg) if self.max_categories is not None and self.max_categories <= 1: @@ -425,15 +434,6 @@ def _validate_keywords(self): self.min_frequency < 1.0) ) - # TODO: Remove when handle_unknown='ignore' is deprecated - if self.handle_unknown == 'ignore': - 
warnings.warn("handle_unknown='ignore' is deprecated in favor " - "of 'auto' in version 1.0 and will be removed in " - "version 1.2", FutureWarning) - if self._infrequent_enabled: - raise ValueError("infrequent categories are only supported " - "when handle_unknown is 'error' or 'auto'") - def _compute_drop_idx(self): if self.drop is None: return None @@ -632,7 +632,7 @@ def _map_to_infrequent_categories(self, X_int, X_mask): continue X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0] - if self.handle_unknown == 'auto': + if self.handle_unknown == 'infrequent_if_exist': # All the unknown values are now mapped to the # infrequent_idx[0], which makes the unknown values valid # This is needed in `transform` when the encoding is formed @@ -768,8 +768,9 @@ def transform(self, X): """ check_is_fitted(self) # validation of X happens in _check_X called by _transform - warn_on_unknown = (self.handle_unknown in {"ignore", "auto"} - and self.drop is not None) + warn_on_unknown = ( + self.drop is not None and + self.handle_unknown in {"ignore", "infrequent_if_exist"}) X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan', warn_on_unknown=warn_on_unknown) @@ -881,7 +882,7 @@ def inverse_transform(self, X): X_tr[:, i] = cats[labels] if (self.handle_unknown == 'ignore' or - (self.handle_unknown == 'auto' and + (self.handle_unknown == 'infrequent_if_exist' and infrequent_indices[i] is None)): unknown = np.asarray(sub.sum(axis=1) == 0).flatten() # ignored unknown categories: we have a row of all zero diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 0d5a1e3aea125..dc3fa18033ee6 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -49,9 +49,7 @@ def test_one_hot_encoder_diff_n_features(): enc.transform(X2) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) +@pytest.mark.parametrize("handle_unknown", ['ignore', 'infrequent_if_exist']) def test_one_hot_encoder_handle_unknown(handle_unknown): X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) X2 = np.array([[4, 1, 1]]) @@ -89,9 +87,7 @@ def test_one_hot_encoder_not_fitted(): enc.transform(X) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) +@pytest.mark.parametrize("handle_unknown", ['ignore', 'infrequent_if_exist']) def test_one_hot_encoder_handle_unknown_strings(handle_unknown): X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1)) X2 = np.array(['55555', '22']).reshape((-1, 1)) @@ -228,9 +224,7 @@ def test_one_hot_encoder(X): assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) +@pytest.mark.parametrize("handle_unknown", ['ignore', 'infrequent_if_exist']) @pytest.mark.parametrize('sparse_', [False, True]) @pytest.mark.parametrize('drop', [None, 'first']) def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop): @@ -387,9 +381,7 @@ def test_one_hot_encoder_categories(X, cat_exp, cat_dtype): assert np.issubdtype(res.dtype, cat_dtype) -# TODO: Remove when 'ignore' is deprecated in 0.26 
-@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ['ignore', 'auto']) +@pytest.mark.parametrize("handle_unknown", ['ignore', 'infrequent_if_exist']) @pytest.mark.parametrize("X, X2, cats, cat_dtype", [ (np.array([['a', 'b']], dtype=object).T, np.array([['a', 'd']], dtype=object).T, @@ -843,7 +835,7 @@ def test_ohe_infrequent_infrequent_is_a_cat(): mangle this into 'infrequent_sklearn'.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['infrequent'] * 10 + ['d'] * 3]).T - ohe = OneHotEncoder(handle_unknown='auto', sparse=False, + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, max_categories=3) ohe.fit(X_train) @@ -881,7 +873,7 @@ def test_ohe_infrequent_two_levels(kwargs, categories): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(categories=categories, - handle_unknown='auto', sparse=False, + handle_unknown='infrequent_if_exist', sparse=False, **kwargs).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[0, 2, 3]]) @@ -931,7 +923,7 @@ def test_ohe_infrequent_three_levels(kwargs): the infrequent category works as expected.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T - ohe = OneHotEncoder(handle_unknown='auto', sparse=False, + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, **kwargs).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[0, 3]]) @@ -1003,7 +995,7 @@ def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): X_train = np.array([['a'] * 5 + ['e'] * 30], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], - sparse=False, handle_unknown='auto', + sparse=False, handle_unknown='infrequent_if_exist', **kwargs).fit(X_train) X_test = [['a'], ['b'], ['c'], ['d'], ['e']] @@ -1037,7 +1029,7 @@ def test_ohe_infrequent_two_levels_user_cats(): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], - sparse=False, handle_unknown='auto', + sparse=False, handle_unknown='infrequent_if_exist', max_categories=2).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[0, 1, 2]]) @@ -1067,7 +1059,7 @@ def test_ohe_infrequent_three_levels_user_cats(): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'b', 'a']], - sparse=False, handle_unknown='auto', + sparse=False, handle_unknown='infrequent_if_exist', max_categories=3).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[1, 3]]) @@ -1098,7 +1090,7 @@ def test_ohe_infrequent_multiple_categories(): [1, 0, 1, 0, 1, 0, 1, 0, 1]] ohe = OneHotEncoder(categories='auto', max_categories=3, - handle_unknown='auto') + handle_unknown='infrequent_if_exist') # X[:, 0] 1 and 2 is infrequent # X[:, 1] 1 and 10 are infrequent # X[:, 2] nothing is infrequent @@ -1177,7 +1169,7 @@ def test_ohe_infrequent_multiple_categories_dtypes(): columns=['str', 'int']) ohe = OneHotEncoder(categories='auto', max_categories=3, - handle_unknown='auto') + handle_unknown='infrequent_if_exist') # X[:, 0] 'a', 'b', 'c' have the same frequency. 
'a' and 'b' will be # considered infrequent because they are greater @@ -1236,7 +1228,7 @@ def test_ohe_infrequent_one_level_errors(min_frequency): """All user provided categories are infrequent.""" X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T - ohe = OneHotEncoder(handle_unknown='auto', sparse=False, + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, min_frequency=min_frequency) msg = "All categories in column 0 are infrequent" @@ -1250,45 +1242,28 @@ def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): X_train = np.array([['e'] * 3], dtype=object).T ohe = OneHotEncoder(categories=[['c', 'd', 'a', 'b']], - sparse=False, handle_unknown='auto', **kwargs) + sparse=False, handle_unknown='infrequent_if_exist', + **kwargs) msg = "All categories in column 0 are infrequent" with pytest.raises(ValueError, match=msg): ohe.fit(X_train) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") @pytest.mark.parametrize("kwargs, error_msg", [ ({'max_categories': 1}, 'max_categories must be greater than 1'), ({'max_categories': -2}, 'max_categories must be greater than 1'), ({'min_frequency': -1}, 'min_frequency must be an integer at least'), ({'min_frequency': 1.1}, 'min_frequency must be an integer at least'), - ({'handle_unknown': 'ignore', 'max_categories': 2}, - "infrequent categories are only supported when handle_unknown is " - "'error' or 'auto'") ]) def test_ohe_infrequent_invalid_parameters_error(kwargs, error_msg): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T - default_kwargs = {**{'handle_unknown': 'auto'}, **kwargs} - ohe = OneHotEncoder(**default_kwargs) - + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', **kwargs) with pytest.raises(ValueError, match=error_msg): ohe.fit(X_train) -# TODO: Remove in 0.26 when 'ignore' is deprecated -def test_ohe_ignore_deprecated(): - X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 2]).T - ohe = OneHotEncoder(handle_unknown='ignore') - - msg = (r"handle_unknown='ignore' is deprecated in favor of 'auto' in " - r"version 1\.0 and will be removed in version 1\.2") - with pytest.warns(FutureWarning, match=msg): - ohe.fit(X_train) - - @pytest.mark.parametrize('input_dtype', ['O', 'U']) @pytest.mark.parametrize('category_dtype', ['O', 'U']) @pytest.mark.parametrize('array_type', ['list', 'array', 'dataframe']) @@ -1316,9 +1291,7 @@ def test_encoders_unicode_categories(input_dtype, category_dtype, array_type): assert_array_equal(X_trans, expected) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ['auto', 'ignore']) +@pytest.mark.parametrize("handle_unknown", ['infrequent_if_exist', 'ignore']) @pytest.mark.parametrize("missing_value", [np.nan, None]) def test_ohe_missing_values_get_feature_names(missing_value, handle_unknown): # encoder with missing values with object dtypes @@ -1347,9 +1320,7 @@ def test_ohe_missing_value_support_pandas(): assert_allclose(Xtr, expected_df_trans) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ['auto', 'ignore']) +@pytest.mark.parametrize("handle_unknown", ['infrequent_if_exist', 'ignore']) @pytest.mark.parametrize('pd_nan_type', ['pd.NA', 'np.nan']) def 
test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown): @@ -1383,12 +1354,10 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, assert np.isnan(ohe.categories_[0][-1]) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"]) def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): - """Check drop='first' and handle_unknown='ignore'/'auto' during transform. - """ + """Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist' + during transform.""" X = [['a', 0], ['b', 2], ['b', 1]] ohe = OneHotEncoder(drop='first', sparse=False, @@ -1418,9 +1387,7 @@ def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): assert_array_equal(X_inv, np.array([['a', 0]], dtype=object)) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"]) def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): """Check drop='if_binary' and handle_unknown='ignore' during transform.""" X = [['a', 0], ['b', 2], ['b', 1]] @@ -1452,12 +1419,10 @@ def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): assert_array_equal(X_inv, np.array([['a', None]], dtype=object)) -# TODO: Remove when 'ignore' is deprecated in 0.26 -@pytest.mark.filterwarnings("ignore:handle_unknown='ignore':FutureWarning") -@pytest.mark.parametrize("handle_unknown", ["ignore", "auto"]) +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"]) def test_ohe_drop_first_explicit_categories(handle_unknown): - """Check drop='first' and handle_unknown='ignore'/'auto' during fit with - categories passed in.""" + """Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist' + during fit with categories passed in.""" X = [['a', 0], ['b', 2], ['b', 1]] From 61d1ddba004803c4249fc00ef4883b3158ce40ed Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 12:24:38 -0400 Subject: [PATCH 62/92] DOC Address comments for user guide --- doc/modules/preprocessing.rst | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 6a4015a0cdf05..73099fc300aa9 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -671,8 +671,8 @@ infrequent:: >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + ... ['snake'] * 3]).T - >>> enc = preprocessing.OneHotEncoder(min_frequency=6).fit(X) - >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]).toarray() + >>> enc = preprocessing.OneHotEncoder(min_frequency=6, sparse=False).fit(X) + >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']])) array([[0., 0., 1.], [1., 0., 0.], [0., 1., 0.], @@ -681,9 +681,10 @@ infrequent:: By setting handle_unknown to `'infrequent_if_exist'`, unknown categories will be considered infrequent:: - >>> enc = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist', - ... min_frequency=6).fit(X) - >>> enc.transform([['dragon']]).toarray() + >>> enc = preprocessing.OneHotEncoder( + ... 
handle_unknown='infrequent_if_exist', sparse=False, min_frequency=6) + >>> enc = enc.fit(X) + >>> enc.transform(np.array([['dragon']])) array([[0., 0., 1.]]) :meth:`OneHotEncoder.get_feature_names` uses 'infrequent' as the infrequent @@ -711,8 +712,9 @@ the output. This will result in all but the `'cat'` category to be considered infrequent, leading to two features, one for `'cat'` and one for infrequent categories - which are all the others:: - >>> enc = preprocessing.OneHotEncoder(max_categories=2).fit(X) - >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]).toarray() + >>> enc = preprocessing.OneHotEncoder(max_categories=2, sparse=False) + >>> env = enc.fit(X) + >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']])) array([[0., 1.], [1., 0.], [0., 1.], From 2493223cd644818e2391d957b46d1532c34759c0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 12:57:12 -0400 Subject: [PATCH 63/92] DOC Address comments for whats_new --- doc/whats_new/v1.0.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index cf8198cc59e0f..8cbfbc4f41a09 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -105,7 +105,7 @@ Changelog - |Fix| Improved convergence detection based on center change in :class:`cluster.MiniBatchKMeans` which was almost never achievable. :pr:`17622` by :user:`Jérémie du Boisberranger `. - + - |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. @@ -356,10 +356,10 @@ Changelog ............................ - |Feature| :class:`preprocessing.OneHotEncoder` now supports grouping - infrequent categories ito a single feature. Infrequent categories is - enabled by setting `handle_unknown` to `'auto'` or `'error'` and specifying - how to select infrequent categories with `min_frequency` or `max_categories`. - :pr:`16018` by `Thomas Fan`_. + infrequent categories into a single feature. Grouping infrequent categories + is enabled by setting `handle_unknown` to `'infrequent_if_exist'` or + `'error'` and specifying how to select infrequent categories with + `min_frequency` or `max_categories`. :pr:`16018` by `Thomas Fan`_. - |Feature| The new :class:`preprocessing.SplineTransformer` is a feature preprocessing tool for the generation of B-splines, parametrized by the From a9f643f16b6b0e1eef3829e9870b215b2383543b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 14:49:39 -0400 Subject: [PATCH 64/92] DOC Update docstring based on comments --- sklearn/preprocessing/_encoders.py | 18 ++++++++---------- sklearn/utils/_encode.py | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 50c7abb998cd5..65a18d3dc1ecb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -236,9 +236,7 @@ class OneHotEncoder(_BaseEncoder): Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data - into a neural network or an unregularized regression. Drop is not - support when `min_frequency` or `max_categories` is set to combine - infrequent categories. + into a neural network or an unregularized regression. 
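        Sketch for illustration only (not part of the diff): with the
        restriction above removed, dropping a category that is infrequent drops
        the whole grouped infrequent column, as exercised by the tests added
        earlier in this series::

            >>> import numpy as np
            >>> from sklearn.preprocessing import OneHotEncoder
            >>> X = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T
            >>> enc = OneHotEncoder(max_categories=2, sparse=False, drop=['a'],
            ...                     handle_unknown='infrequent_if_exist').fit(X)
            >>> # only 'b' is frequent; 'a' is infrequent, so drop=['a'] removes
            >>> # the grouped infrequent column and a single 'b' column remains
            >>> enc.transform(np.array([['b'], ['c']]))
            array([[1.],
                   [0.]])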
However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, @@ -282,14 +280,14 @@ class OneHotEncoder(_BaseEncoder): - 'infrequent_if_exist' : When an unknown category is encountered during transform, the resulting one-hot encoded columns for this feature will map to the infrequent category if it exists. In the - inverse transform, an unknown category will be denoted as - 'infrequent' if the category if it exists. - Read more in the - :ref:`User Guide ` - If a infrequent category does not exist, then :meth:`transform` - and :meth:`inverse_transform` will handle as 'ignore'. + inverse transform, an unknown category will be mapped to the category + denoted `'infrequent'` if it exists. If the `'infrequent'` category + does not exist, then :meth:`transform` and :meth:`inverse_transform` + will handle an unknown category with `handle_unknown='ignore'`. Read + more in the + :ref:`User Guide `. - .. versionadded:: 0.24 + .. versionadded:: 1.0 `'infrequent_if_exist'` was added to automatically handle unknown categories and infrequent categories. diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 6e998320e75a8..8090a3d374c55 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -20,7 +20,7 @@ def _unique(values, *, return_inverse=False, return_counts=False): return_inverse : bool, default=False If True, also return the indices of the unique values. - return_count : bool, default=False + return_counts : bool, default=False If True, also return the number of times each unique item appears in values. From 1de557a830a43f4934675bcec627835107b18a51 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 15:19:12 -0400 Subject: [PATCH 65/92] CLN Update test with suggestions --- sklearn/preprocessing/_encoders.py | 6 +-- sklearn/preprocessing/tests/test_encoders.py | 56 ++++++++++---------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 65a18d3dc1ecb..c98e6cc5e6cca 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -251,8 +251,8 @@ class OneHotEncoder(_BaseEncoder): - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. - If there are infrequent categories and drop selects any of the - infrequent categories, than the whole category is dropped. + If there are infrequent categories and `drop` selects any of the + infrequent categories, then all these categories are dropped. .. versionadded:: 0.21 The parameter `drop` was added in 0.21. @@ -523,7 +523,7 @@ def _convert_to_infrequent_idx(idx, col_idx): return np.array(drop_indices, dtype=object) def _identify_infrequent(self, category_count, n_samples, col_idx): - """Compute the infrequent indices + """Compute the infrequent indices. 
Parameters ---------- diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 420605180bb00..99f8e5cc9f637 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1045,7 +1045,8 @@ def test_ohe_infrequent_two_levels_user_cats(): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - # The most frequent infrequent category is used for the inverse transform + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` expected_inv = [[col] for col in ['b'] + ['infrequent'] * 4] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) @@ -1075,7 +1076,8 @@ def test_ohe_infrequent_three_levels_user_cats(): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - # The most frequent infrequent category is used for the inverse transform + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` expected_inv = [['b'], ['infrequent'], ['c'], ['infrequent'], ['infrequent']] X_inv = ohe.inverse_transform(X_trans) @@ -1100,7 +1102,7 @@ def test_ohe_infrequent_multiple_categories(): assert_array_equal(ohe.infrequent_indices_[1], [1, 3]) assert_array_equal(ohe.infrequent_indices_[2], None) - # The most frequent infrequent category becomes the feature name + # 'infrequent' is used to denote the infrequent categories # For the first column, 1 and 2 have the same frequency. In this case, # 1 will be chosen to be the feature name because is smaller lexiconically feature_names = ohe.get_feature_names() @@ -1108,15 +1110,15 @@ def test_ohe_infrequent_multiple_categories(): 'x1_0', 'x1_5', 'x1_infrequent', 'x2_0', 'x2_1'], feature_names) - expected = [[1, 0, 0, 1, 0, 0, 0, 1], - [0, 0, 1, 1, 0, 0, 1, 0], - [0, 1, 0, 0, 1, 0, 0, 1], - [0, 1, 0, 0, 0, 1, 1, 0], - [0, 1, 0, 0, 0, 1, 0, 1], - [0, 1, 0, 0, 0, 1, 1, 0], - [0, 0, 1, 0, 1, 0, 0, 1], - [1, 0, 0, 0, 1, 0, 1, 0], - [0, 1, 0, 1, 0, 0, 0, 1]] + expected = [[1, 0, 0, 1, 0, 0, 0, 1], + [0, 0, 1, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 1, 0, 0, 0, 1, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 1]] assert_allclose(expected, X_trans) @@ -1147,8 +1149,8 @@ def test_ohe_infrequent_multiple_categories(): [3, 10, 0]] X_test_trans = ohe.transform(X_test) - expected = [[0, 0, 1, 0, 0, 1, 0, 1], - [0, 1, 0, 0, 0, 1, 1, 0]] + expected = [[0, 0, 1, 0, 0, 1, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0]] assert_allclose(expected, X_test_trans.toarray()) X_inv = ohe.inverse_transform(X_test_trans) @@ -1180,15 +1182,15 @@ def test_ohe_infrequent_multiple_categories_dtypes(): assert_allclose(ohe.infrequent_indices_[0], [0, 1]) assert_allclose(ohe.infrequent_indices_[1], [0, 1, 4]) - expected = [[0, 0, 1, 1, 0, 0], - [0, 1, 0, 0, 0, 1], - [1, 0, 0, 0, 0, 1], - [0, 1, 0, 0, 1, 0], - [0, 1, 0, 0, 1, 0], - [0, 0, 1, 0, 0, 1], - [1, 0, 0, 0, 0, 1], - [0, 0, 1, 0, 0, 1], - [0, 0, 1, 1, 0, 0]] + expected = [[0, 0, 1, 1, 0, 0], + [0, 1, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 0, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 0]] assert_allclose(expected, X_trans) @@ -1197,8 +1199,8 @@ def test_ohe_infrequent_multiple_categories_dtypes(): 'int': [14, 12]}, columns=['str', 'int']) - expected = [[0, 0, 1, 0, 0, 1], - [0, 1, 0, 0, 0, 1]] + expected = [[0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 0, 1]] X_test_trans 
= ohe.transform(X_test) assert_allclose(expected, X_test_trans.toarray()) @@ -1213,8 +1215,8 @@ def test_ohe_infrequent_multiple_categories_dtypes(): 'int': [12, 5]}, columns=['str', 'int']) X_test_trans = ohe.transform(X_test).toarray() - expected = [[1, 0, 0, 0, 0, 1], - [0, 0, 1, 1, 0, 0]] + expected = [[1, 0, 0, 0, 0, 1], + [0, 0, 1, 1, 0, 0]] assert_allclose(expected, X_test_trans) X_inv = ohe.inverse_transform(X_test_trans) From 058112e11d230fc40b8f17da54364186021abd4a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 15:54:22 -0400 Subject: [PATCH 66/92] ENH Adds computed property infrequent_categories_ --- sklearn/preprocessing/_encoders.py | 32 ++++++++++++++++---- sklearn/preprocessing/tests/test_encoders.py | 9 ++++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c98e6cc5e6cca..b8ac95ab6c012 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -332,12 +332,22 @@ class OneHotEncoder(_BaseEncoder): .. versionchanged:: 0.23 Added the possibility to contain `None` values. - infrequent_indices_ : list of arrays - Defined only when `min_frequency` or `max_categories` is set to a - non-default value. `infrequent_indices_[i]` is an array of indices - such that `categories_[i][infrequent_indices_[i]]` are all the - infrequent category labels. If the ith feature has no infrequent - categories `infrequent_indices_[i]` is None. + infrequent_categories_ : list of ndarray + Defined if infrequent categories are enabled by setting `min_frequency` + or `max_categories` to a non-default value. `infrequent_indices_[i]` + are the infrequent categories for feature `i`. If the feature `i` has + no infrequent categories `infrequent_categories_[i]` is None. + + .. versionadded:: 1.0 + + infrequent_indices_ : list of ndarray + Defined if infrequent categories are enabled by setting `min_frequency` + or `max_categories` to a non-default value. `infrequent_indices_[i]` is + an array of indices such that `categories_[i][infrequent_indices_[i]]` + are all the infrequent category labels. If the feature `i` has + no infrequent categories `infrequent_indices_[i]` is None. + + .. 
versionadded:: 1.0 See Also -------- @@ -407,6 +417,16 @@ def __init__(self, *, categories='auto', drop=None, self.min_frequency = min_frequency self.max_categories = max_categories + @property + def infrequent_categories_(self): + """Infrequent categories for each feature.""" + # raises an AttributeError if `infrequent_indices_` is not defined + infrequent_indices = self.infrequent_indices_ + return [ + None if indices is None else category[indices] + for category, indices in zip(self.categories_, infrequent_indices) + ] + def _validate_keywords(self): if self.handle_unknown not in {'error', 'ignore', diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 99f8e5cc9f637..401eb3a3598a7 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -876,6 +876,7 @@ def test_ohe_infrequent_two_levels(kwargs, categories): handle_unknown='infrequent_if_exist', sparse=False, **kwargs).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[0, 2, 3]]) + assert_array_equal(ohe.infrequent_categories_, [['a', 'c', 'd']]) X_test = [['b'], ['a'], ['c'], ['d'], ['e']] expected = np.array([ @@ -926,6 +927,7 @@ def test_ohe_infrequent_three_levels(kwargs): ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, **kwargs).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[0, 3]]) + assert_array_equal(ohe.infrequent_categories_, [['a', 'd']]) X_test = [['b'], ['a'], ['c'], ['d'], ['e']] expected = np.array([ @@ -1064,6 +1066,7 @@ def test_ohe_infrequent_three_levels_user_cats(): max_categories=3).fit(X_train) assert_array_equal(ohe.infrequent_indices_, [[1, 3]]) + assert_array_equal(ohe.infrequent_categories_, [['d', 'a']]) X_test = [['b'], ['a'], ['c'], ['d'], ['e']] expected = np.array([ @@ -1129,8 +1132,8 @@ def test_ohe_infrequent_multiple_categories(): # X[:, 2] does not have an infrequent category, thus it is encoded as all # zeros - expected = [[0, 1, 0, 0, 0, 1, 0, 0], - [0, 0, 1, 1, 0, 0, 0, 0]] + expected = [[0, 1, 0, 0, 0, 1, 0, 0], + [0, 0, 1, 1, 0, 0, 0, 0]] assert_allclose(expected, X_test_trans.toarray()) X_inv = ohe.inverse_transform(X_test_trans) @@ -1145,7 +1148,7 @@ def test_ohe_infrequent_multiple_categories(): ohe.transform(X_test) # only infrequent or known categories - X_test = [[1, 1, 1], + X_test = [[1, 1, 1], [3, 10, 0]] X_test_trans = ohe.transform(X_test) From 7ab2434163872417ad5270dcd6dcb363ce0ab16c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 15:56:21 -0400 Subject: [PATCH 67/92] DOC Adds where the infrequent column is located --- sklearn/preprocessing/_encoders.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b8ac95ab6c012..1b8ce92f827f9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -279,12 +279,13 @@ class OneHotEncoder(_BaseEncoder): will be denoted as None. - 'infrequent_if_exist' : When an unknown category is encountered during transform, the resulting one-hot encoded columns for this - feature will map to the infrequent category if it exists. In the - inverse transform, an unknown category will be mapped to the category - denoted `'infrequent'` if it exists. If the `'infrequent'` category - does not exist, then :meth:`transform` and :meth:`inverse_transform` - will handle an unknown category with `handle_unknown='ignore'`. 
Read - more in the + feature will map to the infrequent category if it exists. The + infrequent category will be mapped to the last position in the + encoding. During inverse transform, an unknown category will be + mapped to the category denoted `'infrequent'` if it exists. If the + `'infrequent'` category does not exist, then :meth:`transform` and + :meth:`inverse_transform` will handle an unknown category with + `handle_unknown='ignore'`. Read more in the :ref:`User Guide `. .. versionadded:: 1.0 From aa7d5cf0fcae5915482543c2e2a6392a68c54013 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 16:00:28 -0400 Subject: [PATCH 68/92] TST Adds more test for infrequent_categories_ --- sklearn/preprocessing/tests/test_encoders.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 401eb3a3598a7..a703481342611 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1102,8 +1102,11 @@ def test_ohe_infrequent_multiple_categories(): X_trans = ohe.fit_transform(X).toarray() assert_array_equal(ohe.infrequent_indices_[0], [1, 2]) + assert_array_equal(ohe.infrequent_categories_[0], [1, 2]) assert_array_equal(ohe.infrequent_indices_[1], [1, 3]) + assert_array_equal(ohe.infrequent_categories_[1], [1, 10]) assert_array_equal(ohe.infrequent_indices_[2], None) + assert_array_equal(ohe.infrequent_categories_[2], None) # 'infrequent' is used to denote the infrequent categories # For the first column, 1 and 2 have the same frequency. In this case, From 939123cad10cda3df96b8afb1418940d7c4ac92b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 19 Apr 2021 16:57:04 -0400 Subject: [PATCH 69/92] DOC Adds docstring for _compute_drop_idx --- sklearn/preprocessing/_encoders.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1b8ce92f827f9..8d81e07d3d868 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -461,6 +461,18 @@ def _validate_keywords(self): ) def _compute_drop_idx(self): + """Compute the drop indices associated with `self.categories_`. + + If `self.drop` is: + - `None`, returns `None`. + - `'first'`, returns all zeros to drop the first category. + - `'if_binary'`, returns zero if the category is binary and `None` + otherwise. + - array-like, returns the indices of the categories that match the + categories in `self.drop`. If the dropped category is an infrequent + category, then the index for the infrequent category is used. This + means that the entire infrequent category is dropped. + """ if self.drop is None: return None elif isinstance(self.drop, str): From 6a467ac23f88dc0e5170006bd43caaa6fb5b12a9 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 19 Apr 2021 17:36:25 -0400 Subject: [PATCH 70/92] CLN Moves _convert_to_infrequent_idx into its own method --- sklearn/preprocessing/_encoders.py | 35 ++++++++++++++++++------------ 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8d81e07d3d868..84051bbf47fd2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -460,6 +460,23 @@ def _validate_keywords(self): self.min_frequency < 1.0) ) + def _convert_to_infrequent_idx(self, feature_idx, original_idx): + """Convert `original_idx` for `feature_idx` into the + index for infrequent categories. + + If there are no infrequent categories, then `original_idx` is + returned.""" + + if not self._infrequent_enabled: + return original_idx + + default_to_infrequent = ( + self._default_to_infrequent_mappings[feature_idx] + ) + if default_to_infrequent is None: + return original_idx + return default_to_infrequent[original_idx] + def _compute_drop_idx(self): """Compute the drop indices associated with `self.categories_`. @@ -514,24 +531,14 @@ def _compute_drop_idx(self): missing_drops = [] drop_indices = [] - def _convert_to_infrequent_idx(idx, col_idx): - if not self._infrequent_enabled: - return idx - - default_to_infrequent = ( - self._default_to_infrequent_mappings[col_idx] - ) - if default_to_infrequent is None: - return idx - return default_to_infrequent[idx] - for col_idx, (val, cat_list) in enumerate(zip(self.drop, self.categories_)): if not is_scalar_nan(val): drop_idx = np.where(cat_list == val)[0] if drop_idx.size: # found drop idx drop_indices.append( - _convert_to_infrequent_idx(drop_idx[0], col_idx)) + self._convert_to_infrequent_idx(col_idx, + drop_idx[0])) else: missing_drops.append((col_idx, val)) continue @@ -540,9 +547,9 @@ def _convert_to_infrequent_idx(idx, col_idx): for cat_idx, cat in enumerate(cat_list): if is_scalar_nan(cat): drop_indices.append( - _convert_to_infrequent_idx(cat_idx, col_idx)) + self._convert_to_infrequent_idx(col_idx, cat_idx)) break - else: # loop did not break thus drop is missing + else: # no break missing_drops.append((col_idx, val)) if any(missing_drops): From f11ccff140c15062d2cc4b6c71aff27d8a476928 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 19 Apr 2021 17:59:50 -0400 Subject: [PATCH 71/92] TST Increases test coverage --- sklearn/preprocessing/_encoders.py | 14 +++++------ sklearn/preprocessing/tests/test_encoders.py | 26 +++++++++++++++++++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 84051bbf47fd2..0ff89777b238b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -531,26 +531,26 @@ def _compute_drop_idx(self): missing_drops = [] drop_indices = [] - for col_idx, (val, cat_list) in enumerate(zip(self.drop, - self.categories_)): - if not is_scalar_nan(val): - drop_idx = np.where(cat_list == val)[0] + drop_cat_itr = enumerate(zip(self.drop, self.categories_)) + for col_idx, (drop_val, cat_list) in drop_cat_itr: + if not is_scalar_nan(drop_val): + drop_idx = np.where(cat_list == drop_val)[0] if drop_idx.size: # found drop idx drop_indices.append( self._convert_to_infrequent_idx(col_idx, drop_idx[0])) else: - missing_drops.append((col_idx, val)) + missing_drops.append((col_idx, drop_val)) continue - # val is nan, find nan in categories manually + # drop_val is nan, find nan in categories manually for cat_idx, cat in enumerate(cat_list): if is_scalar_nan(cat): drop_indices.append( self._convert_to_infrequent_idx(col_idx, cat_idx)) break else: # no break - missing_drops.append((col_idx, val)) + missing_drops.append((col_idx, drop_val)) if any(missing_drops): msg = ("The following categories were supposed to be " diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index a703481342611..944bb8186d90d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1087,6 +1087,30 @@ def test_ohe_infrequent_three_levels_user_cats(): assert_array_equal(expected_inv, X_inv) +def test_ohe_infrequent_mixed(): + """Test infrequent categories where feature 0 has infrequent categories, + and feature 1 does not.""" + + # X[:, 0] 1 and 2 are infrequent + # X[:, 1] nothing is infrequent + X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], + [0, 0, 0, 0, 1, 1, 1, 1, 1]] + + ohe = OneHotEncoder(max_categories=3, drop='if_binary', sparse=False) + ohe.fit(X) + + X_test = [[3, 0], [1, 1]] + X_trans = ohe.transform(X_test) + + # feature 1 is binary so it drops a category 0 + assert_allclose(X_trans, [[0, 1, 0, 0], [0, 0, 1, 1]]) + + # dropping a infrequent category in feature 0 + ohe.set_params(drop=[1, 1]).fit(X) + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, [[0, 1, 1], [0, 0, 0]]) + + def test_ohe_infrequent_multiple_categories(): """Test infrequent categories with feature matrix with 3 features.""" @@ -1096,7 +1120,7 @@ def test_ohe_infrequent_multiple_categories(): ohe = OneHotEncoder(categories='auto', max_categories=3, handle_unknown='infrequent_if_exist') - # X[:, 0] 1 and 2 is infrequent + # X[:, 0] 1 and 2 are infrequent # X[:, 1] 1 and 10 are infrequent # X[:, 2] nothing is infrequent From fac1f2186144aca04e9b875f474e0c647b15f2c9 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 9 May 2021 23:27:44 -0400 Subject: [PATCH 72/92] TST Adds failing test --- sklearn/preprocessing/tests/test_encoders.py | 80 ++++++++++++++------ 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 944bb8186d90d..7f82c355c2575 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -896,18 +896,45 @@ def test_ohe_infrequent_two_levels(kwargs, categories): feature_names = ohe.get_feature_names() assert_array_equal(['x0_b', 'x0_infrequent'], feature_names) - # dropping the first category which is 'b' - drops = ['if_binary', 'first', ['b']] - X_test = [['b'], ['c']] - for drop in drops: - ohe.set_params(drop=drop).fit(X_train) - assert_allclose([[0], [1]], ohe.transform(X_test)) - # dropping categories that are infrequent will remove the entire category - drops = [['a'], ['c'], ['d']] - for drop in drops: - ohe.set_params(drop=drop).fit(X_train) - assert_allclose([[1], [0]], ohe.transform(X_test)) +@pytest.mark.parametrize("drop", ['if_binary', 'first', ['b']]) +def test_ohe_infrequent_two_levels_drop_frequent(drop): + """Test two levels and dropping the frequent category.""" + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, + max_categories=2, drop=drop).fit(X_train) + assert_array_equal(ohe.drop_idx_, [0]) + + X_test = np.array([['b'], ['c']]) + X_trans = ohe.transform(X_test) + assert_allclose([[0], [1]], X_trans) + + feature_names = ohe.get_feature_names() + assert_array_equal(['x0_infrequent'], feature_names) + + X_inverse = ohe.inverse_transform(X_trans) + assert_array_equal([['b'], ['infrequent']], X_inverse) + + +@pytest.mark.parametrize("drop", [['a'], ['c'], ['d']]) +def test_ohe_infrequent_two_levels_drop_infrequent(drop): + """Test two levels and dropping any infrequent category removes the + whole infrequent category.""" + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, + max_categories=2, drop=drop).fit(X_train) + + X_test = np.array([['b'], ['c']]) + X_trans = ohe.transform(X_test) + assert_allclose([[1], [0]], X_trans) + + feature_names = ohe.get_feature_names() + assert_array_equal(['x0_b'], feature_names) + + X_inverse = ohe.inverse_transform(X_trans) + assert_array_equal([['b'], ['infrequent']], X_inverse) @pytest.mark.parametrize("kwargs", [ @@ -948,18 +975,27 @@ def test_ohe_infrequent_three_levels(kwargs): feature_names = ohe.get_feature_names() assert_array_equal(['x0_b', 'x0_c', 'x0_infrequent'], feature_names) - # dropping the first category which is 'b' - drops = ['first', ['b']] - X_test = [['b'], ['c'], ['d']] - for drop in drops: - ohe.set_params(drop=drop).fit(X_train) - assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) - # dropping categories that are infrequent will remove the entire category - drops = [['a'], ['d']] - for drop in drops: - ohe.set_params(drop=drop).fit(X_train) - assert_allclose([[1, 0], [0, 1], [0, 0]], ohe.transform(X_test)) +@pytest.mark.parametrize("drop", ["first", ["b"]]) +def test_ohe_infrequent_three_levels_drop_frequent(drop): + """Test three levels and dropping the frequent category.""" + + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, + 
max_categories=3, drop=drop).fit(X_train) + + X_test = np.array([['b'], ['c'], ['d']]) + assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) + +@pytest.mark.parametrize("drop", [['a'], ['d']]) +def test_ohe_infrequent_three_levels_drop_infrequent(drop): + """Test three levels and dropping the infrequent category.""" + X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T + ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, + max_categories=3, drop=drop).fit(X_train) + + X_test = np.array([['b'], ['c'], ['d']]) + assert_allclose([[1, 0], [0, 1], [0, 0]], ohe.transform(X_test)) def test_ohe_infrequent_handle_unknown_error(): From 87a06fbcc9d5bbeaba21da2d49a4bff194ab573e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 10 May 2021 16:36:25 -0400 Subject: [PATCH 73/92] CLN Careful consideration of dropped and inverse_transform --- sklearn/preprocessing/_encoders.py | 71 ++++++++++---------- sklearn/preprocessing/tests/test_encoders.py | 1 + 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0ff89777b238b..a67afabbab0f9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -691,7 +691,7 @@ def _map_to_infrequent_categories(self, X_int, X_mask): continue X_int[:, i] = np.take(mapping, X_int[:, i]) - def _compute_transformed_categories(self, i): + def _compute_transformed_categories(self, i, remove_dropped=True): """Compute the transformed categories used for column `i`. 1. Dropped columns are removed. @@ -701,28 +701,29 @@ def _compute_transformed_categories(self, i): """ cats = self.categories_[i] - if self.drop_idx_ is not None: - if self.drop_idx_[i] is None: - return cats - return np.delete(cats, self.drop_idx_[i]) - - # drop is None - if not self._infrequent_enabled: - return cats - - # infrequent is enabled - infreq_map = self._default_to_infrequent_mappings[i] - if infreq_map is None: - return cats - - frequent_mask = infreq_map < infreq_map.max() + if self._infrequent_enabled: + infreq_map = self._default_to_infrequent_mappings[i] + if infreq_map is not None: + frequent_mask = infreq_map < infreq_map.max() - if cats.dtype.kind in 'US' and 'infrequent' in cats: - infrequent_cat = 'infrequent_sklearn' - else: - infrequent_cat = 'infrequent' - return np.concatenate((cats[frequent_mask], - np.array([infrequent_cat], dtype=object))) + if cats.dtype.kind in 'US' and 'infrequent' in cats: + infrequent_cat = 'infrequent_sklearn' + else: + infrequent_cat = 'infrequent' + # infrequent category is always at the end + cats = np.concatenate( + (cats[frequent_mask], + np.array([infrequent_cat], dtype=object))) + + if remove_dropped: + cats = self._remove_dropped_categories(cats, i) + return cats + + def _remove_dropped_categories(self, categories, i): + """Remove dropped categories.""" + if self.drop_idx_ is not None and self.drop_idx_[i] is not None: + return np.delete(categories, self.drop_idx_[i]) + return categories def _compute_n_features_outs(self): """Compute the n_features_out for each input feature.""" @@ -770,6 +771,7 @@ def fit(self, X, y=None): self._fit_infrequent_category_mapping( fit_results["n_samples"], fit_results["category_counts"]) self.drop_idx_ = self._compute_drop_idx() + self._n_features_outs = self._compute_n_features_outs() return self def fit_transform(self, X, y=None): @@ -842,10 +844,8 @@ def transform(self, X): X_int[X_int > to_drop] -= 1 X_mask &= keep_cells - n_values = 
self._compute_n_features_outs() - mask = X_mask.ravel() - feature_indices = np.cumsum([0] + n_values) + feature_indices = np.cumsum([0] + self._n_features_outs) indices = (X_int + feature_indices[:-1]).ravel()[mask] indptr = np.empty(n_samples + 1, dtype=int) @@ -892,9 +892,8 @@ def inverse_transform(self, X): n_samples, _ = X.shape n_features = len(self.categories_) - transformed_features = [self._compute_transformed_categories(i) - for i, _ in enumerate(self.categories_)] - n_features_out = sum(cats.shape[0] for cats in transformed_features) + + n_features_out = np.sum(self._n_features_outs) # validate shape of passed X msg = ("Shape of the passed X data is not correct. Expected {0} " @@ -902,6 +901,10 @@ def inverse_transform(self, X): if X.shape[1] != n_features_out: raise ValueError(msg.format(n_features_out, X.shape[1])) + transformed_features = [ + self._compute_transformed_categories(i, remove_dropped=False) + for i, _ in enumerate(self.categories_)] + # create resulting array of appropriate dtype dt = np.find_common_type([cat.dtype for cat in transformed_features], []) @@ -916,8 +919,9 @@ def inverse_transform(self, X): infrequent_indices = [None] * n_features for i in range(n_features): - cats = transformed_features[i] - n_categories = cats.shape[0] + cats_wo_dropped = self._remove_dropped_categories( + transformed_features[i], i) + n_categories = cats_wo_dropped.shape[0] # Only happens if there was a column with a unique # category. In this case we just fill the column with this @@ -929,7 +933,7 @@ def inverse_transform(self, X): sub = X[:, j:j + n_categories] # for sparse X argmax returns 2D matrix, ensure 1D array labels = np.asarray(sub.argmax(axis=1)).flatten() - X_tr[:, i] = cats[labels] + X_tr[:, i] = cats_wo_dropped[labels] if (self.handle_unknown == 'ignore' or (self.handle_unknown == 'infrequent_if_exist' and @@ -956,9 +960,8 @@ def inverse_transform(self, X): "because they contain all zeros") # we can safely assume that all of the nulls in each column # are the dropped value - X_tr[dropped, i] = self.categories_[i][ - self.drop_idx_[i] - ] + drop_idx = self.drop_idx_[i] + X_tr[dropped, i] = transformed_features[i][drop_idx] j += n_categories diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 7f82c355c2575..5b7fb2fe66c85 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -925,6 +925,7 @@ def test_ohe_infrequent_two_levels_drop_infrequent(drop): X_train = np.array([['a'] * 5 + ['b'] * 20 + ['c'] * 10 + ['d'] * 3]).T ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, max_categories=2, drop=drop).fit(X_train) + assert_array_equal(ohe.drop_idx_, [1]) X_test = np.array([['b'], ['c']]) X_trans = ohe.transform(X_test) From 49aaa237e3f2f2d1414dca701e106ec02a2d5a56 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 10 May 2021 16:41:16 -0400 Subject: [PATCH 74/92] STY Linting --- sklearn/preprocessing/tests/test_encoders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 5b7fb2fe66c85..83f79bf6611de 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -988,6 +988,7 @@ def test_ohe_infrequent_three_levels_drop_frequent(drop): X_test = np.array([['b'], ['c'], ['d']]) assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) + @pytest.mark.parametrize("drop", [['a'], ['d']]) def test_ohe_infrequent_three_levels_drop_infrequent(drop): """Test three levels and dropping the infrequent category.""" From cd3d29b5f46f4e8f3885a4b60f1499983914514a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 10 May 2021 16:51:43 -0400 Subject: [PATCH 75/92] DOC Adds docstrinb about dropping infrequent --- sklearn/preprocessing/_encoders.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a67afabbab0f9..8dd642530f52d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -330,6 +330,11 @@ class OneHotEncoder(_BaseEncoder): - ``drop_idx_ = None`` if all the transformed features will be retained. + If infrequent categories are enabled by setting `min_frequency` or + `max_categories` to a non-default value and `drop_idx[i]` corresponds + to a infrequent category, then the entire infrequent category is + dropped. + .. versionchanged:: 0.23 Added the possibility to contain `None` values. From 06397b2baf7b3479e8fd4655e398dc4edcd9e5ee Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 10 May 2021 21:54:20 -0400 Subject: [PATCH 76/92] DOC Uses only --- sklearn/preprocessing/_encoders.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 503cb3bb584cf..1a39ef79ea480 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -339,19 +339,21 @@ class OneHotEncoder(_BaseEncoder): Added the possibility to contain `None` values. infrequent_categories_ : list of ndarray - Defined if infrequent categories are enabled by setting `min_frequency` - or `max_categories` to a non-default value. `infrequent_indices_[i]` - are the infrequent categories for feature `i`. If the feature `i` has - no infrequent categories `infrequent_categories_[i]` is None. + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_indices_[i]` are the infrequent categories for feature `i`. + If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. .. versionadded:: 1.0 infrequent_indices_ : list of ndarray - Defined if infrequent categories are enabled by setting `min_frequency` - or `max_categories` to a non-default value. `infrequent_indices_[i]` is - an array of indices such that `categories_[i][infrequent_indices_[i]]` - are all the infrequent category labels. If the feature `i` has - no infrequent categories `infrequent_indices_[i]` is None. + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. 
+ `infrequent_indices_[i]` is an array of indices such that + `categories_[i][infrequent_indices_[i]]` are all the infrequent + category labels. If the feature `i` has no infrequent categories + `infrequent_indices_[i]` is None. .. versionadded:: 1.0 From 48a03eab51da13317975fb4ae56f195ddc123882 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 30 Aug 2021 11:56:17 -0400 Subject: [PATCH 77/92] DOC Numpydoc --- sklearn/preprocessing/_encoders.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 293ccba352a6a..1387c5fb728c5 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -853,9 +853,10 @@ def fit_transform(self, X, y=None): def transform(self, X): """ - Transform X using one-hot encoding. If there are infrequent categories - for a feature, the infrequent categories will be grouped into a single - category. + Transform X using one-hot encoding. + + If there are infrequent categories for a feature, the infrequent + categories will be grouped into a single category. Parameters ---------- From e36ca579b6fb83aee816c58cef13db92f2ee4b6d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 29 Nov 2021 14:19:52 -0500 Subject: [PATCH 78/92] TST Includes test for get_feature_names_out --- doc/modules/preprocessing.rst | 17 ++- sklearn/preprocessing/_encoders.py | 26 ++-- sklearn/preprocessing/tests/test_encoders.py | 127 ++++++++++--------- 3 files changed, 90 insertions(+), 80 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 08241ec1adcdd..f3167fc14944d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -714,7 +714,7 @@ categories are `min_frequency` and `max_categories`. the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality smaller than `min_frequency` will be considered infrequent. If `min_frequency` is a float, categories with a cardinality smaller than - this fraction of the total number of samples will be considered infrequent. + this fraction of the total number of samples will be considered infrequent. The default value is 1, which means every category is encoded separately. 2. `max_categories` is either `None` or any integer greater than 1. This parameter sets an upper limit to the number of output features for each @@ -746,9 +746,9 @@ be considered infrequent:: feature name:: >>> enc.get_feature_names() - array(['x0_cat', 'x0_rabbit', 'x0_infrequent'], dtype=object) + array(['x0_cat', 'x0_rabbit', 'x0_infrequent_sklearn'], dtype=object) -When this `'handle_unknown'` is set to `'infrequent_if_exist'` and an unknown +When `'handle_unknown'` is set to `'infrequent_if_exist'` and an unknown category is encountered in transform: 1. If infrequent category support was not configured or there was no @@ -757,9 +757,8 @@ category is encountered in transform: category will be denoted as `None`. 2. If there is an infrequent category during training, the unknown category - will be considered infrequent. In the inverse transform, 'infrequent' will - be used to represent the infrequent category. If `'infrequent'` is already a - category, `'infrequent_sklearn'` will be used instead. + will be considered infrequent. In the inverse transform, 'infrequent_sklearn' + will be used to represent the infrequent category. Infrequent categories can also be configured using `max_categories`. 
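The following is an illustrative aside rather than part of the patch series: a sketch of the float form of `min_frequency` that the guide text above describes, reusing the same 38-sample `X`. A float value is interpreted as a fraction of the training set, here `0.2 * 38 = 7.6`, so `'dog'` (5 samples) and `'snake'` (3 samples) fall below the threshold and are grouped as infrequent, exactly as with `min_frequency=6`::

    >>> enc = preprocessing.OneHotEncoder(min_frequency=0.2, sparse=False)
    >>> enc = enc.fit(X)
    >>> enc.transform(np.array([['dog'], ['snake']]))
    array([[0., 0., 1.],
           [0., 0., 1.]])

The `max_categories` example that follows builds on the same data.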
In the following example, we set `max_categories=2` to limit the number of features in @@ -775,6 +774,10 @@ categories - which are all the others:: [0., 1.], [0., 1.]]) +If both `max_categories` and `min_frequency` are non-default values, then +categories are selected based on `min_frequency` first and `max_categories` +categories are kept. + .. _preprocessing_discretization: Discretization @@ -1057,7 +1060,7 @@ Interestingly, a :class:`SplineTransformer` of ``degree=0`` is the same as Penalties <10.1214/ss/1038425655>`. Statist. Sci. 11 (1996), no. 2, 89--121. * Perperoglou, A., Sauerbrei, W., Abrahamowicz, M. et al. :doi:`A review of - spline function procedures in R <10.1186/s12874-019-0666-3>`. + spline function procedures in R <10.1186/s12874-019-0666-3>`. BMC Med Res Methodol 19, 46 (2019). .. _function_transformer: diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0b719e29c3257..1d338f6b44ea2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -742,8 +742,7 @@ def _compute_transformed_categories(self, i, remove_dropped=True): 1. Dropped columns are removed. 2. If there are infrequent categories, the category is named - 'infrequent'. If 'infrequent' is already a category, then then new - category is called 'infrequent_sklearn'. + 'infrequent_sklearn'. """ cats = self.categories_[i] @@ -751,11 +750,7 @@ def _compute_transformed_categories(self, i, remove_dropped=True): infreq_map = self._default_to_infrequent_mappings[i] if infreq_map is not None: frequent_mask = infreq_map < infreq_map.max() - - if cats.dtype.kind in "US" and "infrequent" in cats: - infrequent_cat = "infrequent_sklearn" - else: - infrequent_cat = "infrequent" + infrequent_cat = "infrequent_sklearn" # infrequent category is always at the end cats = np.concatenate( (cats[frequent_mask], np.array([infrequent_cat], dtype=object)) @@ -930,9 +925,7 @@ def inverse_transform(self, X): category will be its inverse. For a given input feature, if there is an infrequent category, - 'infrequent' will be used to represent the infrequent category. If - 'infrequent' is already a category, 'infrequent_sklearn' will be used - instead. + 'infrequent_sklearn' will be used to represent the infrequent category. Parameters ---------- @@ -1044,8 +1037,7 @@ def get_feature_names(self, input_features=None): """Return feature names for output features. For a given input feature, if there is an infrequent category, the most - 'infrequent' will be used as a feature name. If 'infrequent' is already - a category, 'infrequent_sklearn' will be used instead. + 'infrequent_sklearn' will be used as a feature name. Parameters ---------- @@ -1098,16 +1090,18 @@ def get_feature_names_out(self, input_features=None): Transformed feature names. 
""" check_is_fitted(self) - cats = self.categories_ input_features = _check_feature_names_in(self, input_features) + cats = [ + self._compute_transformed_categories(i) + for i, _ in enumerate(self.categories_) + ] feature_names = [] for i in range(len(cats)): names = [input_features[i] + "_" + str(t) for t in cats[i]] - if self.drop_idx_ is not None and self.drop_idx_[i] is not None: - names.pop(self.drop_idx_[i]) feature_names.extend(names) - return np.asarray(feature_names, dtype=object) + + return np.array(feature_names, dtype=object) class OrdinalEncoder(_BaseEncoder): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index d4b535a6feb97..c8238b4778841 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -947,36 +947,8 @@ def test_encoders_has_categorical_tags(Encoder): assert "categorical" in Encoder()._get_tags()["X_types"] -def test_ohe_infrequent_infrequent_is_a_cat(): - """Test category with 'infrequent' is a frequent category, ohe will name - mangle this into 'infrequent_sklearn'.""" - X_train = np.array([["a"] * 5 + ["b"] * 20 + ["infrequent"] * 10 + ["d"] * 3]).T - ohe = OneHotEncoder( - handle_unknown="infrequent_if_exist", sparse=False, max_categories=3 - ) - ohe.fit(X_train) - - X_test = [["b"], ["a"], ["infrequent"], ["d"]] - expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]) - - X_trans = ohe.transform(X_test) - assert_allclose(expected, X_trans) - - expected_inv = [ - ["b"], - ["infrequent_sklearn"], - ["infrequent"], - ["infrequent_sklearn"], - ] - X_inv = ohe.inverse_transform(X_trans) - assert_array_equal(expected_inv, X_inv) - - feature_names = ohe.get_feature_names() - assert_array_equal( - feature_names, ["x0_b", "x0_infrequent", "x0_infrequent_sklearn"] - ) - - +# TODO: Remove in 1.2 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "kwargs", [ @@ -1008,14 +980,20 @@ def test_ohe_infrequent_two_levels(kwargs, categories): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - expected_inv = [[col] for col in ["b"] + ["infrequent"] * 4] + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) feature_names = ohe.get_feature_names() - assert_array_equal(["x0_b", "x0_infrequent"], feature_names) + assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names) + + # TODO(1.2) Remove when get_feature_names is removed + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names) +# TODO: Remove in 1.2 when get_feature_names is removed. 
+@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]]) def test_ohe_infrequent_two_levels_drop_frequent(drop): """Test two levels and dropping the frequent category.""" @@ -1031,12 +1009,18 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop): assert_allclose([[0], [1]], X_trans) feature_names = ohe.get_feature_names() - assert_array_equal(["x0_infrequent"], feature_names) + assert_array_equal(["x0_infrequent_sklearn"], feature_names) + + # TODO(1.2) Remove when get_feature_names is removed + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_infrequent_sklearn"], feature_names) X_inverse = ohe.inverse_transform(X_trans) - assert_array_equal([["b"], ["infrequent"]], X_inverse) + assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) +# TODO: Remove in 1.2 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("drop", [["a"], ["c"], ["d"]]) def test_ohe_infrequent_two_levels_drop_infrequent(drop): """Test two levels and dropping any infrequent category removes the @@ -1055,10 +1039,16 @@ def test_ohe_infrequent_two_levels_drop_infrequent(drop): feature_names = ohe.get_feature_names() assert_array_equal(["x0_b"], feature_names) + # TODO(1.2) Remove when get_feature_names is removed + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b"], feature_names) + X_inverse = ohe.inverse_transform(X_trans) - assert_array_equal([["b"], ["infrequent"]], X_inverse) + assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) +# TODO: Remove in 1.2 when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "kwargs", [ @@ -1088,12 +1078,22 @@ def test_ohe_infrequent_three_levels(kwargs): X_trans = ohe.transform(X_test) assert_allclose(expected, X_trans) - expected_inv = [["b"], ["infrequent"], ["c"], ["infrequent"], ["infrequent"]] + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) feature_names = ohe.get_feature_names() - assert_array_equal(["x0_b", "x0_c", "x0_infrequent"], feature_names) + assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names) + + # TODO(1.2) Remove when get_feature_names is removed + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names) @pytest.mark.parametrize("drop", ["first", ["b"]]) @@ -1200,7 +1200,7 @@ def test_ohe_infrequent_two_levels_user_cats(): # 'infrequent' is used to denote the infrequent categories for # `inverse_transform` - expected_inv = [[col] for col in ["b"] + ["infrequent"] * 4] + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) @@ -1231,7 +1231,13 @@ def test_ohe_infrequent_three_levels_user_cats(): # 'infrequent' is used to denote the infrequent categories for # `inverse_transform` - expected_inv = [["b"], ["infrequent"], ["c"], ["infrequent"], ["infrequent"]] + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) @@ -1259,6 +1265,8 @@ def test_ohe_infrequent_mixed(): assert_allclose(X_trans, [[0, 1, 1], [0, 0, 0]]) +# TODO: Remove in 1.2 
when get_feature_names is removed. +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") def test_ohe_infrequent_multiple_categories(): """Test infrequent categories with feature matrix with 3 features.""" @@ -1286,20 +1294,21 @@ def test_ohe_infrequent_multiple_categories(): # 'infrequent' is used to denote the infrequent categories # For the first column, 1 and 2 have the same frequency. In this case, # 1 will be chosen to be the feature name because is smaller lexiconically - feature_names = ohe.get_feature_names() - assert_array_equal( - [ - "x0_0", - "x0_3", - "x0_infrequent", - "x1_0", - "x1_5", - "x1_infrequent", - "x2_0", - "x2_1", - ], - feature_names, - ) + for get_names in ["get_feature_names", "get_feature_names_out"]: + feature_names = getattr(ohe, get_names)() + assert_array_equal( + [ + "x0_0", + "x0_3", + "x0_infrequent_sklearn", + "x1_0", + "x1_5", + "x1_infrequent_sklearn", + "x2_0", + "x2_1", + ], + feature_names, + ) expected = [ [1, 0, 0, 1, 0, 0, 0, 1], @@ -1326,7 +1335,7 @@ def test_ohe_infrequent_multiple_categories(): X_inv = ohe.inverse_transform(X_test_trans) expected_inv = np.array( - [[3, "infrequent", None], ["infrequent", 0, None]], dtype=object + [[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]], dtype=object ) assert_array_equal(expected_inv, X_inv) @@ -1347,7 +1356,8 @@ def test_ohe_infrequent_multiple_categories(): X_inv = ohe.inverse_transform(X_test_trans) expected_inv = np.array( - [["infrequent", "infrequent", 1], [3, "infrequent", 0]], dtype=object + [["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]], + dtype=object, ) assert_array_equal(expected_inv, X_inv) @@ -1399,7 +1409,8 @@ def test_ohe_infrequent_multiple_categories_dtypes(): X_inv = ohe.inverse_transform(X_test_trans) expected_inv = np.array( - [["infrequent", "infrequent"], ["f", "infrequent"]], dtype=object + [["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]], + dtype=object, ) assert_array_equal(expected_inv, X_inv) @@ -1410,7 +1421,9 @@ def test_ohe_infrequent_multiple_categories_dtypes(): assert_allclose(expected, X_test_trans) X_inv = ohe.inverse_transform(X_test_trans) - expected_inv = np.array([["c", "infrequent"], ["infrequent", 5]], dtype=object) + expected_inv = np.array( + [["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object + ) assert_array_equal(expected_inv, X_inv) From 6bbc6d462c20a31537097ac7015b45615aaae9a4 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 29 Nov 2021 14:20:24 -0500 Subject: [PATCH 79/92] DOC Move whats new --- doc/whats_new/v1.0.rst | 6 ------ doc/whats_new/v1.1.rst | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index f833dc4ac1be2..a6c955b5afcdc 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -970,12 +970,6 @@ Changelog :mod:`sklearn.preprocessing` ............................ -- |Feature| :class:`preprocessing.OneHotEncoder` now supports grouping - infrequent categories into a single feature. Grouping infrequent categories - is enabled by setting `handle_unknown` to `'infrequent_if_exist'` or - `'error'` and specifying how to select infrequent categories with - `min_frequency` or `max_categories`. :pr:`16018` by `Thomas Fan`_. 
- - |Feature| The new :class:`preprocessing.SplineTransformer` is a feature preprocessing tool for the generation of B-splines, parametrized by the polynomial ``degree`` of the splines, number of knots ``n_knots`` and knot diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 7792fa14b13a5..f58786e89a913 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -233,6 +233,12 @@ Changelog :mod:`sklearn.preprocessing` ............................ +- |Feature| :class:`preprocessing.OneHotEncoder` now supports grouping + infrequent categories into a single feature. Grouping infrequent categories + is enabled by setting `handle_unknown` to `'infrequent_if_exist'` or + `'error'` and specifying how to select infrequent categories with + `min_frequency` or `max_categories`. :pr:`16018` by `Thomas Fan`_. + - |Enhancement| Adds a `subsample` parameter to :class:`preprocessing.KBinsDiscretizer`. This allows specifying a maximum number of samples to be used while fitting the model. The option is only available when `strategy` is set to `quantile`. From 23ae2e8e1f17e09ec5ef86fda50c999a5870c3a0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 1 Mar 2022 16:12:08 -0500 Subject: [PATCH 80/92] DOC Address docstring comments --- doc/modules/preprocessing.rst | 29 +++++++++++++++------ sklearn/preprocessing/_encoders.py | 42 ++++++++++++++++-------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index f3167fc14944d..dd6d0ac1df77d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -623,7 +623,8 @@ since co-linearity would cause the covariance matrix to be non-invertible:: ... ['female', 'from Europe', 'uses Firefox']] >>> drop_enc = preprocessing.OneHotEncoder(drop='first').fit(X) >>> drop_enc.categories_ - [array(['female', 'male'], dtype=object), array(['from Europe', 'from US'], dtype=object), array(['uses Firefox', 'uses Safari'], dtype=object)] + [array(['female', 'male'], dtype=object), array(['from Europe', 'from US'], dtype=object), + array(['uses Firefox', 'uses Safari'], dtype=object)] >>> drop_enc.transform(X).toarray() array([[1., 1., 1.], [0., 0., 0.]]) @@ -636,7 +637,8 @@ categories. In this case, you can set the parameter `drop='if_binary'`. ... ['female', 'Asia', 'Chrome']] >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary').fit(X) >>> drop_enc.categories_ - [array(['female', 'male'], dtype=object), array(['Asia', 'Europe', 'US'], dtype=object), array(['Chrome', 'Firefox', 'Safari'], dtype=object)] + [array(['female', 'male'], dtype=object), array(['Asia', 'Europe', 'US'], dtype=object), + array(['Chrome', 'Firefox', 'Safari'], dtype=object)] >>> drop_enc.transform(X).toarray() array([[1., 0., 0., 1., 0., 0., 1.], [0., 0., 1., 0., 0., 1., 0.], @@ -714,7 +716,8 @@ categories are `min_frequency` and `max_categories`. the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with a cardinality smaller than `min_frequency` will be considered infrequent. If `min_frequency` is a float, categories with a cardinality smaller than - this fraction of the total number of samples will be considered infrequent. The default value is 1, which means every category is encoded separately. + this fraction of the total number of samples will be considered infrequent. + The default value is 1, which means every category is encoded separately. 2. `max_categories` is either `None` or any integer greater than 1. 
This parameter sets an upper limit to the number of output features for each @@ -742,10 +745,10 @@ be considered infrequent:: >>> enc.transform(np.array([['dragon']])) array([[0., 0., 1.]]) -:meth:`OneHotEncoder.get_feature_names` uses 'infrequent' as the infrequent +:meth:`OneHotEncoder.get_feature_names_out` uses 'infrequent' as the infrequent feature name:: - >>> enc.get_feature_names() + >>> enc.get_feature_names_out() array(['x0_cat', 'x0_rabbit', 'x0_infrequent_sklearn'], dtype=object) When `'handle_unknown'` is set to `'infrequent_if_exist'` and an unknown @@ -767,8 +770,8 @@ infrequent, leading to two features, one for `'cat'` and one for infrequent categories - which are all the others:: >>> enc = preprocessing.OneHotEncoder(max_categories=2, sparse=False) - >>> env = enc.fit(X) - >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']])) + >>> enc = enc.fit(X) + >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]) array([[0., 1.], [1., 0.], [0., 1.], @@ -776,7 +779,17 @@ categories - which are all the others:: If both `max_categories` and `min_frequency` are non-default values, then categories are selected based on `min_frequency` first and `max_categories` -categories are kept. +categories are kept. In the following example, `min_frequency=4` considers +only `snake` to be infrequent, but `max_categories=3`, forces `dog` to also be +infrequent:: + + >>> enc = preprocessing.OneHotEncoder(min_frequency=4, max_categories=3, sparse=False) + >>> enc = enc.fit(X) + >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]) + array([[0., 0., 1.], + [1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) .. _preprocessing_discretization: diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1d338f6b44ea2..de849359ca6f9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -8,7 +8,7 @@ import numpy as np from scipy import sparse -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin from ..utils import check_array, is_scalar_nan from ..utils.deprecation import deprecated from ..utils.validation import check_is_fitted @@ -37,7 +37,7 @@ def _check_X(self, X, force_all_finite=True): - return list of features (arrays): this list of features is constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost - and cannot be used, eg for the `categories_` attribute. + and cannot be used, e.g. for the `categories_` attribute. """ if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2): @@ -236,12 +236,12 @@ class OneHotEncoder(_BaseEncoder): .. versionadded:: 0.20 - drop : {'first', 'if_binary'} or a array-like of shape (n_features,), \ + drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \ default=None Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data - into a neural network or an unregularized regression. + into an unregularized linear regression model. However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, @@ -273,10 +273,9 @@ class OneHotEncoder(_BaseEncoder): handle_unknown : {'error', 'ignore', 'infrequent_if_exist'}, \ default='error' - Specifies a methodology for handling unknown categories during - :meth:`transform`. 
+ Specifies the way unknown categories are handled during :meth:`transform`. - - 'error' : Raise an error if an unknown categorical feature + - 'error' : Raise an error if an unknown category feature is present during transform. - 'ignore' : When an unknown category is encountered during transform, the resulting one-hot encoded columns for this feature @@ -290,10 +289,11 @@ class OneHotEncoder(_BaseEncoder): mapped to the category denoted `'infrequent'` if it exists. If the `'infrequent'` category does not exist, then :meth:`transform` and :meth:`inverse_transform` will handle an unknown category with - `handle_unknown='ignore'`. Read more in the + `handle_unknown='ignore'`. Infrequent categories exist based on + `min_frequency` and `max_categories`. Read more in the :ref:`User Guide `. - .. versionadded:: 1.0 + .. versionchanged:: 1.1 `'infrequent_if_exist'` was added to automatically handle unknown categories and infrequent categories. @@ -307,16 +307,18 @@ class OneHotEncoder(_BaseEncoder): - If `float`, categories with a smaller cardinality than `min_frequency * n_samples` will be considered infrequent. - .. versionadded:: 1.0 + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. max_categories : int, default=None - Specifies an upper limit to the number of output features for each - input feature when considering infrequent categories. Note that - `max_categories` includes the category representing the infrequent - categories along with the frequent categories. If `None`, there is no - limit to the number of output features. + Specifies an upper limit to the number of output features for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. - .. versionadded:: 1.0 + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. Attributes ---------- @@ -740,9 +742,9 @@ def _map_to_infrequent_categories(self, X_int, X_mask): def _compute_transformed_categories(self, i, remove_dropped=True): """Compute the transformed categories used for column `i`. - 1. Dropped columns are removed. - 2. If there are infrequent categories, the category is named + 1. If there are infrequent categories, the category is named 'infrequent_sklearn'. + 2. Dropped columns are removed when remove_dropped=True. """ cats = self.categories_[i] @@ -921,7 +923,7 @@ def inverse_transform(self, X): When unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. If the - feature with the unknown category has a dropped caregory, the dropped + feature with the unknown category has a dropped category, the dropped category will be its inverse. For a given input feature, if there is an infrequent category, @@ -1104,7 +1106,7 @@ def get_feature_names_out(self, input_features=None): return np.array(feature_names, dtype=object) -class OrdinalEncoder(_BaseEncoder): +class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder): """ Encode categorical features as an integer array. From 7980c6ef44547d66827709e10710fc661e526bee Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 1 Mar 2022 23:13:04 -0500 Subject: [PATCH 81/92] DOC Docstring changes --- sklearn/preprocessing/_encoders.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index de849359ca6f9..4738445c51201 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -265,6 +265,9 @@ class OneHotEncoder(_BaseEncoder): .. versionchanged:: 0.23 The option `drop='if_binary'` was added in 0.23. + .. versionchanged:: 1.1 + Support for dropping infrequent categories. + sparse : bool, default=True Will return sparse matrix if set True else will return an array. @@ -275,8 +278,7 @@ class OneHotEncoder(_BaseEncoder): default='error' Specifies the way unknown categories are handled during :meth:`transform`. - - 'error' : Raise an error if an unknown category feature - is present during transform. + - 'error' : Raise an error if an unknown category is present during transform. - 'ignore' : When an unknown category is encountered during transform, the resulting one-hot encoded columns for this feature will be all zeros. In the inverse transform, an unknown category @@ -671,8 +673,8 @@ def _fit_infrequent_category_mapping(self, n_samples, category_counts): n_samples : int Number of samples in training set. category_counts: list of ndarray - List of counts corresponding where `category_counts[i]` are the - counts for each category in `self.categories_[i]`. + `category_counts[i]` is the category counts corresponding to + `self.categories_[i]`. """ self.infrequent_indices_ = [ self._identify_infrequent(category_count, n_samples, col_idx) From 552c9830896469c58ca5111156a1e8f55db0315f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 2 Mar 2022 15:45:29 -0500 Subject: [PATCH 82/92] TST Better comments --- sklearn/preprocessing/tests/test_encoders.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index c419dff7231cc..da897f2cb2f49 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -947,7 +947,7 @@ def test_encoders_has_categorical_tags(Encoder): assert "categorical" in Encoder()._get_tags()["X_types"] -# TODO: Remove in 1.2 when get_feature_names is removed. +# TODO(1.2): Remove filterwarning when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "kwargs", @@ -984,15 +984,15 @@ def test_ohe_infrequent_two_levels(kwargs, categories): X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) + # TODO(1.2) Remove when get_feature_names is removed feature_names = ohe.get_feature_names() assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names) - # TODO(1.2) Remove when get_feature_names is removed feature_names = ohe.get_feature_names_out() assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names) -# TODO: Remove in 1.2 when get_feature_names is removed. +# TODO(1.2): Remove filterwarning when get_feature_names is removed. 
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]]) def test_ohe_infrequent_two_levels_drop_frequent(drop): @@ -1008,10 +1008,10 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop): X_trans = ohe.transform(X_test) assert_allclose([[0], [1]], X_trans) + # TODO(1.2) Remove when get_feature_names is removed feature_names = ohe.get_feature_names() assert_array_equal(["x0_infrequent_sklearn"], feature_names) - # TODO(1.2) Remove when get_feature_names is removed feature_names = ohe.get_feature_names_out() assert_array_equal(["x0_infrequent_sklearn"], feature_names) @@ -1019,7 +1019,7 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop): assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) -# TODO: Remove in 1.2 when get_feature_names is removed. +# TODO(1.2): Remove filterwarning when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("drop", [["a"], ["c"], ["d"]]) def test_ohe_infrequent_two_levels_drop_infrequent(drop): @@ -1036,10 +1036,10 @@ def test_ohe_infrequent_two_levels_drop_infrequent(drop): X_trans = ohe.transform(X_test) assert_allclose([[1], [0]], X_trans) + # TODO(1.2): Remove get_feature_names is removed. feature_names = ohe.get_feature_names() assert_array_equal(["x0_b"], feature_names) - # TODO(1.2) Remove when get_feature_names is removed feature_names = ohe.get_feature_names_out() assert_array_equal(["x0_b"], feature_names) @@ -1047,7 +1047,7 @@ def test_ohe_infrequent_two_levels_drop_infrequent(drop): assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) -# TODO: Remove in 1.2 when get_feature_names is removed. +# TODO(1.2): Remove filterwarning when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "kwargs", @@ -1088,10 +1088,10 @@ def test_ohe_infrequent_three_levels(kwargs): X_inv = ohe.inverse_transform(X_trans) assert_array_equal(expected_inv, X_inv) + # TODO(1.2): Remove get_feature_names is removed. feature_names = ohe.get_feature_names() assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names) - # TODO(1.2) Remove when get_feature_names is removed feature_names = ohe.get_feature_names_out() assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names) @@ -1265,7 +1265,7 @@ def test_ohe_infrequent_mixed(): assert_allclose(X_trans, [[0, 1, 1], [0, 0, 0]]) -# TODO: Remove in 1.2 when get_feature_names is removed. +# TODO(1.2): Remove filterwarning when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") def test_ohe_infrequent_multiple_categories(): """Test infrequent categories with feature matrix with 3 features.""" From 4deb1057179defc554f2b06ab410c95ad9ec664a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 2 Mar 2022 16:17:14 -0500 Subject: [PATCH 83/92] TST Adds check for handle_unknown='ignore' for infrequent --- doc/whats_new/v1.1.rst | 3 +-- sklearn/preprocessing/tests/test_encoders.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 89509c62f5e49..da169d8f7edd2 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -680,8 +680,7 @@ Changelog - |Feature| :class:`preprocessing.OneHotEncoder` now supports grouping infrequent categories into a single feature. 
Grouping infrequent categories - is enabled by setting `handle_unknown` to `'infrequent_if_exist'` or - `'error'` and specifying how to select infrequent categories with + is enabled by specifying how to select infrequent categories with `min_frequency` or `max_categories`. :pr:`16018` by `Thomas Fan`_. - |Enhancement| Adds a `subsample` parameter to :class:`preprocessing.KBinsDiscretizer`. diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index da897f2cb2f49..21bcce4983ce9 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1046,6 +1046,14 @@ def test_ohe_infrequent_two_levels_drop_infrequent(drop): X_inverse = ohe.inverse_transform(X_trans) assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) + # Check handle_unknown="ignore" + ohe.set_params(handle_unknown="ignore").fit(X_train) + msg = "Found unknown categories" + with pytest.warns(UserWarning, match=msg): + X_trans = ohe.transform([["b"], ["e"]]) + + assert_allclose([[1], [0]], X_trans) + # TODO(1.2): Remove filterwarning when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @@ -1108,6 +1116,14 @@ def test_ohe_infrequent_three_levels_drop_frequent(drop): X_test = np.array([["b"], ["c"], ["d"]]) assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) + # Check handle_unknown="ignore" + ohe.set_params(handle_unknown="ignore").fit(X_train) + msg = "Found unknown categories" + with pytest.warns(UserWarning, match=msg): + X_trans = ohe.transform([["b"], ["e"]]) + + assert_allclose([[0, 0], [0, 0]], X_trans) + @pytest.mark.parametrize("drop", [["a"], ["d"]]) def test_ohe_infrequent_three_levels_drop_infrequent(drop): From ecb2a44717c59b90e8cf2b429601a403c9d27c25 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 2 Mar 2022 17:13:55 -0500 Subject: [PATCH 84/92] CLN Make _infrequent_indices private --- sklearn/preprocessing/_encoders.py | 50 ++++++++++++-------- sklearn/preprocessing/tests/test_encoders.py | 18 ++----- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 4738445c51201..8f6664623e2d7 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -290,7 +290,7 @@ class OneHotEncoder(_BaseEncoder): encoding. During inverse transform, an unknown category will be mapped to the category denoted `'infrequent'` if it exists. If the `'infrequent'` category does not exist, then :meth:`transform` and - :meth:`inverse_transform` will handle an unknown category with + :meth:`inverse_transform` will handle an unknown category as with `handle_unknown='ignore'`. Infrequent categories exist based on `min_frequency` and `max_categories`. Read more in the :ref:`User Guide `. @@ -350,21 +350,12 @@ class OneHotEncoder(_BaseEncoder): infrequent_categories_ : list of ndarray Defined only if infrequent categories are enabled by setting `min_frequency` or `max_categories` to a non-default value. - `infrequent_indices_[i]` are the infrequent categories for feature `i`. - If the feature `i` has no infrequent categories + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories `infrequent_categories_[i]` is None. .. 
versionadded:: 1.1 - infrequent_indices_ : list of ndarray - Defined only if infrequent categories are enabled by setting - `min_frequency` or `max_categories` to a non-default value. - `infrequent_indices_[i]` is an array of indices such that - `categories_[i][infrequent_indices_[i]]` are all the infrequent - category labels. If the feature `i` has no infrequent categories - `infrequent_indices_[i]` is None. - - .. versionadded:: 1.1 n_features_in_ : int Number of features seen during :term:`fit`. @@ -430,6 +421,17 @@ class OneHotEncoder(_BaseEncoder): >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray() array([[0., 1., 0., 0.], [1., 0., 1., 0.]]) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. + + >>> import numpy as np + >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T + >>> ohe = OneHotEncoder(max_categories=3, sparse=False).fit(X) + >>> ohe.infrequent_categories_ + [array(['a', 'd'], dtype=object)] + >>> ohe.transform([["a"], ["b"]]) + array([[0., 0., 1.], + [1., 0., 0.]]) """ def __init__( @@ -454,8 +456,8 @@ def __init__( @property def infrequent_categories_(self): """Infrequent categories for each feature.""" - # raises an AttributeError if `infrequent_indices_` is not defined - infrequent_indices = self.infrequent_indices_ + # raises an AttributeError if `_infrequent_indices` is not defined + infrequent_indices = self._infrequent_indices return [ None if indices is None else category[indices] for category, indices in zip(self.categories_, infrequent_indices) @@ -536,7 +538,7 @@ def _compute_drop_idx(self): elif self.drop == "if_binary": n_features_out_no_drop = [len(cat) for cat in self.categories_] if self._infrequent_enabled: - for i, infreq_idx in enumerate(self.infrequent_indices_): + for i, infreq_idx in enumerate(self._infrequent_indices): if infreq_idx is None: continue n_features_out_no_drop[i] -= infreq_idx.size - 1 @@ -668,6 +670,14 @@ def _fit_infrequent_category_mapping(self, n_samples, category_counts): to a single output: `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])` + Defines private attrite: `_infrequent_indices`. `_infrequent_indices[i]` + is an array of indices such that + `categories_[i][_infrequent_indices[i]]` are all the infrequent category + labels. If the feature `i` has no infrequent categories + `_infrequent_indices[i]` is None. + + .. versionadded:: 1.1 + Parameters ---------- n_samples : int @@ -676,7 +686,7 @@ def _fit_infrequent_category_mapping(self, n_samples, category_counts): `category_counts[i]` is the category counts corresponding to `self.categories_[i]`. 
""" - self.infrequent_indices_ = [ + self._infrequent_indices = [ self._identify_infrequent(category_count, n_samples, col_idx) for col_idx, category_count in enumerate(category_counts) ] @@ -684,7 +694,7 @@ def _fit_infrequent_category_mapping(self, n_samples, category_counts): # compute mapping from default mapping to infrequent mapping self._default_to_infrequent_mappings = [] - for cats, infreq_idx in zip(self.categories_, self.infrequent_indices_): + for cats, infreq_idx in zip(self.categories_, self._infrequent_indices): # no infrequent categories if infreq_idx is None: self._default_to_infrequent_mappings.append(None) @@ -722,7 +732,7 @@ def _map_to_infrequent_categories(self, X_int, X_mask): return for col_idx in range(X_int.shape[1]): - infrequent_idx = self.infrequent_indices_[col_idx] + infrequent_idx = self._infrequent_indices[col_idx] if infrequent_idx is None: continue @@ -784,7 +794,7 @@ def _compute_n_features_outs(self): # infrequent is enabled, the number of features out are reduced # because the infrequent categories are grouped together - for i, infreq_idx in enumerate(self.infrequent_indices_): + for i, infreq_idx in enumerate(self._infrequent_indices): if infreq_idx is None: continue output[i] -= infreq_idx.size - 1 @@ -970,7 +980,7 @@ def inverse_transform(self, X): found_unknown = {} if self._infrequent_enabled: - infrequent_indices = self.infrequent_indices_ + infrequent_indices = self._infrequent_indices else: infrequent_indices = [None] * n_features diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 21bcce4983ce9..5b5560c07b46a 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -971,7 +971,6 @@ def test_ohe_infrequent_two_levels(kwargs, categories): sparse=False, **kwargs, ).fit(X_train) - assert_array_equal(ohe.infrequent_indices_, [[0, 2, 3]]) assert_array_equal(ohe.infrequent_categories_, [["a", "c", "d"]]) X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] @@ -1077,7 +1076,6 @@ def test_ohe_infrequent_three_levels(kwargs): ohe = OneHotEncoder( handle_unknown="infrequent_if_exist", sparse=False, **kwargs ).fit(X_train) - assert_array_equal(ohe.infrequent_indices_, [[0, 3]]) assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] @@ -1145,7 +1143,7 @@ def test_ohe_infrequent_handle_unknown_error(): ohe = OneHotEncoder(handle_unknown="error", sparse=False, max_categories=3).fit( X_train ) - assert_array_equal(ohe.infrequent_indices_, [[0, 3]]) + assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) # all categories are known X_test = [["b"], ["a"], ["c"], ["d"]] @@ -1193,9 +1191,7 @@ def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): def test_ohe_infrequent_two_levels_user_cats(): - """Test that the order of the categories provided by a user is respected. 
- Specifically, the infrequent_indices_ correspond to the user provided - categories.""" + """Test that the order of the categories provided by a user is respected.""" X_train = np.array( [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object ).T @@ -1206,7 +1202,7 @@ def test_ohe_infrequent_two_levels_user_cats(): max_categories=2, ).fit(X_train) - assert_array_equal(ohe.infrequent_indices_, [[0, 1, 2]]) + assert_array_equal(ohe.infrequent_categories_, [["c", "d", "a"]]) X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) @@ -1236,7 +1232,6 @@ def test_ohe_infrequent_three_levels_user_cats(): max_categories=3, ).fit(X_train) - assert_array_equal(ohe.infrequent_indices_, [[1, 3]]) assert_array_equal(ohe.infrequent_categories_, [["d", "a"]]) X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] @@ -1300,11 +1295,8 @@ def test_ohe_infrequent_multiple_categories(): # X[:, 2] nothing is infrequent X_trans = ohe.fit_transform(X).toarray() - assert_array_equal(ohe.infrequent_indices_[0], [1, 2]) assert_array_equal(ohe.infrequent_categories_[0], [1, 2]) - assert_array_equal(ohe.infrequent_indices_[1], [1, 3]) assert_array_equal(ohe.infrequent_categories_[1], [1, 10]) - assert_array_equal(ohe.infrequent_indices_[2], None) assert_array_equal(ohe.infrequent_categories_[2], None) # 'infrequent' is used to denote the infrequent categories @@ -1400,8 +1392,8 @@ def test_ohe_infrequent_multiple_categories_dtypes(): # 0, 3, 12 will be considered infrequent X_trans = ohe.fit_transform(X).toarray() - assert_allclose(ohe.infrequent_indices_[0], [0, 1]) - assert_allclose(ohe.infrequent_indices_[1], [0, 1, 4]) + assert_array_equal(ohe.infrequent_categories_[0], ["a", "b"]) + assert_array_equal(ohe.infrequent_categories_[1], [0, 3, 12]) expected = [ [0, 0, 1, 1, 0, 0], From e7d8301a488cfc3bb4ac5d832164d4a8fc7d63bb Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 2 Mar 2022 19:28:08 -0500 Subject: [PATCH 85/92] CLN Change min_frequency default to None --- sklearn/preprocessing/_encoders.py | 26 ++++++++------------ sklearn/preprocessing/tests/test_encoders.py | 4 ++- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8f6664623e2d7..a5223bcc16819 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -299,7 +299,7 @@ class OneHotEncoder(_BaseEncoder): `'infrequent_if_exist'` was added to automatically handle unknown categories and infrequent categories. - min_frequency : int or float, default=1 + min_frequency : int or float, default=None Specifies the minimum frequency below which a category will be considered infrequent. 
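As a side note to the hunk above: after this change `min_frequency` may be an integer (an absolute count), a float (a fraction of `n_samples`), or `None` to disable the frequency threshold entirely. The following self-contained sketch restates that selection rule outside the estimator; `identify_infrequent` is an illustrative helper written for this note, not a scikit-learn API, and the `max_categories` handling is one way to reproduce the documented behaviour under the assumption that ties are broken with a stable sort::

    import numbers
    import numpy as np

    def identify_infrequent(category_count, n_samples,
                            min_frequency=None, max_categories=None):
        """Return indices of infrequent categories, or None if there are none."""
        if isinstance(min_frequency, numbers.Integral):
            infrequent_mask = category_count < min_frequency
        elif isinstance(min_frequency, numbers.Real):
            # a float is read as a fraction of the training-set size
            infrequent_mask = category_count < min_frequency * n_samples
        else:  # min_frequency=None disables the frequency threshold
            infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)

        # max_categories counts the frequent categories plus one infrequent
        # bucket, so at most (max_categories - 1) categories stay frequent
        n_current = category_count.size - infrequent_mask.sum() + 1
        if max_categories is not None and max_categories < n_current:
            order = np.argsort(category_count, kind="mergesort")  # stable sort
            n_to_group = category_count.size - (max_categories - 1)
            infrequent_mask[order[:n_to_group]] = True

        output = np.flatnonzero(infrequent_mask)
        return output if output.size > 0 else None

    counts = np.array([5, 20, 10, 3])  # counts for categories ['a', 'b', 'c', 'd']
    print(identify_infrequent(counts, n_samples=38, min_frequency=6))    # [0 3]
    print(identify_infrequent(counts, n_samples=38, min_frequency=0.5))  # [0 2 3]
    print(identify_infrequent(counts, n_samples=38, max_categories=1))   # [0 1 2 3]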
@@ -442,7 +442,7 @@ def __init__( sparse=True, dtype=np.float64, handle_unknown="error", - min_frequency=1, + min_frequency=None, max_categories=None, ): self.categories = categories @@ -482,7 +482,7 @@ def _validate_keywords(self): "1 or a float in (0.0, 1.0); got the " f"integer {self.min_frequency}" ) - else: # float + elif isinstance(self.min_frequency, numbers.Real): if not (0.0 < self.min_frequency < 1.0): raise ValueError( "min_frequency must be an integer at least " @@ -491,16 +491,8 @@ def _validate_keywords(self): ) self._infrequent_enabled = ( - (self.max_categories is not None and self.max_categories > 1) - or ( - isinstance(self.min_frequency, numbers.Integral) - and self.min_frequency > 1 - ) - or ( - isinstance(self.min_frequency, numbers.Real) - and self.min_frequency < 1.0 - ) - ) + self.max_categories is not None and self.max_categories > 1 + ) or self.min_frequency is not None def _convert_to_infrequent_idx(self, feature_idx, original_idx): """Convert `original_idx` for `feature_idx` into the @@ -636,9 +628,11 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): """ if isinstance(self.min_frequency, numbers.Integral): infrequent_mask = category_count < self.min_frequency - else: # float + elif isinstance(self.min_frequency, numbers.Real): min_frequency_abs = n_samples * self.min_frequency infrequent_mask = category_count < min_frequency_abs + else: + infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) n_current_features = category_count.size - infrequent_mask.sum() + 1 if self.max_categories is not None and self.max_categories < n_current_features: @@ -714,7 +708,7 @@ def _fit_infrequent_category_mapping(self, n_samples, category_counts): self._default_to_infrequent_mappings.append(mapping) - def _map_to_infrequent_categories(self, X_int, X_mask): + def _map_infrequent_categories(self, X_int, X_mask): """Map categories to infrequent categories. This modifies X_int in-place. Values that were invalid based on `X_mask` are mapped to the infrequent category if there was an infrequent category for that @@ -890,7 +884,7 @@ def transform(self, X): force_all_finite="allow-nan", warn_on_unknown=warn_on_unknown, ) - self._map_to_infrequent_categories(X_int, X_mask) + self._map_infrequent_categories(X_int, X_mask) n_samples, n_features = X_int.shape diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 5b5560c07b46a..6f5891b88f6d8 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1159,7 +1159,9 @@ def test_ohe_infrequent_handle_unknown_error(): ohe.transform(X_test) -@pytest.mark.parametrize("kwargs", [{"max_categories": 3}, {"min_frequency": 4}]) +@pytest.mark.parametrize( + "kwargs", [{"max_categories": 3, "min_frequency": 1}, {"min_frequency": 4}] +) def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): """'a' is the only frequent category, all other categories are infrequent.""" From 0bc1fee2140dcbd067d3508c6c1ef3722d7c3574 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 2 Mar 2022 23:39:49 -0500 Subject: [PATCH 86/92] DOC Adds comments --- sklearn/preprocessing/_encoders.py | 9 +++++---- sklearn/utils/_encode.py | 11 +++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a5223bcc16819..539d4b13a4b70 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -709,10 +709,11 @@ def _fit_infrequent_category_mapping(self, n_samples, category_counts): self._default_to_infrequent_mappings.append(mapping) def _map_infrequent_categories(self, X_int, X_mask): - """Map categories to infrequent categories. This modifies X_int - in-place. Values that were invalid based on `X_mask` are mapped to - the infrequent category if there was an infrequent category for that - feature. + """Map infrequent categories to integer representing the infrequent category. + + This modifies X_int in-place. Values that were invalid based on `X_mask` + are mapped to the infrequent category if there was an infrequent + category for that feature. Parameters ---------- diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index 0322250957005..8224cb87a4c75 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -340,10 +340,10 @@ def __missing__(self, key): def _get_counts(values, uniques): - """Get the count of each of the `uniques` in `values`. The counts will use - the order passed in by `uniques`. + """Get the count of each of the `uniques` in `values`. - For non-object dtypes, `uniques` is assumed to be sorted. + The counts will use the order passed in by `uniques`. For non-object dtypes, + `uniques` is assumed to be sorted and `np.nan` is at the end. """ if values.dtype.kind in "OU": counter = _NaNCounter(values) @@ -354,14 +354,13 @@ def _get_counts(values, uniques): return output unique_values, counts = _unique_np(values, return_counts=True) - uniques_in_values = np.isin(uniques, unique_values, assume_unique=True) - # If there are nans, they will be mapped to the end. + # Recorder unique_values based on input: `uniques` + uniques_in_values = np.isin(uniques, unique_values, assume_unique=True) if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]): uniques_in_values[-1] = True unique_valid_indices = np.searchsorted(unique_values, uniques[uniques_in_values]) - output = np.zeros_like(uniques, dtype=np.int64) output[uniques_in_values] = counts[unique_valid_indices] return output From c80229185e0598d8d9cd0cff0d0c2b16c7bc348f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 3 Mar 2022 13:47:03 -0500 Subject: [PATCH 87/92] ENH adds support for max_categories=1 --- sklearn/preprocessing/_encoders.py | 9 ++------ sklearn/preprocessing/tests/test_encoders.py | 22 ++++++++------------ 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 539d4b13a4b70..f6d5c7fcc94d5 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -472,7 +472,7 @@ def _validate_keywords(self): ) raise ValueError(msg) - if self.max_categories is not None and self.max_categories <= 1: + if self.max_categories is not None and self.max_categories < 1: raise ValueError("max_categories must be greater than 1") if isinstance(self.min_frequency, numbers.Integral): @@ -491,7 +491,7 @@ def _validate_keywords(self): ) self._infrequent_enabled = ( - self.max_categories is not None and self.max_categories > 1 + self.max_categories is not None and self.max_categories >= 1 ) or self.min_frequency is not None def _convert_to_infrequent_idx(self, feature_idx, original_idx): @@ -643,11 +643,6 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): infrequent_mask[smallest_levels] = True output = np.flatnonzero(infrequent_mask) - if output.size == category_count.size: - raise ValueError( - f"All categories in column {col_idx} are " - "infrequent, try decreasing min_frequency" - ) return output if output.size > 0 else None def _fit_infrequent_category_mapping(self, n_samples, category_counts): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 6f5891b88f6d8..e792247c846a7 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1437,18 +1437,16 @@ def test_ohe_infrequent_multiple_categories_dtypes(): assert_array_equal(expected_inv, X_inv) -@pytest.mark.parametrize("min_frequency", [21]) -def test_ohe_infrequent_one_level_errors(min_frequency): +@pytest.mark.parametrize("kwargs", [{"min_frequency": 21, "max_categories": 1}]) +def test_ohe_infrequent_one_level_errors(kwargs): """All user provided categories are infrequent.""" X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T - ohe = OneHotEncoder( - handle_unknown="infrequent_if_exist", sparse=False, min_frequency=min_frequency - ) + ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False, **kwargs) + ohe.fit(X_train) - msg = "All categories in column 0 are infrequent" - with pytest.raises(ValueError, match=msg): - ohe.fit(X_train) + X_trans = ohe.transform([["a"]]) + assert_allclose(X_trans, [[1]]) @pytest.mark.parametrize("kwargs", [{"min_frequency": 2, "max_categories": 3}]) @@ -1461,17 +1459,15 @@ def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): sparse=False, handle_unknown="infrequent_if_exist", **kwargs, - ) + ).fit(X_train) - msg = "All categories in column 0 are infrequent" - with pytest.raises(ValueError, match=msg): - ohe.fit(X_train) + X_trans = ohe.transform([["a"], ["e"]]) + assert_allclose(X_trans, [[1], [1]]) @pytest.mark.parametrize( "kwargs, error_msg", [ - ({"max_categories": 1}, "max_categories must be greater than 1"), ({"max_categories": -2}, "max_categories must be greater than 1"), ({"min_frequency": -1}, "min_frequency must be an integer at least"), ({"min_frequency": 1.1}, "min_frequency must be an integer at least"), From 10137a5ca9cba43c785b4a383127f3aaa0c5a736 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 3 Mar 2022 14:36:45 -0500 Subject: [PATCH 88/92] ENH Describe lexicon ordering for ties --- doc/modules/preprocessing.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index dd6d0ac1df77d..997bccf66782d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -728,8 +728,10 @@ In the following example, the categories, `'dog', 'snake'` are considered infrequent:: >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + - ... ['snake'] * 3]).T + ... ['snake'] * 3], dtype=object).T >>> enc = preprocessing.OneHotEncoder(min_frequency=6, sparse=False).fit(X) + >>> enc.infrequent_categories_ + [array(['dog', 'snake'], dtype=object)] >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']])) array([[0., 0., 1.], [1., 0., 0.], @@ -791,6 +793,17 @@ infrequent:: [0., 1., 0.], [0., 0., 1.]]) +If there are infrequent categories with the same cardinality at the cutoff of +`max_categories`, then then the first `max_categories` are taken based on lexicon +ordering. In the following example, "b", "c", and "d", have the same cardinality +and with `max_categories=2`, "b" and "c" are infrequent because they have a higher +lexicon order. + + >>> X = np.asarray([["a"] * 20 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10], dtype=object).T + >>> enc = preprocessing.OneHotEncoder(max_categories=3).fit(X) + >>> enc.infrequent_categories_ + [array(['b', 'c'], dtype=object)] + .. _preprocessing_discretization: Discretization From 0da2ee1b68a3c5d9767562ec623bd91f185fdbe8 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 3 Mar 2022 14:59:04 -0500 Subject: [PATCH 89/92] DOC Better docstring --- sklearn/preprocessing/_encoders.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f6d5c7fcc94d5..d39ceb1d5bb19 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -257,7 +257,8 @@ class OneHotEncoder(_BaseEncoder): should be dropped. If there are infrequent categories and `drop` selects any of the - infrequent categories, then all these categories are dropped. + infrequent categories, then the category representing the + infrequent categories is dropped. .. versionadded:: 0.21 The parameter `drop` was added in 0.21. From 07b38bd53bfb807f38cd66d8622f7335cccf83d5 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 3 Mar 2022 15:00:43 -0500 Subject: [PATCH 90/92] STY Fix --- sklearn/preprocessing/_encoders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d39ceb1d5bb19..8d121672ee0ef 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -357,7 +357,6 @@ class OneHotEncoder(_BaseEncoder): .. versionadded:: 1.1 - n_features_in_ : int Number of features seen during :term:`fit`. From cf73b27b68901c92ceed9070952d495096ac2de5 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Fri, 4 Mar 2022 13:49:45 -0500 Subject: [PATCH 91/92] CLN Error when explicity dropping an infrequent category --- sklearn/preprocessing/_encoders.py | 43 +++++++++------- sklearn/preprocessing/tests/test_encoders.py | 52 +++++--------------- 2 files changed, 35 insertions(+), 60 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8d121672ee0ef..5b966023fce2c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -256,10 +256,6 @@ class OneHotEncoder(_BaseEncoder): - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. - If there are infrequent categories and `drop` selects any of the - infrequent categories, then the category representing the - infrequent categories is dropped. - .. versionadded:: 0.21 The parameter `drop` was added in 0.21. @@ -494,20 +490,29 @@ def _validate_keywords(self): self.max_categories is not None and self.max_categories >= 1 ) or self.min_frequency is not None - def _convert_to_infrequent_idx(self, feature_idx, original_idx): - """Convert `original_idx` for `feature_idx` into the - index for infrequent categories. - - If there are no infrequent categories, then `original_idx` is - returned.""" + def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx): + """Convert `drop_idx` into the index for infrequent categories. + If there are no infrequent categories, then `drop_idx` is + returned. This method is called in `_compute_drop_idx` when the `drop` + parameter is a array-like. + """ if not self._infrequent_enabled: - return original_idx + return drop_idx default_to_infrequent = self._default_to_infrequent_mappings[feature_idx] if default_to_infrequent is None: - return original_idx - return default_to_infrequent[original_idx] + return drop_idx + + # Raise error when explicitly dropping a category that is infrequent + infrequent_indices = self._infrequent_indices[feature_idx] + if infrequent_indices is not None and drop_idx in infrequent_indices: + categories = self.categories_[feature_idx] + raise ValueError( + f"Unable to drop category {categories[drop_idx]!r} from feature" + f" {feature_idx} because it is infrequent" + ) + return default_to_infrequent[drop_idx] def _compute_drop_idx(self): """Compute the drop indices associated with `self.categories_`. 
@@ -567,28 +572,28 @@ def _compute_drop_idx(self): raise ValueError(msg.format(len(self.categories_), droplen)) missing_drops = [] drop_indices = [] - for col_idx, (drop_val, cat_list) in enumerate( + for feature_idx, (drop_val, cat_list) in enumerate( zip(drop_array, self.categories_) ): if not is_scalar_nan(drop_val): drop_idx = np.where(cat_list == drop_val)[0] if drop_idx.size: # found drop idx drop_indices.append( - self._convert_to_infrequent_idx(col_idx, drop_idx[0]) + self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0]) ) else: - missing_drops.append((col_idx, drop_val)) + missing_drops.append((feature_idx, drop_val)) continue # drop_val is nan, find nan in categories manually for cat_idx, cat in enumerate(cat_list): if is_scalar_nan(cat): drop_indices.append( - self._convert_to_infrequent_idx(col_idx, cat_idx) + self._map_drop_idx_to_infrequent(feature_idx, cat_idx) ) break - else: # no break - missing_drops.append((col_idx, drop_val)) + else: # loop did not break thus drop is missing + missing_drops.append((feature_idx, drop_val)) if any(missing_drops): msg = ( diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 5caf6ebb34b53..96bd9c2c5b9ff 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1018,40 +1018,19 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop): assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) -# TODO(1.2): Remove filterwarning when get_feature_names is removed. -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") -@pytest.mark.parametrize("drop", [["a"], ["c"], ["d"]]) -def test_ohe_infrequent_two_levels_drop_infrequent(drop): +@pytest.mark.parametrize("drop", [["a"], ["d"]]) +def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop): """Test two levels and dropping any infrequent category removes the whole infrequent category.""" X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T ohe = OneHotEncoder( handle_unknown="infrequent_if_exist", sparse=False, max_categories=2, drop=drop - ).fit(X_train) - assert_array_equal(ohe.drop_idx_, [1]) - - X_test = np.array([["b"], ["c"]]) - X_trans = ohe.transform(X_test) - assert_allclose([[1], [0]], X_trans) - - # TODO(1.2): Remove get_feature_names is removed. - feature_names = ohe.get_feature_names() - assert_array_equal(["x0_b"], feature_names) - - feature_names = ohe.get_feature_names_out() - assert_array_equal(["x0_b"], feature_names) - - X_inverse = ohe.inverse_transform(X_trans) - assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) - - # Check handle_unknown="ignore" - ohe.set_params(handle_unknown="ignore").fit(X_train) - msg = "Found unknown categories" - with pytest.warns(UserWarning, match=msg): - X_trans = ohe.transform([["b"], ["e"]]) + ) - assert_allclose([[1], [0]], X_trans) + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) # TODO(1.2): Remove filterwarning when get_feature_names is removed. 
@@ -1124,15 +1103,16 @@ def test_ohe_infrequent_three_levels_drop_frequent(drop): @pytest.mark.parametrize("drop", [["a"], ["d"]]) -def test_ohe_infrequent_three_levels_drop_infrequent(drop): +def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop): """Test three levels and dropping the infrequent category.""" X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T ohe = OneHotEncoder( handle_unknown="infrequent_if_exist", sparse=False, max_categories=3, drop=drop - ).fit(X_train) + ) - X_test = np.array([["b"], ["c"], ["d"]]) - assert_allclose([[1, 0], [0, 1], [0, 0]], ohe.transform(X_test)) + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) def test_ohe_infrequent_handle_unknown_error(): @@ -1186,11 +1166,6 @@ def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): ohe.set_params(drop=drop).fit(X_train) assert_allclose([[0], [1]], ohe.transform(X_test)) - # dropping 'c' means the infrequent category is dropped because - # 'c' in infrequent - ohe.set_params(drop=["c"]).fit(X_train) - assert_allclose([[1], [0]], ohe.transform(X_test)) - def test_ohe_infrequent_two_levels_user_cats(): """Test that the order of the categories provided by a user is respected.""" @@ -1272,11 +1247,6 @@ def test_ohe_infrequent_mixed(): # feature 1 is binary so it drops a category 0 assert_allclose(X_trans, [[0, 1, 0, 0], [0, 0, 1, 1]]) - # dropping a infrequent category in feature 0 - ohe.set_params(drop=[1, 1]).fit(X) - X_trans = ohe.transform(X_test) - assert_allclose(X_trans, [[0, 1, 1], [0, 0, 0]]) - # TODO(1.2): Remove filterwarning when get_feature_names is removed. @pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") From 66306a4a00cbc8ea7217e1ea0e3458dd9737aeb2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 4 Mar 2022 13:55:00 -0500 Subject: [PATCH 92/92] STY Grammar --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5b966023fce2c..740378645d774 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -495,7 +495,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx): If there are no infrequent categories, then `drop_idx` is returned. This method is called in `_compute_drop_idx` when the `drop` - parameter is a array-like. + parameter is an array-like. """ if not self._infrequent_enabled: return drop_idx