From 6cf152140f1989741ac58e18b39a0b0b6b19ab83 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Tue, 28 Sep 2021 09:49:20 +0200
Subject: [PATCH 01/18] MAINT Move DistanceMetric under metrics

---
 doc/glossary.rst                              |  7 +--
 doc/modules/classes.rst                       | 11 +++-
 doc/modules/density.rst                       |  2 +-
 sklearn/cluster/_agglomerative.py             |  4 +-
 sklearn/cluster/_hierarchical_fast.pyx        | 15 +++--
 sklearn/cluster/tests/test_hierarchical.py    |  5 +-
 sklearn/metrics/__init__.py                   |  3 +
 .../{neighbors => metrics}/_dist_metrics.pxd  | 16 +++---
 .../{neighbors => metrics}/_dist_metrics.pyx  | 57 +++++++++----------
 sklearn/metrics/pairwise.py                   |  2 +-
 sklearn/metrics/setup.py                      |  8 +++
 .../tests/test_dist_metrics.py                | 13 +----
 sklearn/neighbors/__init__.py                 |  2 +-
 sklearn/neighbors/_binary_tree.pxi            | 15 +++--
 sklearn/neighbors/_classification.py          |  8 +--
 sklearn/neighbors/_distance_metric.py         | 20 +++++++
 sklearn/neighbors/_graph.py                   | 18 +++---
 sklearn/neighbors/_partition_nodes.pxd        |  2 +-
 sklearn/neighbors/_unsupervised.py            |  4 +-
 sklearn/neighbors/setup.py                    | 13 -----
 sklearn/neighbors/tests/test_ball_tree.py     | 13 ++++-
 .../neighbors/tests/test_neighbors_tree.py    |  2 +-
 sklearn/{neighbors => utils}/_typedefs.pxd    |  0
 sklearn/{neighbors => utils}/_typedefs.pyx    |  0
 sklearn/utils/setup.py                        |  7 +++
 25 files changed, 142 insertions(+), 105 deletions(-)
 rename sklearn/{neighbors => metrics}/_dist_metrics.pxd (87%)
 rename sklearn/{neighbors => metrics}/_dist_metrics.pyx (95%)
 rename sklearn/{neighbors => metrics}/tests/test_dist_metrics.py (95%)
 create mode 100644 sklearn/neighbors/_distance_metric.py
 rename sklearn/{neighbors => utils}/_typedefs.pxd (100%)
 rename sklearn/{neighbors => utils}/_typedefs.pyx (100%)

diff --git a/doc/glossary.rst b/doc/glossary.rst
index 010f16a361531..2b4c6af0d1866 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -644,9 +644,8 @@ General Concepts
 
         Note that for most distance metrics, we rely on implementations from
         :mod:`scipy.spatial.distance`, but may reimplement for efficiency in
-        our context.  The :mod:`neighbors` module also duplicates some metric
-        implementations for integration with efficient binary tree search data
-        structures.
+        our context. The :class:`metrics.DistanceMetric` interface is used to implement
+        distance metrics for integration with efficient neighbors search.
 
     pd
         A shorthand for `Pandas <https://pandas.pydata.org>`_ due to the
@@ -1023,7 +1022,7 @@ such as:
 
 Further examples:
 
-* :class:`neighbors.DistanceMetric`
+* :class:`metrics.DistanceMetric`
 * :class:`gaussian_process.kernels.Kernel`
 * ``tree.Criterion``
 
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 72b67b23e8dc3..b7000bcf7cbb2 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1058,6 +1058,16 @@ further details.
 
    metrics.consensus_score
 
+Distance metrics
+----------------
+
+.. currentmodule:: sklearn
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   metrics.DistanceMetric
 
 Pairwise metrics
 ----------------
@@ -1317,7 +1327,6 @@ Model validation
    :template: class.rst
 
    neighbors.BallTree
-   neighbors.DistanceMetric
    neighbors.KDTree
    neighbors.KernelDensity
    neighbors.KNeighborsClassifier
diff --git a/doc/modules/density.rst b/doc/modules/density.rst
index 115d318183577..6440bf79ab729 100644
--- a/doc/modules/density.rst
+++ b/doc/modules/density.rst
@@ -136,7 +136,7 @@ The form of these kernels is as follows:
   :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h`
 
 The kernel density estimator can be used with any of the valid distance
-metrics (see :class:`~sklearn.neighbors.DistanceMetric` for a list of available metrics), though
+metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of available metrics), though
 the results are properly normalized only for the Euclidean metric.  One
 particularly useful metric is the
 `Haversine distance <https://en.wikipedia.org/wiki/Haversine_formula>`_
diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
index 6606f370b81eb..70b3a5028169b 100644
--- a/sklearn/cluster/_agglomerative.py
+++ b/sklearn/cluster/_agglomerative.py
@@ -16,8 +16,8 @@
 
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics.pairwise import paired_distances
-from ..neighbors import DistanceMetric
-from ..neighbors._dist_metrics import METRIC_MAPPING
+from ..metrics import DistanceMetric
+from ..metrics._dist_metrics import METRIC_MAPPING
 from ..utils import check_array
 from ..utils._fast_dict import IntFloatDict
 from ..utils.fixes import _astype_copy_false
diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx
index 2a58757ce327d..11ea3294c086a 100644
--- a/sklearn/cluster/_hierarchical_fast.pyx
+++ b/sklearn/cluster/_hierarchical_fast.pyx
@@ -13,7 +13,7 @@ ctypedef np.int8_t INT8
 
 np.import_array()
 
-from ..neighbors._dist_metrics cimport DistanceMetric
+from ..metrics._dist_metrics cimport DistanceMetric
 from ..utils._fast_dict cimport IntFloatDict
 
 # C++
@@ -236,8 +236,8 @@ def max_merge(IntFloatDict a, IntFloatDict b,
 def average_merge(IntFloatDict a, IntFloatDict b,
               np.ndarray[ITYPE_t, ndim=1] mask,
               ITYPE_t n_a, ITYPE_t n_b):
-    """Merge two IntFloatDicts with the average strategy: when the 
-    same key is present in the two dicts, the weighted average of the two 
+    """Merge two IntFloatDicts with the average strategy: when the
+    same key is present in the two dicts, the weighted average of the two
     values is used.
 
     Parameters
@@ -290,13 +290,13 @@ def average_merge(IntFloatDict a, IntFloatDict b,
 
 
 ###############################################################################
-# An edge object for fast comparisons 
+# An edge object for fast comparisons
 
 cdef class WeightedEdge:
     cdef public ITYPE_t a
     cdef public ITYPE_t b
     cdef public DTYPE_t weight
-    
+
     def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b):
         self.weight = weight
         self.a = a
@@ -326,7 +326,7 @@ cdef class WeightedEdge:
             return self.weight > other.weight
         elif op == 5:
             return self.weight >= other.weight
-        
+
     def __repr__(self):
         return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
                                               self.weight,
@@ -475,7 +475,7 @@ def mst_linkage_core(
 
     dist_metric: DistanceMetric
         A DistanceMetric object conforming to the API from
-        ``sklearn.neighbors._dist_metrics.pxd`` that will be
+        ``sklearn.metrics._dist_metrics.pxd`` that will be
         used to compute distances.
 
     Returns
@@ -534,4 +534,3 @@ def mst_linkage_core(
         current_node = new_node
 
     return np.array(result)
-
diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
index 92f92dc3736e3..3525643383c26 100644
--- a/sklearn/cluster/tests/test_hierarchical.py
+++ b/sklearn/cluster/tests/test_hierarchical.py
@@ -17,7 +17,7 @@
 from scipy.sparse.csgraph import connected_components
 
 from sklearn.metrics.cluster import adjusted_rand_score
-from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
+from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
 from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import ignore_warnings
@@ -31,6 +31,7 @@
     _fix_connectivity,
 )
 from sklearn.feature_extraction.image import grid_to_graph
+from sklearn.metrics import DistanceMetric
 from sklearn.metrics.pairwise import (
     PAIRED_DISTANCES,
     cosine_distances,
@@ -38,7 +39,7 @@
     pairwise_distances,
 )
 from sklearn.metrics.cluster import normalized_mutual_info_score
-from sklearn.neighbors import kneighbors_graph, DistanceMetric
+from sklearn.neighbors import kneighbors_graph
 from sklearn.cluster._hierarchical_fast import (
     average_merge,
     max_merge,
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 46958ea4ef7f8..e4339229c5b64 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -36,6 +36,8 @@
 from ._classification import brier_score_loss
 from ._classification import multilabel_confusion_matrix
 
+from ._dist_metrics import DistanceMetric
+
 from . import cluster
 from .cluster import adjusted_mutual_info_score
 from .cluster import adjusted_rand_score
@@ -115,6 +117,7 @@
     "davies_bouldin_score",
     "DetCurveDisplay",
     "det_curve",
+    "DistanceMetric",
     "euclidean_distances",
     "explained_variance_score",
     "f1_score",
diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd
similarity index 87%
rename from sklearn/neighbors/_dist_metrics.pxd
rename to sklearn/metrics/_dist_metrics.pxd
index 5b223f8c6d8a8..61bb4fb2fe011 100644
--- a/sklearn/neighbors/_dist_metrics.pxd
+++ b/sklearn/metrics/_dist_metrics.pxd
@@ -1,14 +1,12 @@
-#!python
-#cython: boundscheck=False
-#cython: wraparound=False
-#cython: cdivision=True
+# cython: boundscheck=False
+# cython: cdivision=True
+# cython: initializedcheck=False
+# cython: wraparound=False
 
-cimport cython
 cimport numpy as np
-from libc.math cimport fabs, sqrt, exp, cos, pow
+from libc.math cimport sqrt, exp
 
-from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t
-from ._typedefs import DTYPE, ITYPE
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t
 
 ######################################################################
 # Inline distance functions
@@ -60,7 +58,7 @@ cdef class DistanceMetric:
     cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                       ITYPE_t size) nogil except -1
 
-    cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
+    cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                        ITYPE_t size) nogil except -1
 
     cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
similarity index 95%
rename from sklearn/neighbors/_dist_metrics.pyx
rename to sklearn/metrics/_dist_metrics.pyx
index 240a7a3f7d14d..eb1512fe25aef 100755
--- a/sklearn/neighbors/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -1,8 +1,7 @@
-#!python
-#cython: boundscheck=False
-#cython: wraparound=False
-#cython: initializedcheck=False
-#cython: cdivision=True
+# cython: boundscheck=False
+# cython: cdivision=True
+# cython: initializedcheck=False
+# cython: wraparound=False
 
 # By Jake Vanderplas (2013) <jakevdp@cs.washington.edu>
 # written for the scikit-learn project
@@ -19,7 +18,7 @@ cdef extern from "arrayobject.h":
                                      int typenum, void* data)
 
 
-cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n):
+cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n):
     # Wrap a memory buffer with an ndarray. Warning: this is not robust.
     # In particular, if x is deallocated before the returned array goes
     # out of scope, this could cause memory errors.  Since there is not
@@ -33,8 +32,8 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n):
 from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
 cdef DTYPE_t INF = np.inf
 
-from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
-from ._typedefs import DTYPE, ITYPE
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
+from ..utils._typedefs import DTYPE, ITYPE
 
 
 ######################################################################
@@ -98,7 +97,7 @@ cdef class DistanceMetric:
 
     Examples
     --------
-    >>> from sklearn.neighbors import DistanceMetric
+    >>> from sklearn.metrics import DistanceMetric
     >>> dist = DistanceMetric.get_metric('euclidean')
     >>> X = [[0, 1, 2],
              [3, 4, 5]]
@@ -291,14 +290,14 @@ cdef class DistanceMetric:
 
     cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                        ITYPE_t size) nogil except -1:
-        """Compute the reduced distance between vectors x1 and x2.
+        """Compute the ranking-preserving distance between vectors x1 and x2.
 
         This can optionally be overridden in a base class.
 
-        The reduced distance is any measure that yields the same rank as the
-        distance, but is more efficient to compute.  For example, for the
-        Euclidean metric, the reduced distance is the squared-euclidean
-        distance.
+        The rank-preserving surrogate distance is any measure that yields the same
+        rank as the distance, but is more efficient to compute. For example, for the
+        Euclidean metric, the rank-preserving surrogate distance is the
+        squared-euclidean distance.
         """
         return self.dist(x1, x2, size)
 
@@ -323,25 +322,25 @@ cdef class DistanceMetric:
         return 0
 
     cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        """Convert the reduced distance to the distance"""
+        """Convert the ranking-preserving distance to the distance"""
         return rdist
 
     cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        """Convert the distance to the reduced distance"""
+        """Convert the distance to the ranking-preserving distance"""
         return dist
 
     def rdist_to_dist(self, rdist):
-        """Convert the Reduced distance to the true distance.
+        """Convert the ranking-preserving distance to the true distance.
 
-        The reduced distance, defined for some metrics, is a computationally
-        more efficient measure which preserves the rank of the true distance.
-        For example, in the Euclidean distance metric, the reduced distance
-        is the squared-euclidean distance.
+        The rank-preserving surrogate distance is any measure that yields the same
+        rank as the distance, but is more efficient to compute. For example, for the
+        Euclidean metric, the rank-preserving surrogate distance is the
+        squared-euclidean distance.
 
         Parameters
         ----------
         rdist : double
-            Reduced distance.
+            Ranking-preserving distance.
 
         Returns
         -------
@@ -351,12 +350,12 @@ cdef class DistanceMetric:
         return rdist
 
     def dist_to_rdist(self, dist):
-        """Convert the true distance to the reduced distance.
+        """Convert the true distance to the rank-preserving surrogate distance.
 
-        The reduced distance, defined for some metrics, is a computationally
-        more efficient measure which preserves the rank of the true distance.
-        For example, in the Euclidean distance metric, the reduced distance
-        is the squared-euclidean distance.
+        The rank-preserving surrogate distance is any measure that yields the same
+        rank as the distance, but is more efficient to compute. For example, for the
+        Euclidean metric, the rank-preserving surrogate distance is the
+        squared-euclidean distance.
 
         Parameters
         ----------
@@ -366,7 +365,7 @@ cdef class DistanceMetric:
         Returns
         -------
         double
-            Reduced distance.
+            Ranking-preserving distance.
         """
         return dist
 
@@ -519,7 +518,7 @@ cdef class ChebyshevDistance(DistanceMetric):
 
     Examples
     --------
-    >>> from sklearn.neighbors.dist_metrics import DistanceMetric
+    >>> from sklearn.metrics import DistanceMetric
     >>> dist = DistanceMetric.get_metric('chebyshev')
     >>> X = [[0, 1, 2],
     ...      [3, 4, 5]]
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index d493ad68603ea..51cf80614cb3c 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -780,7 +780,7 @@ def haversine_distances(X, Y=None):
     array([[    0.        , 11099.54035582],
            [11099.54035582,     0.        ]])
     """
-    from ..neighbors import DistanceMetric
+    from ..metrics import DistanceMetric
 
     return DistanceMetric.get_metric("haversine").pairwise(X, Y)
 
diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index df1a1caad17e0..69925a3590be6 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -1,4 +1,5 @@
 import os
+import numpy as np
 
 from numpy.distutils.misc_util import Configuration
 
@@ -18,6 +19,13 @@ def configuration(parent_package="", top_path=None):
         "_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries
     )
 
+    config.add_extension(
+        "_dist_metrics",
+        sources=["_dist_metrics.pyx"],
+        include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")],
+        libraries=libraries,
+    )
+
     config.add_subpackage("tests")
 
     return config
diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
similarity index 95%
rename from sklearn/neighbors/tests/test_dist_metrics.py
rename to sklearn/metrics/tests/test_dist_metrics.py
index 08298f087c216..9440abba6f848 100644
--- a/sklearn/neighbors/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -7,8 +7,7 @@
 import pytest
 
 from scipy.spatial.distance import cdist
-from sklearn.neighbors import DistanceMetric
-from sklearn.neighbors import BallTree
+from sklearn.metrics import DistanceMetric
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import create_memmap_backed_data
 from sklearn.utils.fixes import sp_version, parse_version
@@ -230,16 +229,6 @@ def test_pyfunc_metric():
     assert_array_almost_equal(D1_pkl, D2_pkl)
 
 
-def test_bad_pyfunc_metric():
-    def wrong_distance(x, y):
-        return "1"
-
-    X = np.ones((5, 2))
-    msg = "Custom distance function must accept two vectors"
-    with pytest.raises(TypeError, match=msg):
-        BallTree(X, metric=wrong_distance)
-
-
 def test_input_data_size():
     # Regression test for #6288
     # Previously, a metric requiring a particular input dimension would fail
diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py
index 8a0934eecf142..340910008f75c 100644
--- a/sklearn/neighbors/__init__.py
+++ b/sklearn/neighbors/__init__.py
@@ -5,7 +5,7 @@
 
 from ._ball_tree import BallTree
 from ._kd_tree import KDTree
-from ._dist_metrics import DistanceMetric
+from ._distance_metric import DistanceMetric
 from ._graph import kneighbors_graph, radius_neighbors_graph
 from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer
 from ._unsupervised import NearestNeighbors
diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index 9f90414994550..f25da86e2148c 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -153,11 +153,16 @@ import numpy as np
 import warnings
 from ..utils import check_array
 
-from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t
-from ._typedefs import DTYPE, ITYPE
+from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t
+from sklearn.utils._typedefs import DTYPE, ITYPE
 
-from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist,
-                             euclidean_dist_to_rdist, euclidean_rdist_to_dist)
+from ..metrics._dist_metrics cimport (
+    DistanceMetric,
+    euclidean_dist,
+    euclidean_rdist,
+    euclidean_dist_to_rdist,
+    euclidean_rdist_to_dist,
+)
 
 from ._partition_nodes cimport partition_node_indices
 
@@ -878,7 +883,7 @@ def newObj(obj):
 
 ######################################################################
 # define the reverse mapping of VALID_METRICS
-from ._dist_metrics import get_valid_metric_ids
+from sklearn.metrics._dist_metrics import get_valid_metric_ids
 VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS)
 
 
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
index ced21c7885962..08790cd1976bb 100644
--- a/sklearn/neighbors/_classification.py
+++ b/sklearn/neighbors/_classification.py
@@ -67,8 +67,8 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase):
     metric : str or callable, default='minkowski'
         The distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
@@ -348,8 +348,8 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors
     metric : str or callable, default='minkowski'
         Distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py
new file mode 100644
index 0000000000000..10d6e24139068
--- /dev/null
+++ b/sklearn/neighbors/_distance_metric.py
@@ -0,0 +1,20 @@
+# TODO: Remove this file in 1.2
+import warnings
+
+from ..metrics import DistanceMetric as _DistanceMetric
+
+
+class DistanceMetric(_DistanceMetric):
+    @classmethod
+    def _warn(cls):
+        warnings.warn(
+            "sklearn.neighbors.DistanceMetric has been moved "
+            "to sklearn.metrics.DistanceMetric in 1.0. "
+            "This import path will be removed in 1.2",
+            category=FutureWarning,
+        )
+
+    @classmethod
+    def get_metric(cls, metric, **kwargs):
+        DistanceMetric._warn()
+        return _DistanceMetric.get_metric(metric, **kwargs)
diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py
index e6fdeffe3b291..9afa37b71a808 100644
--- a/sklearn/neighbors/_graph.py
+++ b/sklearn/neighbors/_graph.py
@@ -65,10 +65,11 @@ def kneighbors_graph(
         between neighbors according to the given metric.
 
     metric : str, default='minkowski'
-        The distance metric used to calculate the k-Neighbors for each sample
-        point. The DistanceMetric class gives a list of available metrics.
-        The default distance is 'euclidean' ('minkowski' metric with the p
-        param equal to 2.)
+        The distance metric to use for the tree. The default metric is
+        minkowski, and with p=2 is equivalent to the standard Euclidean
+        metric.
+        For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
 
     p : int, default=2
         Power parameter for the Minkowski metric. When p = 1, this is
@@ -157,10 +158,11 @@ def radius_neighbors_graph(
         between neighbors according to the given metric.
 
     metric : str, default='minkowski'
-        The distance metric used to calculate the neighbors within a
-        given radius for each sample point. The DistanceMetric class
-        gives a list of available metrics. The default distance is
-        'euclidean' ('minkowski' metric with the param equal to 2.)
+        The distance metric to use for the tree. The default metric is
+        minkowski, and with p=2 is equivalent to the standard Euclidean
+        metric.
+        For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
 
     p : int, default=2
         Power parameter for the Minkowski metric. When p = 1, this is
diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd
index 522e826632824..94b02002d7a1e 100644
--- a/sklearn/neighbors/_partition_nodes.pxd
+++ b/sklearn/neighbors/_partition_nodes.pxd
@@ -1,4 +1,4 @@
-from ._typedefs cimport DTYPE_t, ITYPE_t
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t
 
 cdef int partition_node_indices(
         DTYPE_t *data,
diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py
index 6b6eec1a3112b..440ac41eb71d5 100644
--- a/sklearn/neighbors/_unsupervised.py
+++ b/sklearn/neighbors/_unsupervised.py
@@ -41,8 +41,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase):
     metric : str or callable, default='minkowski'
         The distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. For a list of available metrics, see the documentation of
+        :class:`~sklearn.metrics.DistanceMetric`.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py
index 85305efc29c78..aa19ba501b18d 100644
--- a/sklearn/neighbors/setup.py
+++ b/sklearn/neighbors/setup.py
@@ -32,19 +32,6 @@ def configuration(parent_package="", top_path=None):
         libraries=libraries,
     )
 
-    config.add_extension(
-        "_dist_metrics",
-        sources=["_dist_metrics.pyx"],
-        include_dirs=[numpy.get_include(), os.path.join(numpy.get_include(), "numpy")],
-        libraries=libraries,
-    )
-
-    config.add_extension(
-        "_typedefs",
-        sources=["_typedefs.pyx"],
-        include_dirs=[numpy.get_include()],
-        libraries=libraries,
-    )
     config.add_extension(
         "_quad_tree",
         sources=["_quad_tree.pyx"],
diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py
index c751539f2a1ae..a823a03251a1b 100644
--- a/sklearn/neighbors/tests/test_ball_tree.py
+++ b/sklearn/neighbors/tests/test_ball_tree.py
@@ -4,7 +4,6 @@
 import pytest
 from numpy.testing import assert_array_almost_equal
 from sklearn.neighbors._ball_tree import BallTree
-from sklearn.neighbors import DistanceMetric
 from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_array
 from sklearn.utils._testing import _convert_container
@@ -40,6 +39,8 @@
 
 
 def brute_force_neighbors(X, Y, k, metric, **kwargs):
+    from sklearn.metrics import DistanceMetric
+
     X, Y = check_array(X), check_array(Y)
     D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
     ind = np.argsort(D, axis=1)[:, :k]
@@ -84,3 +85,13 @@ def test_array_object_type():
     X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
     with pytest.raises(ValueError, match="setting an array element with a sequence"):
         BallTree(X)
+
+
+def test_bad_pyfunc_metric():
+    def wrong_distance(x, y):
+        return "1"
+
+    X = np.ones((5, 2))
+    msg = "Custom distance function must accept two vectors"
+    with pytest.raises(TypeError, match=msg):
+        BallTree(X, metric=wrong_distance)
diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py
index de34b4d230171..e043ffb730708 100644
--- a/sklearn/neighbors/tests/test_neighbors_tree.py
+++ b/sklearn/neighbors/tests/test_neighbors_tree.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pytest
 
-from sklearn.neighbors import DistanceMetric
+from sklearn.metrics import DistanceMetric
 from sklearn.neighbors._ball_tree import (
     BallTree,
     kernel_norm,
diff --git a/sklearn/neighbors/_typedefs.pxd b/sklearn/utils/_typedefs.pxd
similarity index 100%
rename from sklearn/neighbors/_typedefs.pxd
rename to sklearn/utils/_typedefs.pxd
diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/utils/_typedefs.pyx
similarity index 100%
rename from sklearn/neighbors/_typedefs.pyx
rename to sklearn/utils/_typedefs.pyx
diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py
index c75cbe2d86495..ed78ecc5db76f 100644
--- a/sklearn/utils/setup.py
+++ b/sklearn/utils/setup.py
@@ -88,6 +88,13 @@ def configuration(parent_package="", top_path=None):
         libraries=libraries,
     )
 
+    config.add_extension(
+        "_typedefs",
+        sources=["_typedefs.pyx"],
+        include_dirs=[numpy.get_include()],
+        libraries=libraries,
+    )
+
     config.add_subpackage("tests")
 
     return config

From 2e0fff9f1116624872d92948a126405c46694025 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Tue, 28 Sep 2021 11:21:23 +0200
Subject: [PATCH 02/18] Add whats_new entry

---
 doc/whats_new/v1.1.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 3aabed6214771..23b03d4bcb027 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -45,6 +45,15 @@ Changelog
   message when the solver does not support sparse matrices with int64 indices.
   :pr:`21093` by `Tom Dupre la Tour`_.
 
+:mod:`sklearn.metrics`
+......................
+
+- |API| :class:`metrics.DistanceMetric` has been moved from
+  :mod:`sklearn.neighbors` to :mod:`sklearn.metric`.
+  Using :class:`neighbors.DistanceMetric` for imports is still valid for
+  backward compatibility, but this interface will be removed in 1.2.
+ :pr:`21177` by :user:`Julien Jerphanion <jjerphan>`.
+
 :mod:`sklearn.utils`
 ....................
 

From ec7ca1b8fd5dcc89d628db4afc9310bb0eeff000 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Tue, 28 Sep 2021 11:39:38 +0200
Subject: [PATCH 03/18] Add a test for the deprecation cycle

---
 sklearn/neighbors/tests/test_neighbors.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 1e1f3a082786e..7c37cb55d7768 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1818,3 +1818,12 @@ def test_pairwise_deprecated(NearestNeighbors):
     msg = r"Attribute `_pairwise` was deprecated in version 0\.24"
     with pytest.warns(FutureWarning, match=msg):
         nn._pairwise
+
+
+# TODO: Remove in 1.2
+def test_neighbors_distance_metric_deprecation():
+    from sklearn.neighbors import DistanceMetric
+
+    msg = r"This import path will be removed in 1\.2"
+    with pytest.warns(FutureWarning, match=msg):
+        DistanceMetric.get_metric("euclidean")

From 4dbe6518395374757247e2c3a9fdae748622fdf2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Tue, 28 Sep 2021 13:35:45 +0200
Subject: [PATCH 04/18] Add a space to make Sphinx happy

---
 doc/whats_new/v1.1.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 23b03d4bcb027..8d9eba46a9069 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -52,7 +52,7 @@ Changelog
   :mod:`sklearn.neighbors` to :mod:`sklearn.metric`.
   Using :class:`neighbors.DistanceMetric` for imports is still valid for
   backward compatibility, but this interface will be removed in 1.2.
- :pr:`21177` by :user:`Julien Jerphanion <jjerphan>`.
+  :pr:`21177` by :user:`Julien Jerphanion <jjerphan>`.
 
 :mod:`sklearn.utils`
 ....................

From a3e03cf4802c480edf757b10e66517778801d152 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Tue, 28 Sep 2021 18:41:05 +0200
Subject: [PATCH 05/18] Apply suggestions from review

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
---
 doc/modules/density.rst                   |  6 +++---
 doc/whats_new/v1.1.rst                    |  2 +-
 sklearn/neighbors/_distance_metric.py     |  4 ++--
 sklearn/neighbors/tests/test_ball_tree.py | 10 ----------
 sklearn/neighbors/tests/test_neighbors.py |  9 ++++++---
 5 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/doc/modules/density.rst b/doc/modules/density.rst
index 6440bf79ab729..9e542b803ef68 100644
--- a/doc/modules/density.rst
+++ b/doc/modules/density.rst
@@ -136,9 +136,9 @@ The form of these kernels is as follows:
   :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h`
 
 The kernel density estimator can be used with any of the valid distance
-metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of available metrics), though
-the results are properly normalized only for the Euclidean metric.  One
-particularly useful metric is the
+metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of
+available metrics), though the results are properly normalized only
+for the Euclidean metric.  One particularly useful metric is the
 `Haversine distance <https://en.wikipedia.org/wiki/Haversine_formula>`_
 which measures the angular distance between points on a sphere.  Here
 is an example of using a kernel density estimate for a visualization
diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 8d9eba46a9069..99e1a87ca3017 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -51,7 +51,7 @@ Changelog
 - |API| :class:`metrics.DistanceMetric` has been moved from
   :mod:`sklearn.neighbors` to :mod:`sklearn.metric`.
   Using :class:`neighbors.DistanceMetric` for imports is still valid for
-  backward compatibility, but this interface will be removed in 1.2.
+  backward compatibility, but this alias will be removed in 1.3.
   :pr:`21177` by :user:`Julien Jerphanion <jjerphan>`.
 
 :mod:`sklearn.utils`
diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py
index 10d6e24139068..c973425d2e7b6 100644
--- a/sklearn/neighbors/_distance_metric.py
+++ b/sklearn/neighbors/_distance_metric.py
@@ -1,4 +1,4 @@
-# TODO: Remove this file in 1.2
+# TODO: Remove this file in 1.3
 import warnings
 
 from ..metrics import DistanceMetric as _DistanceMetric
@@ -10,7 +10,7 @@ def _warn(cls):
         warnings.warn(
             "sklearn.neighbors.DistanceMetric has been moved "
             "to sklearn.metrics.DistanceMetric in 1.0. "
-            "This import path will be removed in 1.2",
+            "This import path will be removed in 1.3",
             category=FutureWarning,
         )
 
diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py
index a823a03251a1b..41ccff25a260e 100644
--- a/sklearn/neighbors/tests/test_ball_tree.py
+++ b/sklearn/neighbors/tests/test_ball_tree.py
@@ -85,13 +85,3 @@ def test_array_object_type():
     X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
     with pytest.raises(ValueError, match="setting an array element with a sequence"):
         BallTree(X)
-
-
-def test_bad_pyfunc_metric():
-    def wrong_distance(x, y):
-        return "1"
-
-    X = np.ones((5, 2))
-    msg = "Custom distance function must accept two vectors"
-    with pytest.raises(TypeError, match=msg):
-        BallTree(X, metric=wrong_distance)
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 7c37cb55d7768..82144115ffbf3 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1820,10 +1820,13 @@ def test_pairwise_deprecated(NearestNeighbors):
         nn._pairwise
 
 
-# TODO: Remove in 1.2
+# TODO: Remove in 1.3
 def test_neighbors_distance_metric_deprecation():
     from sklearn.neighbors import DistanceMetric
+    from sklearn.metrics import DistanceMetric as ActualDistanceMetric
 
-    msg = r"This import path will be removed in 1\.2"
+    msg = r"This import path will be removed in 1\.3"
     with pytest.warns(FutureWarning, match=msg):
-        DistanceMetric.get_metric("euclidean")
+        dist_metric = DistanceMetric.get_metric("euclidean")
+
+    assert isinstance(dist_metric, ActualDistanceMetric)

From b6e54bad9f0d831a0cb285705694c6dae5729cc1 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 6 Oct 2021 14:30:54 +0200
Subject: [PATCH 06/18] DOC Fix formatting in doc/whats_new/v1.1.rst

---
 doc/whats_new/v1.1.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 4374129315395..355830aa274b8 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -61,6 +61,7 @@ Changelog
   Using :class:`neighbors.DistanceMetric` for imports is still valid for
   backward compatibility, but this alias will be removed in 1.3.
   :pr:`21177` by :user:`Julien Jerphanion <jjerphan>`.
+
 :mod:`sklearn.model_selection`
 ..............................
 

From cd3cd5d75a37ce06d7e7f2ce09bb6068bb63f3fc Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 8 Oct 2021 08:19:10 -0400
Subject: [PATCH 07/18] Fix formatting

Co-authored-by: Roman Yurchak <rth.yurchak@gmail.com>
---
 doc/whats_new/v1.1.rst            | 2 +-
 sklearn/metrics/_dist_metrics.pyx | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 355830aa274b8..5bece85ad9f54 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -58,7 +58,7 @@ Changelog
 
 - |API| :class:`metrics.DistanceMetric` has been moved from
   :mod:`sklearn.neighbors` to :mod:`sklearn.metric`.
-  Using :class:`neighbors.DistanceMetric` for imports is still valid for
+  Using `neighbors.DistanceMetric` for imports is still valid for
   backward compatibility, but this alias will be removed in 1.3.
   :pr:`21177` by :user:`Julien Jerphanion <jjerphan>`.
 
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index eb1512fe25aef..c592c1d8c5d4a 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -295,7 +295,7 @@ cdef class DistanceMetric:
         This can optionally be overridden in a base class.
 
         The rank-preserving surrogate distance is any measure that yields the same
-        rank as the distance, but is more efficient to compute. For exampke, for the
+        rank as the distance, but is more efficient to compute. For example, for the
         Euclidean metric, the rank-preserving surrogate distance is the
         squared-euclidean distance.
         """
@@ -333,7 +333,7 @@ cdef class DistanceMetric:
         """Convert the ranking-preserving distance to the true distance.
 
         The rank-preserving surrogate distance is any measure that yields the same
-        rank as the distance, but is more efficient to compute. For exampke, for the
+        rank as the distance, but is more efficient to compute. For example, for the
         Euclidean metric, the rank-preserving surrogate distance is the
         squared-euclidean distance.
 
@@ -353,7 +353,7 @@ cdef class DistanceMetric:
         """Convert the true distance to the rank-preserving surrogate distance.
 
         The rank-preserving surrogate distance is any measure that yields the same
-        rank as the distance, but is more efficient to compute. For exampke, for the
+        rank as the distance, but is more efficient to compute. For example, for the
         Euclidean metric, the rank-preserving surrogate distance is the
         squared-euclidean distance.
 

From cb8223b0f2209d71b6c099739f5cf160641c83bb Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Wed, 6 Oct 2021 15:05:57 +0200
Subject: [PATCH 08/18] FIX out of bound error in split_indices (#21130)

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
---
 doc/whats_new/v1.1.rst                        |  9 ++++++-
 .../_hist_gradient_boosting/splitting.pyx     | 26 ++++++++++++++-----
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 5bece85ad9f54..1a9f773ce08df 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -38,7 +38,6 @@ Changelog
     :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
     where 123456 is the *pull request* number, not the issue number.
 
-
 :mod:`sklearn.calibration`
 ..........................
 
@@ -46,6 +45,14 @@ Changelog
   `pos_label` to specify the positive class label.
   :pr:`21032` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+:mod:`sklearn.ensemble`
+.......................
+
+- |Fix| Fixed a bug that could produce a segfault in rare cases for
+  :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`ensemble.HistGradientBoostingRegressor`.
+  :pr:`21130` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
index 08ae7aaf0862c..232cf094876cb 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx
@@ -388,11 +388,25 @@ cdef class Splitter:
                     &left_indices_buffer[offset_in_buffers[thread_idx]],
                     sizeof(unsigned int) * left_counts[thread_idx]
                 )
-                memcpy(
-                    &sample_indices[right_offset[thread_idx]],
-                    &right_indices_buffer[offset_in_buffers[thread_idx]],
-                    sizeof(unsigned int) * right_counts[thread_idx]
-                )
+                if right_counts[thread_idx] > 0:
+                    # If we're splitting the rightmost node of the tree, i.e. the
+                    # rightmost node in the partition array, and if n_threads >= 2, one
+                    # might have right_counts[-1] = 0 and right_offset[-1] = len(sample_indices)
+                    # leading to evaluating
+                    #
+                    #    &sample_indices[right_offset[-1]] = &sample_indices[n_samples_at_node]
+                    #                                      = &partition[n_samples_in_tree]
+                    #
+                    # which is an out-of-bounds read access that can cause a segmentation fault.
+                    # When boundscheck=True, removing this check produces this exception:
+                    #
+                    #    IndexError: Out of bounds on buffer access
+                    #
+                    memcpy(
+                        &sample_indices[right_offset[thread_idx]],
+                        &right_indices_buffer[offset_in_buffers[thread_idx]],
+                        sizeof(unsigned int) * right_counts[thread_idx]
+                    )
 
         return (sample_indices[:right_child_position],
                 sample_indices[right_child_position:],
@@ -839,7 +853,7 @@ cdef class Splitter:
         # other category. The low-support categories will always be mapped to
         # the right child. We scan the sorted categories array from left to
         # right and from right to left, and we stop at the middle.
-        
+
         # Considering ordered categories A B C D, with E being a low-support
         # category: A B C D
         #              ^

From 7cc80df0671851fbda57ed85ad33950d325974d2 Mon Sep 17 00:00:00 2001
From: Helder Geovane Gomes de Lima <he7d3r@gmail.com>
Date: Wed, 6 Oct 2021 11:21:15 -0300
Subject: [PATCH 09/18] DOC Remove unused import from example (#21253)

---
 sklearn/feature_extraction/text.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index a0b74a60dab4d..8dd743813fa27 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1557,7 +1557,6 @@ class TfidfTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
     >>> from sklearn.feature_extraction.text import TfidfTransformer
     >>> from sklearn.feature_extraction.text import CountVectorizer
     >>> from sklearn.pipeline import Pipeline
-    >>> import numpy as np
     >>> corpus = ['this is the first document',
     ...           'this document is the second document',
     ...           'and this is the third one',

From b1223e7e4598b77b093730ab04d43ef13b008096 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 7 Oct 2021 04:13:00 -0400
Subject: [PATCH 10/18] MAINT Enable and run black on examples (#20502)

---
 .../applications/plot_digits_denoising.py     |  29 ++-
 .../applications/plot_face_recognition.py     |  38 ++--
 .../plot_model_complexity_influence.py        | 161 +++++++------
 .../plot_out_of_core_classification.py        | 169 +++++++-------
 .../plot_outlier_detection_wine.py            |  71 +++---
 .../applications/plot_prediction_latency.py   | 211 ++++++++++--------
 .../plot_species_distribution_modeling.py     |  91 +++++---
 examples/applications/plot_stock_market.py    | 209 +++++++++--------
 .../plot_tomography_l1_reconstruction.py      |  32 ++-
 .../plot_topics_extraction_with_nmf_lda.py    |  97 ++++----
 examples/applications/svm_gui.py              | 140 ++++++++----
 .../wikipedia_principal_eigenvector.py        |  18 +-
 .../bicluster/plot_bicluster_newsgroups.py    |  86 ++++---
 .../bicluster/plot_spectral_biclustering.py   |  17 +-
 .../bicluster/plot_spectral_coclustering.py   |   7 +-
 examples/calibration/plot_calibration.py      |  68 +++---
 .../calibration/plot_calibration_curve.py     |  90 +++++---
 .../plot_calibration_multiclass.py            | 155 ++++++++-----
 .../calibration/plot_compare_calibration.py   |  38 ++--
 .../plot_classification_probability.py        |  39 ++--
 .../plot_classifier_comparison.py             |  84 ++++---
 .../plot_digits_classification.py             |  17 +-
 examples/classification/plot_lda.py           |  58 +++--
 examples/classification/plot_lda_qda.py       | 116 ++++++----
 .../plot_adjusted_for_chance_measures.py      |  51 +++--
 examples/cluster/plot_affinity_propagation.py |  40 ++--
 .../cluster/plot_agglomerative_clustering.py  |  36 +--
 .../plot_agglomerative_clustering_metrics.py  |  66 +++---
 .../cluster/plot_agglomerative_dendrogram.py  |   9 +-
 .../cluster/plot_birch_vs_minibatchkmeans.py  |  43 ++--
 examples/cluster/plot_cluster_comparison.py   | 195 ++++++++++------
 examples/cluster/plot_cluster_iris.py         |  54 ++---
 examples/cluster/plot_coin_segmentation.py    |  19 +-
 .../cluster/plot_coin_ward_segmentation.py    |  20 +-
 examples/cluster/plot_color_quantization.py   |  16 +-
 examples/cluster/plot_dbscan.py               |  50 +++--
 examples/cluster/plot_dict_face_patches.py    |  19 +-
 examples/cluster/plot_digits_agglomeration.py |  23 +-
 examples/cluster/plot_digits_linkage.py       |  20 +-
 examples/cluster/plot_face_compress.py        |  11 +-
 ...e_agglomeration_vs_univariate_selection.py |  21 +-
 examples/cluster/plot_kmeans_assumptions.py   |   9 +-
 examples/cluster/plot_kmeans_digits.py        |  58 +++--
 examples/cluster/plot_kmeans_plusplus.py      |  17 +-
 .../plot_kmeans_silhouette_analysis.py        |  66 ++++--
 .../plot_kmeans_stability_low_dim_dense.py    |  54 +++--
 examples/cluster/plot_linkage_comparison.py   |  96 +++++---
 examples/cluster/plot_mean_shift.py           |  16 +-
 examples/cluster/plot_mini_batch_kmeans.py    |  65 +++---
 examples/cluster/plot_optics.py               |  60 ++---
 examples/cluster/plot_segmentation_toy.py     |   8 +-
 .../plot_ward_structured_vs_unstructured.py   |  36 ++-
 examples/compose/plot_column_transformer.py   | 126 ++++++-----
 .../plot_column_transformer_mixed_types.py    |  69 +++---
 examples/compose/plot_compare_reduction.py    |  48 ++--
 examples/compose/plot_digits_pipe.py          |  38 ++--
 examples/compose/plot_feature_union.py        |   8 +-
 examples/compose/plot_transformed_target.py   | 137 +++++++-----
 .../covariance/plot_covariance_estimation.py  |  53 +++--
 examples/covariance/plot_lw_vs_oas.py         |  45 +++-
 .../covariance/plot_mahalanobis_distances.py  |  86 ++++---
 .../plot_robust_vs_empirical_covariance.py    |  90 +++++---
 examples/covariance/plot_sparse_cov.py        |  62 ++---
 .../plot_compare_cross_decomposition.py       |  56 ++---
 .../cross_decomposition/plot_pcr_vs_pls.py    |  55 +++--
 examples/datasets/plot_digits_last_image.py   |   2 +-
 examples/datasets/plot_iris_dataset.py        |  22 +-
 examples/datasets/plot_random_dataset.py      |  49 ++--
 .../plot_random_multilabel_dataset.py         |  77 ++++---
 .../decomposition/plot_beta_divergence.py     |   4 +-
 .../decomposition/plot_faces_decomposition.py | 206 ++++++++++-------
 .../plot_ica_blind_source_separation.py       |  12 +-
 examples/decomposition/plot_ica_vs_pca.py     |  36 +--
 .../decomposition/plot_image_denoising.py     |  75 ++++---
 .../decomposition/plot_incremental_pca.py     |  14 +-
 examples/decomposition/plot_kernel_pca.py     |  36 ++-
 examples/decomposition/plot_pca_3d.py         |  13 +-
 examples/decomposition/plot_pca_iris.py       |  20 +-
 .../plot_pca_vs_fa_model_selection.py         |  68 +++---
 examples/decomposition/plot_pca_vs_lda.py     |  26 ++-
 examples/decomposition/plot_sparse_coding.py  |  79 ++++---
 examples/decomposition/plot_varimax_fa.py     |   8 +-
 .../ensemble/plot_adaboost_hastie_10_2.py     |  56 +++--
 examples/ensemble/plot_adaboost_multiclass.py |  68 +++---
 examples/ensemble/plot_adaboost_regression.py |   5 +-
 examples/ensemble/plot_adaboost_twoclass.py   |  70 +++---
 examples/ensemble/plot_bias_variance.py       |  39 ++--
 examples/ensemble/plot_ensemble_oob.py        |  49 ++--
 examples/ensemble/plot_forest_importances.py  |  29 ++-
 .../ensemble/plot_forest_importances_faces.py |   6 +-
 examples/ensemble/plot_forest_iris.py         |  63 +++---
 .../plot_gradient_boosting_categorical.py     |  88 +++++---
 .../plot_gradient_boosting_early_stopping.py  |  63 +++---
 .../ensemble/plot_gradient_boosting_oob.py    |  37 +--
 .../plot_gradient_boosting_quantile.py        |  98 ++++----
 .../plot_gradient_boosting_regression.py      |  56 +++--
 .../plot_gradient_boosting_regularization.py  |  53 +++--
 examples/ensemble/plot_isolation_forest.py    |  20 +-
 .../ensemble/plot_monotonic_constraints.py    |   4 +-
 .../ensemble/plot_random_forest_embedding.py  |  19 +-
 ...ot_random_forest_regression_multioutput.py |  51 +++--
 examples/ensemble/plot_stack_predictors.py    | 119 ++++++----
 .../ensemble/plot_voting_decision_regions.py  |  27 +--
 examples/ensemble/plot_voting_probas.py       |  50 +++--
 examples/ensemble/plot_voting_regressor.py    |  21 +-
 examples/exercises/plot_cv_diabetes.py        |  26 ++-
 examples/exercises/plot_cv_digits.py          |  11 +-
 .../plot_digits_classification_exercise.py    |  16 +-
 examples/exercises/plot_iris_exercise.py      |  32 ++-
 .../feature_selection/plot_f_test_vs_mi.py    |   5 +-
 .../plot_feature_selection.py                 |  43 ++--
 .../plot_feature_selection_pipeline.py        |  12 +-
 .../plot_rfe_with_cross_validation.py         |  30 ++-
 .../plot_select_from_model_diabetes.py        |  25 ++-
 .../gaussian_process/plot_compare_gpr_krr.py  |  41 ++--
 examples/gaussian_process/plot_gpc.py         |  82 ++++---
 examples/gaussian_process/plot_gpc_iris.py    |  17 +-
 .../plot_gpc_isoprobability.py                |  57 +++--
 examples/gaussian_process/plot_gpc_xor.py     |  30 +--
 examples/gaussian_process/plot_gpr_co2.py     |  62 ++---
 examples/gaussian_process/plot_gpr_noisy.py   |  74 +++---
 .../plot_gpr_noisy_targets.py                 |  53 +++--
 .../plot_gpr_on_structured_data.py            | 112 ++++++----
 ...t_iterative_imputer_variants_comparison.py |  35 ++-
 examples/impute/plot_missing_values.py        | 129 ++++++-----
 .../inspection/plot_permutation_importance.py |  62 ++---
 ...t_permutation_importance_multicollinear.py |  32 +--
 .../plot_scalable_poly_kernels.py             |  99 +++++---
 examples/linear_model/plot_ard.py             |  37 +--
 examples/linear_model/plot_bayesian_ridge.py  |  41 ++--
 .../plot_bayesian_ridge_curvefit.py           |  22 +-
 ...puted_gram_matrix_with_weighted_samples.py |   2 +-
 examples/linear_model/plot_huber_vs_ridge.py  |  19 +-
 examples/linear_model/plot_iris_logistic.py   |  12 +-
 .../linear_model/plot_lasso_and_elasticnet.py |  44 ++--
 .../plot_lasso_dense_vs_sparse_data.py        |  12 +-
 examples/linear_model/plot_lasso_lars.py      |  12 +-
 .../plot_lasso_model_selection.py             |  84 ++++---
 examples/linear_model/plot_logistic.py        |  21 +-
 .../plot_logistic_l1_l2_sparsity.py           |  34 +--
 .../linear_model/plot_logistic_multinomial.py |  23 +-
 examples/linear_model/plot_logistic_path.py   |  24 +-
 .../plot_multi_task_lasso_support.py          |  38 ++--
 examples/linear_model/plot_nnls.py            |   4 +-
 examples/linear_model/plot_ols.py             |  12 +-
 examples/linear_model/plot_ols_3d.py          |  26 ++-
 .../linear_model/plot_ols_ridge_variance.py   |  23 +-
 examples/linear_model/plot_omp.py             |  26 +--
 ...plot_poisson_regression_non_normal_loss.py | 203 +++++++++--------
 .../plot_polynomial_interpolation.py          |  37 +--
 .../linear_model/plot_quantile_regression.py  |  20 +-
 examples/linear_model/plot_ransac.py          |  34 ++-
 examples/linear_model/plot_ridge_coeffs.py    |  25 ++-
 examples/linear_model/plot_ridge_path.py      |  12 +-
 examples/linear_model/plot_robust_fit.py      |  58 +++--
 examples/linear_model/plot_sgd_comparison.py  |  21 +-
 .../linear_model/plot_sgd_early_stopping.py   |  59 ++---
 examples/linear_model/plot_sgd_iris.py        |  23 +-
 .../linear_model/plot_sgd_loss_functions.py   |  33 +--
 examples/linear_model/plot_sgd_penalties.py   |  30 +--
 .../plot_sgd_separating_hyperplane.py         |   9 +-
 .../linear_model/plot_sgd_weighted_samples.py |  22 +-
 .../linear_model/plot_sgdocsvm_vs_ocsvm.py    | 105 +++++----
 ...sparse_logistic_regression_20newsgroups.py |  85 +++----
 .../plot_sparse_logistic_regression_mnist.py  |  24 +-
 examples/linear_model/plot_theilsen.py        |  48 ++--
 ...lot_tweedie_regression_insurance_claims.py | 149 +++++++------
 examples/manifold/plot_compare_methods.py     |  39 ++--
 examples/manifold/plot_manifold_sphere.py     |  57 +++--
 examples/manifold/plot_mds.py                 |  45 ++--
 examples/manifold/plot_swissroll.py           |  11 +-
 examples/manifold/plot_t_sne_perplexity.py    |  35 +--
 .../miscellaneous/plot_anomaly_comparison.py  |  81 ++++---
 .../plot_changed_only_pprint_parameter.py     |   6 +-
 .../plot_display_object_visualization.py      |   5 +-
 .../miscellaneous/plot_isotonic_regression.py |  16 +-
 .../plot_johnson_lindenstrauss_bound.py       |  47 ++--
 .../plot_kernel_approximation.py              |  91 ++++----
 .../plot_kernel_ridge_regression.py           | 122 ++++++----
 examples/miscellaneous/plot_multilabel.py     |  60 +++--
 .../plot_multioutput_face_completion.py       |  37 +--
 ...ot_partial_dependence_visualization_api.py |  25 ++-
 examples/mixture/plot_concentration_prior.py  | 125 +++++++----
 examples/mixture/plot_gmm.py                  |  44 ++--
 examples/mixture/plot_gmm_covariances.py      |  53 +++--
 examples/mixture/plot_gmm_pdf.py              |  21 +-
 examples/mixture/plot_gmm_selection.py        |  67 +++---
 examples/mixture/plot_gmm_sin.py              | 119 ++++++----
 .../grid_search_text_feature_extraction.py    |  32 +--
 .../model_selection/plot_confusion_matrix.py  |  16 +-
 examples/model_selection/plot_cv_indices.py   | 122 +++++++---
 examples/model_selection/plot_cv_predict.py   |   6 +-
 .../plot_grid_search_digits.py                |  25 +--
 .../plot_grid_search_refit_callable.py        |  62 ++---
 .../model_selection/plot_grid_search_stats.py | 114 +++++-----
 .../model_selection/plot_learning_curve.py    |  86 ++++---
 .../plot_multi_metric_evaluation.py           |  71 +++---
 .../plot_nested_cross_validation_iris.py      |  38 ++--
 .../model_selection/plot_randomized_search.py |  49 ++--
 examples/model_selection/plot_roc.py          | 105 +++++----
 .../plot_successive_halving_heatmap.py        |  58 ++---
 .../plot_successive_halving_iterations.py     |  40 ++--
 .../plot_train_error_vs_test_error.py         |  25 ++-
 .../plot_underfitting_overfitting.py          |  25 ++-
 .../model_selection/plot_validation_curve.py  |  42 +++-
 .../plot_classifier_chain_yeast.py            |  64 +++---
 .../approximate_nearest_neighbors.py          | 155 +++++++------
 .../plot_caching_nearest_neighbors.py         |  31 +--
 examples/neighbors/plot_classification.py     |  26 ++-
 .../neighbors/plot_digits_kde_sampling.py     |  14 +-
 examples/neighbors/plot_kde_1d.py             |  75 ++++---
 .../neighbors/plot_lof_novelty_detection.py   |  33 +--
 .../neighbors/plot_lof_outlier_detection.py   |  16 +-
 examples/neighbors/plot_nca_classification.py |  61 +++--
 examples/neighbors/plot_nca_dim_reduction.py  |  32 ++-
 examples/neighbors/plot_nca_illustration.py   |  34 +--
 examples/neighbors/plot_nearest_centroid.py   |  19 +-
 examples/neighbors/plot_regression.py         |  11 +-
 examples/neighbors/plot_species_kde.py        |  40 ++--
 examples/neural_networks/plot_mlp_alpha.py    |  78 ++++---
 .../plot_mlp_training_curves.py               | 114 +++++++---
 .../neural_networks/plot_mnist_filters.py     |  22 +-
 .../plot_rbm_logistic_classification.py       |  58 +++--
 examples/preprocessing/plot_all_scaling.py    | 145 +++++++-----
 examples/preprocessing/plot_discretization.py |  37 +--
 .../plot_discretization_classification.py     | 142 +++++++-----
 .../plot_discretization_strategies.py         |  41 ++--
 .../preprocessing/plot_map_data_to_normal.py  |  56 ++---
 .../preprocessing/plot_scaling_importance.py  |  73 +++---
 .../plot_release_highlights_0_22_0.py         |  53 ++---
 .../plot_release_highlights_0_23_0.py         |  47 ++--
 .../plot_release_highlights_0_24_0.py         |  60 +++--
 .../plot_label_propagation_digits.py          |  15 +-
 ...abel_propagation_digits_active_learning.py |  48 ++--
 .../plot_label_propagation_structure.py       |  61 +++--
 .../plot_self_training_varying_threshold.py   |  47 ++--
 .../plot_semi_supervised_newsgroups.py        |  63 +++---
 .../plot_semi_supervised_versus_svm_iris.py   |  37 +--
 examples/svm/plot_custom_kernel.py            |   9 +-
 examples/svm/plot_iris_svc.py                 |  34 +--
 .../svm/plot_linearsvc_support_vectors.py     |  29 ++-
 examples/svm/plot_oneclass.py                 |  36 +--
 examples/svm/plot_rbf_parameters.py           |  40 ++--
 examples/svm/plot_separating_hyperplane.py    |  17 +-
 .../plot_separating_hyperplane_unbalanced.py  |  28 ++-
 examples/svm/plot_svm_anova.py                |  19 +-
 examples/svm/plot_svm_kernels.py              |  65 +++---
 examples/svm/plot_svm_margin.py               |  36 +--
 examples/svm/plot_svm_nonlinear.py            |  22 +-
 examples/svm/plot_svm_regression.py           |  58 +++--
 examples/svm/plot_svm_scale_c.py              |  70 +++---
 examples/svm/plot_svm_tie_breaking.py         |  16 +-
 examples/svm/plot_weighted_samples.py         |  21 +-
 ...ot_document_classification_20newsgroups.py | 207 +++++++++--------
 examples/text/plot_document_clustering.py     | 143 +++++++-----
 .../text/plot_hashing_vs_dict_vectorizer.py   |  19 +-
 examples/tree/plot_cost_complexity_pruning.py |  19 +-
 examples/tree/plot_iris_dtc.py                |  21 +-
 examples/tree/plot_tree_regression.py         |   6 +-
 .../tree/plot_tree_regression_multioutput.py  |  23 +-
 examples/tree/plot_unveil_tree_structure.py   |  75 ++++---
 pyproject.toml                                |   1 -
 262 files changed, 7852 insertions(+), 5520 deletions(-)

diff --git a/examples/applications/plot_digits_denoising.py b/examples/applications/plot_digits_denoising.py
index 426a8c61111c0..004292cdbb762 100644
--- a/examples/applications/plot_digits_denoising.py
+++ b/examples/applications/plot_digits_denoising.py
@@ -87,9 +87,9 @@ def plot_digits(X, title):
 # Let's first have a look to see the difference between noise-free and noisy
 # images. We will check the test set in this regard.
 plot_digits(X_test, "Uncorrupted test images")
-plot_digits(X_test_noisy,
-            f"Noisy test images\n"
-            f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}")
+plot_digits(
+    X_test_noisy, f"Noisy test images\nMSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}"
+)
 
 # %%
 # Learn the `PCA` basis
@@ -100,8 +100,9 @@ def plot_digits(X, title):
 from sklearn.decomposition import PCA, KernelPCA
 
 pca = PCA(n_components=32)
-kernel_pca = KernelPCA(n_components=400, kernel="rbf", gamma=1e-3,
-                       fit_inverse_transform=True, alpha=5e-3)
+kernel_pca = KernelPCA(
+    n_components=400, kernel="rbf", gamma=1e-3, fit_inverse_transform=True, alpha=5e-3
+)
 
 pca.fit(X_train_noisy)
 _ = kernel_pca.fit(X_train_noisy)
@@ -118,17 +119,21 @@ def plot_digits(X, title):
 # kernel to learn the PCA basis and a kernel ridge to learn the mapping
 # function.
 X_reconstructed_kernel_pca = kernel_pca.inverse_transform(
-    kernel_pca.transform(X_test_noisy))
+    kernel_pca.transform(X_test_noisy)
+)
 X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy))
 
 # %%
 plot_digits(X_test, "Uncorrupted test images")
-plot_digits(X_reconstructed_pca,
-            f"PCA reconstruction\n"
-            f"MSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}")
-plot_digits(X_reconstructed_kernel_pca,
-            f"Kernel PCA reconstruction\n"
-            f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}")
+plot_digits(
+    X_reconstructed_pca,
+    f"PCA reconstruction\nMSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}",
+)
+plot_digits(
+    X_reconstructed_kernel_pca,
+    "Kernel PCA reconstruction\n"
+    f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}",
+)
 
 # %%
 # PCA has a lower MSE than kernel PCA. However, the qualitative analysis might
diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py
index 41ef0ca0edde6..7e5d05102fa0c 100644
--- a/examples/applications/plot_face_recognition.py
+++ b/examples/applications/plot_face_recognition.py
@@ -43,7 +43,7 @@
 print(__doc__)
 
 # Display progress logs on stdout
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
 
 
 # #############################################################################
@@ -75,7 +75,8 @@
 
 # split into a training and testing set
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=0.25, random_state=42)
+    X, y, test_size=0.25, random_state=42
+)
 
 
 # #############################################################################
@@ -83,11 +84,11 @@
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150
 
-print("Extracting the top %d eigenfaces from %d faces"
-      % (n_components, X_train.shape[0]))
+print(
+    "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
+)
 t0 = time()
-pca = PCA(n_components=n_components, svd_solver='randomized',
-          whiten=True).fit(X_train)
+pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True).fit(X_train)
 print("done in %0.3fs" % (time() - t0))
 
 eigenfaces = pca.components_.reshape((n_components, h, w))
@@ -104,11 +105,11 @@
 
 print("Fitting the classifier to the training set")
 t0 = time()
-param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
-              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
-clf = GridSearchCV(
-    SVC(kernel='rbf', class_weight='balanced'), param_grid
-)
+param_grid = {
+    "C": [1e3, 5e3, 1e4, 5e4, 1e5],
+    "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
+}
+clf = GridSearchCV(SVC(kernel="rbf", class_weight="balanced"), param_grid)
 clf = clf.fit(X_train_pca, y_train)
 print("done in %0.3fs" % (time() - t0))
 print("Best estimator found by grid search:")
@@ -130,10 +131,11 @@
 # #############################################################################
 # Qualitative evaluation of the predictions using matplotlib
 
+
 def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
     """Helper function to plot a gallery of portraits"""
     plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
-    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
+    plt.subplots_adjust(bottom=0, left=0.01, right=0.99, top=0.90, hspace=0.35)
     for i in range(n_row * n_col):
         plt.subplot(n_row, n_col, i + 1)
         plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
@@ -144,14 +146,16 @@ def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
 
 # plot the result of the prediction on a portion of the test set
 
+
 def title(y_pred, y_test, target_names, i):
-    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
-    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
-    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)
+    pred_name = target_names[y_pred[i]].rsplit(" ", 1)[-1]
+    true_name = target_names[y_test[i]].rsplit(" ", 1)[-1]
+    return "predicted: %s\ntrue:      %s" % (pred_name, true_name)
 
 
-prediction_titles = [title(y_pred, y_test, target_names, i)
-                     for i in range(y_pred.shape[0])]
+prediction_titles = [
+    title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])
+]
 
 plot_gallery(X_test, prediction_titles, h, w)
 
diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py
index 5748a546bdaad..241d9d4e33cca 100644
--- a/examples/applications/plot_model_complexity_influence.py
+++ b/examples/applications/plot_model_complexity_influence.py
@@ -72,23 +72,21 @@
 
 def generate_data(case):
     """Generate regression/classification data."""
-    if case == 'regression':
+    if case == "regression":
         X, y = datasets.load_diabetes(return_X_y=True)
-    elif case == 'classification':
-        X, y = datasets.fetch_20newsgroups_vectorized(subset='all',
-                                                      return_X_y=True)
+    elif case == "classification":
+        X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
     X, y = shuffle(X, y)
     offset = int(X.shape[0] * 0.8)
     X_train, y_train = X[:offset], y[:offset]
     X_test, y_test = X[offset:], y[offset:]
 
-    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
-            'y_test': y_test}
+    data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
     return data
 
 
-regression_data = generate_data('regression')
-classification_data = generate_data('classification')
+regression_data = generate_data("regression")
+classification_data = generate_data("classification")
 
 
 ##############################################################################
@@ -110,26 +108,33 @@ def benchmark_influence(conf):
     prediction_times = []
     prediction_powers = []
     complexities = []
-    for param_value in conf['changing_param_values']:
-        conf['tuned_params'][conf['changing_param']] = param_value
-        estimator = conf['estimator'](**conf['tuned_params'])
+    for param_value in conf["changing_param_values"]:
+        conf["tuned_params"][conf["changing_param"]] = param_value
+        estimator = conf["estimator"](**conf["tuned_params"])
 
         print("Benchmarking %s" % estimator)
-        estimator.fit(conf['data']['X_train'], conf['data']['y_train'])
-        conf['postfit_hook'](estimator)
-        complexity = conf['complexity_computer'](estimator)
+        estimator.fit(conf["data"]["X_train"], conf["data"]["y_train"])
+        conf["postfit_hook"](estimator)
+        complexity = conf["complexity_computer"](estimator)
         complexities.append(complexity)
         start_time = time.time()
-        for _ in range(conf['n_samples']):
-            y_pred = estimator.predict(conf['data']['X_test'])
-        elapsed_time = (time.time() - start_time) / float(conf['n_samples'])
+        for _ in range(conf["n_samples"]):
+            y_pred = estimator.predict(conf["data"]["X_test"])
+        elapsed_time = (time.time() - start_time) / float(conf["n_samples"])
         prediction_times.append(elapsed_time)
-        pred_score = conf['prediction_performance_computer'](
-            conf['data']['y_test'], y_pred)
+        pred_score = conf["prediction_performance_computer"](
+            conf["data"]["y_test"], y_pred
+        )
         prediction_powers.append(pred_score)
-        print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % (
-            complexity, conf['prediction_performance_label'], pred_score,
-            elapsed_time))
+        print(
+            "Complexity: %d | %s: %.4f | Pred. Time: %fs\n"
+            % (
+                complexity,
+                conf["prediction_performance_label"],
+                pred_score,
+                elapsed_time,
+            )
+        )
     return prediction_powers, prediction_times, complexities
 
 
@@ -147,46 +152,58 @@ def benchmark_influence(conf):
 # different data.
 #
 
+
 def _count_nonzero_coefficients(estimator):
     a = estimator.coef_.toarray()
     return np.count_nonzero(a)
 
 
 configurations = [
-    {'estimator': SGDClassifier,
-     'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss':
-                      'modified_huber', 'fit_intercept': True, 'tol': 1e-3},
-     'changing_param': 'l1_ratio',
-     'changing_param_values': [0.25, 0.5, 0.75, 0.9],
-     'complexity_label': 'non_zero coefficients',
-     'complexity_computer': _count_nonzero_coefficients,
-     'prediction_performance_computer': hamming_loss,
-     'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)',
-     'postfit_hook': lambda x: x.sparsify(),
-     'data': classification_data,
-     'n_samples': 30},
-    {'estimator': NuSVR,
-     'tuned_params': {'C': 1e3, 'gamma': 2 ** -15},
-     'changing_param': 'nu',
-     'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9],
-     'complexity_label': 'n_support_vectors',
-     'complexity_computer': lambda x: len(x.support_vectors_),
-     'data': regression_data,
-     'postfit_hook': lambda x: x,
-     'prediction_performance_computer': mean_squared_error,
-     'prediction_performance_label': 'MSE',
-     'n_samples': 30},
-    {'estimator': GradientBoostingRegressor,
-     'tuned_params': {'loss': 'squared_error'},
-     'changing_param': 'n_estimators',
-     'changing_param_values': [10, 50, 100, 200, 500],
-     'complexity_label': 'n_trees',
-     'complexity_computer': lambda x: x.n_estimators,
-     'data': regression_data,
-     'postfit_hook': lambda x: x,
-     'prediction_performance_computer': mean_squared_error,
-     'prediction_performance_label': 'MSE',
-     'n_samples': 30},
+    {
+        "estimator": SGDClassifier,
+        "tuned_params": {
+            "penalty": "elasticnet",
+            "alpha": 0.001,
+            "loss": "modified_huber",
+            "fit_intercept": True,
+            "tol": 1e-3,
+        },
+        "changing_param": "l1_ratio",
+        "changing_param_values": [0.25, 0.5, 0.75, 0.9],
+        "complexity_label": "non_zero coefficients",
+        "complexity_computer": _count_nonzero_coefficients,
+        "prediction_performance_computer": hamming_loss,
+        "prediction_performance_label": "Hamming Loss (Misclassification Ratio)",
+        "postfit_hook": lambda x: x.sparsify(),
+        "data": classification_data,
+        "n_samples": 30,
+    },
+    {
+        "estimator": NuSVR,
+        "tuned_params": {"C": 1e3, "gamma": 2 ** -15},
+        "changing_param": "nu",
+        "changing_param_values": [0.1, 0.25, 0.5, 0.75, 0.9],
+        "complexity_label": "n_support_vectors",
+        "complexity_computer": lambda x: len(x.support_vectors_),
+        "data": regression_data,
+        "postfit_hook": lambda x: x,
+        "prediction_performance_computer": mean_squared_error,
+        "prediction_performance_label": "MSE",
+        "n_samples": 30,
+    },
+    {
+        "estimator": GradientBoostingRegressor,
+        "tuned_params": {"loss": "squared_error"},
+        "changing_param": "n_estimators",
+        "changing_param_values": [10, 50, 100, 200, 500],
+        "complexity_label": "n_trees",
+        "complexity_computer": lambda x: x.n_estimators,
+        "data": regression_data,
+        "postfit_hook": lambda x: x,
+        "prediction_performance_computer": mean_squared_error,
+        "prediction_performance_label": "MSE",
+        "n_samples": 30,
+    },
 ]
 
 
@@ -209,6 +226,7 @@ def _count_nonzero_coefficients(estimator):
 # ensemble is not as detrimental.
 #
 
+
 def plot_influence(conf, mse_values, prediction_times, complexities):
     """
     Plot influence of model complexity on both accuracy and latency.
@@ -219,38 +237,37 @@ def plot_influence(conf, mse_values, prediction_times, complexities):
 
     # first axes (prediction error)
     ax1 = fig.add_subplot(111)
-    line1 = ax1.plot(complexities, mse_values, c='tab:blue', ls='-')[0]
-    ax1.set_xlabel('Model Complexity (%s)' % conf['complexity_label'])
-    y1_label = conf['prediction_performance_label']
+    line1 = ax1.plot(complexities, mse_values, c="tab:blue", ls="-")[0]
+    ax1.set_xlabel("Model Complexity (%s)" % conf["complexity_label"])
+    y1_label = conf["prediction_performance_label"]
     ax1.set_ylabel(y1_label)
 
-    ax1.spines['left'].set_color(line1.get_color())
+    ax1.spines["left"].set_color(line1.get_color())
     ax1.yaxis.label.set_color(line1.get_color())
-    ax1.tick_params(axis='y', colors=line1.get_color())
+    ax1.tick_params(axis="y", colors=line1.get_color())
 
     # second axes (latency)
     ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
-    line2 = ax2.plot(complexities, prediction_times, c='tab:orange', ls='-')[0]
+    line2 = ax2.plot(complexities, prediction_times, c="tab:orange", ls="-")[0]
     ax2.yaxis.tick_right()
     ax2.yaxis.set_label_position("right")
     y2_label = "Time (s)"
     ax2.set_ylabel(y2_label)
-    ax1.spines['right'].set_color(line2.get_color())
+    ax1.spines["right"].set_color(line2.get_color())
     ax2.yaxis.label.set_color(line2.get_color())
-    ax2.tick_params(axis='y', colors=line2.get_color())
+    ax2.tick_params(axis="y", colors=line2.get_color())
 
-    plt.legend((line1, line2), ("prediction error", "latency"),
-               loc='upper right')
+    plt.legend((line1, line2), ("prediction error", "latency"), loc="upper right")
 
-    plt.title("Influence of varying '%s' on %s" % (conf['changing_param'],
-                                                   conf['estimator'].__name__))
+    plt.title(
+        "Influence of varying '%s' on %s"
+        % (conf["changing_param"], conf["estimator"].__name__)
+    )
 
 
 for conf in configurations:
-    prediction_performances, prediction_times, complexities = \
-        benchmark_influence(conf)
-    plot_influence(conf, prediction_performances, prediction_times,
-                   complexities)
+    prediction_performances, prediction_times, complexities = benchmark_influence(conf)
+    plot_influence(conf, prediction_performances, prediction_times, complexities)
 plt.show()
 
 
diff --git a/examples/applications/plot_out_of_core_classification.py b/examples/applications/plot_out_of_core_classification.py
index 62cf00c4a0daa..287188c35b807 100644
--- a/examples/applications/plot_out_of_core_classification.py
+++ b/examples/applications/plot_out_of_core_classification.py
@@ -41,7 +41,8 @@
 
 def _not_in_sphinx():
     # Hack to detect whether we are running by the sphinx builder
-    return '__file__' in globals()
+    return "__file__" in globals()
+
 
 # %%
 # Reuters Dataset related routines
@@ -55,17 +56,17 @@ def _not_in_sphinx():
 class ReutersParser(HTMLParser):
     """Utility class to parse a SGML file and yield documents one at a time."""
 
-    def __init__(self, encoding='latin-1'):
+    def __init__(self, encoding="latin-1"):
         HTMLParser.__init__(self)
         self._reset()
         self.encoding = encoding
 
     def handle_starttag(self, tag, attrs):
-        method = 'start_' + tag
+        method = "start_" + tag
         getattr(self, method, lambda x: None)(attrs)
 
     def handle_endtag(self, tag):
-        method = 'end_' + tag
+        method = "end_" + tag
         getattr(self, method, lambda: None)()
 
     def _reset(self):
@@ -99,10 +100,10 @@ def start_reuters(self, attributes):
         pass
 
     def end_reuters(self):
-        self.body = re.sub(r'\s+', r' ', self.body)
-        self.docs.append({'title': self.title,
-                          'body': self.body,
-                          'topics': self.topics})
+        self.body = re.sub(r"\s+", r" ", self.body)
+        self.docs.append(
+            {"title": self.title, "body": self.body, "topics": self.topics}
+        )
         self._reset()
 
     def start_title(self, attributes):
@@ -143,37 +144,36 @@ def stream_reuters_documents(data_path=None):
 
     """
 
-    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
-                    'reuters21578-mld/reuters21578.tar.gz')
-    ARCHIVE_FILENAME = 'reuters21578.tar.gz'
+    DOWNLOAD_URL = (
+        "http://archive.ics.uci.edu/ml/machine-learning-databases/"
+        "reuters21578-mld/reuters21578.tar.gz"
+    )
+    ARCHIVE_FILENAME = "reuters21578.tar.gz"
 
     if data_path is None:
         data_path = os.path.join(get_data_home(), "reuters")
     if not os.path.exists(data_path):
         """Download the dataset."""
-        print("downloading dataset (once and for all) into %s" %
-              data_path)
+        print("downloading dataset (once and for all) into %s" % data_path)
         os.mkdir(data_path)
 
         def progress(blocknum, bs, size):
-            total_sz_mb = '%.2f MB' % (size / 1e6)
-            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
+            total_sz_mb = "%.2f MB" % (size / 1e6)
+            current_sz_mb = "%.2f MB" % ((blocknum * bs) / 1e6)
             if _not_in_sphinx():
-                sys.stdout.write(
-                    '\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb))
+                sys.stdout.write("\rdownloaded %s / %s" % (current_sz_mb, total_sz_mb))
 
         archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
-        urlretrieve(DOWNLOAD_URL, filename=archive_path,
-                    reporthook=progress)
+        urlretrieve(DOWNLOAD_URL, filename=archive_path, reporthook=progress)
         if _not_in_sphinx():
-            sys.stdout.write('\r')
+            sys.stdout.write("\r")
         print("untarring Reuters dataset...")
-        tarfile.open(archive_path, 'r:gz').extractall(data_path)
+        tarfile.open(archive_path, "r:gz").extractall(data_path)
         print("done.")
 
     parser = ReutersParser()
     for filename in glob(os.path.join(data_path, "*.sgm")):
-        for doc in parser.parse(open(filename, 'rb')):
+        for doc in parser.parse(open(filename, "rb")):
             yield doc
 
 
@@ -184,8 +184,9 @@ def progress(blocknum, bs, size):
 # Create the vectorizer and limit the number of features to a reasonable
 # maximum
 
-vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,
-                               alternate_sign=False)
+vectorizer = HashingVectorizer(
+    decode_error="ignore", n_features=2 ** 18, alternate_sign=False
+)
 
 
 # Iterator over parsed Reuters SGML files.
@@ -196,14 +197,14 @@ def progress(blocknum, bs, size):
 # files. For other datasets, one should take care of creating a test set with
 # a realistic portion of positive instances.
 all_classes = np.array([0, 1])
-positive_class = 'acq'
+positive_class = "acq"
 
 # Here are some classifiers that support the `partial_fit` method
 partial_fit_classifiers = {
-    'SGD': SGDClassifier(max_iter=5),
-    'Perceptron': Perceptron(),
-    'NB Multinomial': MultinomialNB(alpha=0.01),
-    'Passive-Aggressive': PassiveAggressiveClassifier(),
+    "SGD": SGDClassifier(max_iter=5),
+    "Perceptron": Perceptron(),
+    "NB Multinomial": MultinomialNB(alpha=0.01),
+    "Passive-Aggressive": PassiveAggressiveClassifier(),
 }
 
 
@@ -213,9 +214,11 @@ def get_minibatch(doc_iter, size, pos_class=positive_class):
     Note: size is before excluding invalid docs with no topics assigned.
 
     """
-    data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
-            for doc in itertools.islice(doc_iter, size)
-            if doc['topics']]
+    data = [
+        ("{title}\n\n{body}".format(**doc), pos_class in doc["topics"])
+        for doc in itertools.islice(doc_iter, size)
+        if doc["topics"]
+    ]
     if not len(data):
         return np.asarray([], dtype=int), np.asarray([], dtype=int)
     X_text, y = zip(*data)
@@ -231,7 +234,7 @@ def iter_minibatches(doc_iter, minibatch_size):
 
 
 # test data statistics
-test_stats = {'n_test': 0, 'n_test_pos': 0}
+test_stats = {"n_test": 0, "n_test_pos": 0}
 
 # First we hold out a number of examples to estimate accuracy
 n_test_documents = 1000
@@ -241,28 +244,34 @@ def iter_minibatches(doc_iter, minibatch_size):
 tick = time.time()
 X_test = vectorizer.transform(X_test_text)
 vectorizing_time = time.time() - tick
-test_stats['n_test'] += len(y_test)
-test_stats['n_test_pos'] += sum(y_test)
+test_stats["n_test"] += len(y_test)
+test_stats["n_test_pos"] += sum(y_test)
 print("Test set is %d documents (%d positive)" % (len(y_test), sum(y_test)))
 
 
 def progress(cls_name, stats):
     """Report progress information, return a string."""
-    duration = time.time() - stats['t0']
+    duration = time.time() - stats["t0"]
     s = "%20s classifier : \t" % cls_name
     s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
     s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats
     s += "accuracy: %(accuracy).3f " % stats
-    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
+    s += "in %.2fs (%5d docs/s)" % (duration, stats["n_train"] / duration)
     return s
 
 
 cls_stats = {}
 
 for cls_name in partial_fit_classifiers:
-    stats = {'n_train': 0, 'n_train_pos': 0,
-             'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),
-             'runtime_history': [(0, 0)], 'total_fit_time': 0.0}
+    stats = {
+        "n_train": 0,
+        "n_train_pos": 0,
+        "accuracy": 0.0,
+        "accuracy_history": [(0, 0)],
+        "t0": time.time(),
+        "runtime_history": [(0, 0)],
+        "total_fit_time": 0.0,
+    }
     cls_stats[cls_name] = stats
 
 get_minibatch(data_stream, n_test_documents)
@@ -291,23 +300,24 @@ def progress(cls_name, stats):
         cls.partial_fit(X_train, y_train, classes=all_classes)
 
         # accumulate test accuracy stats
-        cls_stats[cls_name]['total_fit_time'] += time.time() - tick
-        cls_stats[cls_name]['n_train'] += X_train.shape[0]
-        cls_stats[cls_name]['n_train_pos'] += sum(y_train)
+        cls_stats[cls_name]["total_fit_time"] += time.time() - tick
+        cls_stats[cls_name]["n_train"] += X_train.shape[0]
+        cls_stats[cls_name]["n_train_pos"] += sum(y_train)
         tick = time.time()
-        cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)
-        cls_stats[cls_name]['prediction_time'] = time.time() - tick
-        acc_history = (cls_stats[cls_name]['accuracy'],
-                       cls_stats[cls_name]['n_train'])
-        cls_stats[cls_name]['accuracy_history'].append(acc_history)
-        run_history = (cls_stats[cls_name]['accuracy'],
-                       total_vect_time + cls_stats[cls_name]['total_fit_time'])
-        cls_stats[cls_name]['runtime_history'].append(run_history)
+        cls_stats[cls_name]["accuracy"] = cls.score(X_test, y_test)
+        cls_stats[cls_name]["prediction_time"] = time.time() - tick
+        acc_history = (cls_stats[cls_name]["accuracy"], cls_stats[cls_name]["n_train"])
+        cls_stats[cls_name]["accuracy_history"].append(acc_history)
+        run_history = (
+            cls_stats[cls_name]["accuracy"],
+            total_vect_time + cls_stats[cls_name]["total_fit_time"],
+        )
+        cls_stats[cls_name]["runtime_history"].append(run_history)
 
         if i % 3 == 0:
             print(progress(cls_name, cls_stats[cls_name]))
     if i % 3 == 0:
-        print('\n')
+        print("\n")
 
 
 # %%
@@ -326,64 +336,66 @@ def plot_accuracy(x, y, x_legend):
     """Plot accuracy as a function of x."""
     x = np.array(x)
     y = np.array(y)
-    plt.title('Classification accuracy as a function of %s' % x_legend)
-    plt.xlabel('%s' % x_legend)
-    plt.ylabel('Accuracy')
+    plt.title("Classification accuracy as a function of %s" % x_legend)
+    plt.xlabel("%s" % x_legend)
+    plt.ylabel("Accuracy")
     plt.grid(True)
     plt.plot(x, y)
 
 
-rcParams['legend.fontsize'] = 10
+rcParams["legend.fontsize"] = 10
 cls_names = list(sorted(cls_stats.keys()))
 
 # Plot accuracy evolution
 plt.figure()
 for _, stats in sorted(cls_stats.items()):
     # Plot accuracy evolution with #examples
-    accuracy, n_examples = zip(*stats['accuracy_history'])
+    accuracy, n_examples = zip(*stats["accuracy_history"])
     plot_accuracy(n_examples, accuracy, "training examples (#)")
     ax = plt.gca()
     ax.set_ylim((0.8, 1))
-plt.legend(cls_names, loc='best')
+plt.legend(cls_names, loc="best")
 
 plt.figure()
 for _, stats in sorted(cls_stats.items()):
     # Plot accuracy evolution with runtime
-    accuracy, runtime = zip(*stats['runtime_history'])
-    plot_accuracy(runtime, accuracy, 'runtime (s)')
+    accuracy, runtime = zip(*stats["runtime_history"])
+    plot_accuracy(runtime, accuracy, "runtime (s)")
     ax = plt.gca()
     ax.set_ylim((0.8, 1))
-plt.legend(cls_names, loc='best')
+plt.legend(cls_names, loc="best")
 
 # Plot fitting times
 plt.figure()
 fig = plt.gcf()
-cls_runtime = [stats['total_fit_time']
-               for cls_name, stats in sorted(cls_stats.items())]
+cls_runtime = [stats["total_fit_time"] for cls_name, stats in sorted(cls_stats.items())]
 
 cls_runtime.append(total_vect_time)
-cls_names.append('Vectorization')
-bar_colors = ['b', 'g', 'r', 'c', 'm', 'y']
+cls_names.append("Vectorization")
+bar_colors = ["b", "g", "r", "c", "m", "y"]
 
 ax = plt.subplot(111)
-rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5,
-                     color=bar_colors)
+rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5, color=bar_colors)
 
 ax.set_xticks(np.linspace(0, len(cls_names) - 1, len(cls_names)))
 ax.set_xticklabels(cls_names, fontsize=10)
 ymax = max(cls_runtime) * 1.2
 ax.set_ylim((0, ymax))
-ax.set_ylabel('runtime (s)')
-ax.set_title('Training Times')
+ax.set_ylabel("runtime (s)")
+ax.set_title("Training Times")
 
 
 def autolabel(rectangles):
     """attach some text vi autolabel on rectangles."""
     for rect in rectangles:
         height = rect.get_height()
-        ax.text(rect.get_x() + rect.get_width() / 2.,
-                1.05 * height, '%.4f' % height,
-                ha='center', va='bottom')
+        ax.text(
+            rect.get_x() + rect.get_width() / 2.0,
+            1.05 * height,
+            "%.4f" % height,
+            ha="center",
+            va="bottom",
+        )
         plt.setp(plt.xticks()[1], rotation=30)
 
 
@@ -396,23 +408,22 @@ def autolabel(rectangles):
 cls_runtime = []
 cls_names = list(sorted(cls_stats.keys()))
 for cls_name, stats in sorted(cls_stats.items()):
-    cls_runtime.append(stats['prediction_time'])
+    cls_runtime.append(stats["prediction_time"])
 cls_runtime.append(parsing_time)
-cls_names.append('Read/Parse\n+Feat.Extr.')
+cls_names.append("Read/Parse\n+Feat.Extr.")
 cls_runtime.append(vectorizing_time)
-cls_names.append('Hashing\n+Vect.')
+cls_names.append("Hashing\n+Vect.")
 
 ax = plt.subplot(111)
-rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5,
-                     color=bar_colors)
+rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5, color=bar_colors)
 
 ax.set_xticks(np.linspace(0, len(cls_names) - 1, len(cls_names)))
 ax.set_xticklabels(cls_names, fontsize=8)
 plt.setp(plt.xticks()[1], rotation=30)
 ymax = max(cls_runtime) * 1.2
 ax.set_ylim((0, ymax))
-ax.set_ylabel('runtime (s)')
-ax.set_title('Prediction Times (%d instances)' % n_test_documents)
+ax.set_ylabel("runtime (s)")
+ax.set_title("Prediction Times (%d instances)" % n_test_documents)
 autolabel(rectangles)
 plt.tight_layout()
 plt.show()
diff --git a/examples/applications/plot_outlier_detection_wine.py b/examples/applications/plot_outlier_detection_wine.py
index 49210b800d1b6..182f613d11eaa 100644
--- a/examples/applications/plot_outlier_detection_wine.py
+++ b/examples/applications/plot_outlier_detection_wine.py
@@ -47,17 +47,18 @@
 
 # Define "classifiers" to be used
 classifiers = {
-    "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
-                                             contamination=0.25),
-    "Robust Covariance (Minimum Covariance Determinant)":
-    EllipticEnvelope(contamination=0.25),
-    "OCSVM": OneClassSVM(nu=0.25, gamma=0.35)}
-colors = ['m', 'g', 'b']
+    "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25),
+    "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope(
+        contamination=0.25
+    ),
+    "OCSVM": OneClassSVM(nu=0.25, gamma=0.35),
+}
+colors = ["m", "g", "b"]
 legend1 = {}
 legend2 = {}
 
 # Get data
-X1 = load_wine()['data'][:, [1, 2]]  # two clusters
+X1 = load_wine()["data"][:, [1, 2]]  # two clusters
 
 # Learn a frontier for outlier detection with several classifiers
 xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
@@ -67,7 +68,8 @@
     Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
     Z1 = Z1.reshape(xx1.shape)
     legend1[clf_name] = plt.contour(
-        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i])
+        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]
+    )
 
 legend1_values_list = list(legend1.values())
 legend1_keys_list = list(legend1.keys())
@@ -75,20 +77,30 @@
 # Plot the results (= shape of the data points cloud)
 plt.figure(1)  # two clusters
 plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X1[:, 0], X1[:, 1], color='black')
+plt.scatter(X1[:, 0], X1[:, 1], color="black")
 bbox_args = dict(boxstyle="round", fc="0.8")
 arrow_args = dict(arrowstyle="->")
-plt.annotate("outlying points", xy=(4, 2),
-             xycoords="data", textcoords="data",
-             xytext=(3, 1.25), bbox=bbox_args, arrowprops=arrow_args)
+plt.annotate(
+    "outlying points",
+    xy=(4, 2),
+    xycoords="data",
+    textcoords="data",
+    xytext=(3, 1.25),
+    bbox=bbox_args,
+    arrowprops=arrow_args,
+)
 plt.xlim((xx1.min(), xx1.max()))
 plt.ylim((yy1.min(), yy1.max()))
-plt.legend((legend1_values_list[0].collections[0],
-            legend1_values_list[1].collections[0],
-            legend1_values_list[2].collections[0]),
-           (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
-           loc="upper center",
-           prop=matplotlib.font_manager.FontProperties(size=11))
+plt.legend(
+    (
+        legend1_values_list[0].collections[0],
+        legend1_values_list[1].collections[0],
+        legend1_values_list[2].collections[0],
+    ),
+    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
+    loc="upper center",
+    prop=matplotlib.font_manager.FontProperties(size=11),
+)
 plt.ylabel("ash")
 plt.xlabel("malic_acid")
 
@@ -107,7 +119,7 @@
 # the data scatter matrix and the risk of over-fitting the data.
 
 # Get data
-X2 = load_wine()['data'][:, [6, 9]]  # "banana"-shaped
+X2 = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
 
 # Learn a frontier for outlier detection with several classifiers
 xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
@@ -117,7 +129,8 @@
     Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
     Z2 = Z2.reshape(xx2.shape)
     legend2[clf_name] = plt.contour(
-        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i])
+        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]
+    )
 
 legend2_values_list = list(legend2.values())
 legend2_keys_list = list(legend2.keys())
@@ -125,15 +138,19 @@
 # Plot the results (= shape of the data points cloud)
 plt.figure(2)  # "banana" shape
 plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X2[:, 0], X2[:, 1], color='black')
+plt.scatter(X2[:, 0], X2[:, 1], color="black")
 plt.xlim((xx2.min(), xx2.max()))
 plt.ylim((yy2.min(), yy2.max()))
-plt.legend((legend2_values_list[0].collections[0],
-            legend2_values_list[1].collections[0],
-            legend2_values_list[2].collections[0]),
-           (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
-           loc="upper center",
-           prop=matplotlib.font_manager.FontProperties(size=11))
+plt.legend(
+    (
+        legend2_values_list[0].collections[0],
+        legend2_values_list[1].collections[0],
+        legend2_values_list[2].collections[0],
+    ),
+    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
+    loc="upper center",
+    prop=matplotlib.font_manager.FontProperties(size=11),
+)
 plt.ylabel("color_intensity")
 plt.xlabel("flavanoids")
 
diff --git a/examples/applications/plot_prediction_latency.py b/examples/applications/plot_prediction_latency.py
index e59402e47fe17..b9780e7974776 100644
--- a/examples/applications/plot_prediction_latency.py
+++ b/examples/applications/plot_prediction_latency.py
@@ -35,7 +35,7 @@
 
 def _not_in_sphinx():
     # Hack to detect whether we are running by the sphinx builder
-    return '__file__' in globals()
+    return "__file__" in globals()
 
 
 def atomic_benchmark_estimator(estimator, X_test, verbose=False):
@@ -48,8 +48,12 @@ def atomic_benchmark_estimator(estimator, X_test, verbose=False):
         estimator.predict(instance)
         runtimes[i] = time.time() - start
     if verbose:
-        print("atomic_benchmark runtimes:", min(runtimes), np.percentile(
-            runtimes, 50), max(runtimes))
+        print(
+            "atomic_benchmark runtimes:",
+            min(runtimes),
+            np.percentile(runtimes, 50),
+            max(runtimes),
+        )
     return runtimes
 
 
@@ -63,8 +67,12 @@ def bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose):
         runtimes[i] = time.time() - start
     runtimes = np.array(list(map(lambda x: x / float(n_instances), runtimes)))
     if verbose:
-        print("bulk_benchmark runtimes:", min(runtimes), np.percentile(
-            runtimes, 50), max(runtimes))
+        print(
+            "bulk_benchmark runtimes:",
+            min(runtimes),
+            np.percentile(runtimes, 50),
+            max(runtimes),
+        )
     return runtimes
 
 
@@ -85,8 +93,7 @@ def benchmark_estimator(estimator, X_test, n_bulk_repeats=30, verbose=False):
 
     """
     atomic_runtimes = atomic_benchmark_estimator(estimator, X_test, verbose)
-    bulk_runtimes = bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats,
-                                             verbose)
+    bulk_runtimes = bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose)
     return atomic_runtimes, bulk_runtimes
 
 
@@ -95,12 +102,14 @@ def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
     if verbose:
         print("generating dataset...")
 
-    X, y, coef = make_regression(n_samples=n_train + n_test,
-                                 n_features=n_features, noise=noise, coef=True)
+    X, y, coef = make_regression(
+        n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True
+    )
 
     random_seed = 13
     X_train, X_test, y_train, y_test = train_test_split(
-        X, y, train_size=n_train, test_size=n_test, random_state=random_seed)
+        X, y, train_size=n_train, test_size=n_test, random_state=random_seed
+    )
     X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)
 
     X_scaler = StandardScaler()
@@ -130,26 +139,32 @@ def boxplot_runtimes(runtimes, pred_type, configuration):
     """
 
     fig, ax1 = plt.subplots(figsize=(10, 6))
-    bp = plt.boxplot(runtimes, )
-
-    cls_infos = ['%s\n(%d %s)' % (estimator_conf['name'],
-                                  estimator_conf['complexity_computer'](
-                                      estimator_conf['instance']),
-                                  estimator_conf['complexity_label']) for
-                 estimator_conf in configuration['estimators']]
+    bp = plt.boxplot(
+        runtimes,
+    )
+
+    cls_infos = [
+        "%s\n(%d %s)"
+        % (
+            estimator_conf["name"],
+            estimator_conf["complexity_computer"](estimator_conf["instance"]),
+            estimator_conf["complexity_label"],
+        )
+        for estimator_conf in configuration["estimators"]
+    ]
     plt.setp(ax1, xticklabels=cls_infos)
-    plt.setp(bp['boxes'], color='black')
-    plt.setp(bp['whiskers'], color='black')
-    plt.setp(bp['fliers'], color='red', marker='+')
+    plt.setp(bp["boxes"], color="black")
+    plt.setp(bp["whiskers"], color="black")
+    plt.setp(bp["fliers"], color="red", marker="+")
 
-    ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-                   alpha=0.5)
+    ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
 
     ax1.set_axisbelow(True)
-    ax1.set_title('Prediction Time per Instance - %s, %d feats.' % (
-        pred_type.capitalize(),
-        configuration['n_features']))
-    ax1.set_ylabel('Prediction Time (us)')
+    ax1.set_title(
+        "Prediction Time per Instance - %s, %d feats."
+        % (pred_type.capitalize(), configuration["n_features"])
+    )
+    ax1.set_ylabel("Prediction Time (us)")
 
     plt.show()
 
@@ -157,24 +172,24 @@ def boxplot_runtimes(runtimes, pred_type, configuration):
 def benchmark(configuration):
     """Run the whole benchmark."""
     X_train, y_train, X_test, y_test = generate_dataset(
-        configuration['n_train'], configuration['n_test'],
-        configuration['n_features'])
+        configuration["n_train"], configuration["n_test"], configuration["n_features"]
+    )
 
     stats = {}
-    for estimator_conf in configuration['estimators']:
-        print("Benchmarking", estimator_conf['instance'])
-        estimator_conf['instance'].fit(X_train, y_train)
+    for estimator_conf in configuration["estimators"]:
+        print("Benchmarking", estimator_conf["instance"])
+        estimator_conf["instance"].fit(X_train, y_train)
         gc.collect()
-        a, b = benchmark_estimator(estimator_conf['instance'], X_test)
-        stats[estimator_conf['name']] = {'atomic': a, 'bulk': b}
+        a, b = benchmark_estimator(estimator_conf["instance"], X_test)
+        stats[estimator_conf["name"]] = {"atomic": a, "bulk": b}
 
-    cls_names = [estimator_conf['name'] for estimator_conf in configuration[
-        'estimators']]
-    runtimes = [1e6 * stats[clf_name]['atomic'] for clf_name in cls_names]
-    boxplot_runtimes(runtimes, 'atomic', configuration)
-    runtimes = [1e6 * stats[clf_name]['bulk'] for clf_name in cls_names]
-    boxplot_runtimes(runtimes, 'bulk (%d)' % configuration['n_test'],
-                     configuration)
+    cls_names = [
+        estimator_conf["name"] for estimator_conf in configuration["estimators"]
+    ]
+    runtimes = [1e6 * stats[clf_name]["atomic"] for clf_name in cls_names]
+    boxplot_runtimes(runtimes, "atomic", configuration)
+    runtimes = [1e6 * stats[clf_name]["bulk"] for clf_name in cls_names]
+    boxplot_runtimes(runtimes, "bulk (%d)" % configuration["n_test"], configuration)
 
 
 def n_feature_influence(estimators, n_train, n_test, n_features, percentile):
@@ -205,62 +220,72 @@ def n_feature_influence(estimators, n_train, n_test, n_features, percentile):
             estimator.fit(X_train, y_train)
             gc.collect()
             runtimes = bulk_benchmark_estimator(estimator, X_test, 30, False)
-            percentiles[cls_name][n] = 1e6 * np.percentile(runtimes,
-                                                           percentile)
+            percentiles[cls_name][n] = 1e6 * np.percentile(runtimes, percentile)
     return percentiles
 
 
 def plot_n_features_influence(percentiles, percentile):
     fig, ax1 = plt.subplots(figsize=(10, 6))
-    colors = ['r', 'g', 'b']
+    colors = ["r", "g", "b"]
     for i, cls_name in enumerate(percentiles.keys()):
         x = np.array(sorted([n for n in percentiles[cls_name].keys()]))
         y = np.array([percentiles[cls_name][n] for n in x])
-        plt.plot(x, y, color=colors[i], )
-    ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-                   alpha=0.5)
+        plt.plot(
+            x,
+            y,
+            color=colors[i],
+        )
+    ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
     ax1.set_axisbelow(True)
-    ax1.set_title('Evolution of Prediction Time with #Features')
-    ax1.set_xlabel('#Features')
-    ax1.set_ylabel('Prediction Time at %d%%-ile (us)' % percentile)
+    ax1.set_title("Evolution of Prediction Time with #Features")
+    ax1.set_xlabel("#Features")
+    ax1.set_ylabel("Prediction Time at %d%%-ile (us)" % percentile)
     plt.show()
 
 
 def benchmark_throughputs(configuration, duration_secs=0.1):
     """benchmark throughput for different estimators."""
     X_train, y_train, X_test, y_test = generate_dataset(
-        configuration['n_train'], configuration['n_test'],
-        configuration['n_features'])
+        configuration["n_train"], configuration["n_test"], configuration["n_features"]
+    )
     throughputs = dict()
-    for estimator_config in configuration['estimators']:
-        estimator_config['instance'].fit(X_train, y_train)
+    for estimator_config in configuration["estimators"]:
+        estimator_config["instance"].fit(X_train, y_train)
         start_time = time.time()
         n_predictions = 0
         while (time.time() - start_time) < duration_secs:
-            estimator_config['instance'].predict(X_test[[0]])
+            estimator_config["instance"].predict(X_test[[0]])
             n_predictions += 1
-        throughputs[estimator_config['name']] = n_predictions / duration_secs
+        throughputs[estimator_config["name"]] = n_predictions / duration_secs
     return throughputs
 
 
 def plot_benchmark_throughput(throughputs, configuration):
     fig, ax = plt.subplots(figsize=(10, 6))
-    colors = ['r', 'g', 'b']
-    cls_infos = ['%s\n(%d %s)' % (estimator_conf['name'],
-                                  estimator_conf['complexity_computer'](
-                                      estimator_conf['instance']),
-                                  estimator_conf['complexity_label']) for
-                 estimator_conf in configuration['estimators']]
-    cls_values = [throughputs[estimator_conf['name']] for estimator_conf in
-                  configuration['estimators']]
+    colors = ["r", "g", "b"]
+    cls_infos = [
+        "%s\n(%d %s)"
+        % (
+            estimator_conf["name"],
+            estimator_conf["complexity_computer"](estimator_conf["instance"]),
+            estimator_conf["complexity_label"],
+        )
+        for estimator_conf in configuration["estimators"]
+    ]
+    cls_values = [
+        throughputs[estimator_conf["name"]]
+        for estimator_conf in configuration["estimators"]
+    ]
     plt.bar(range(len(throughputs)), cls_values, width=0.5, color=colors)
     ax.set_xticks(np.linspace(0.25, len(throughputs) - 0.75, len(throughputs)))
     ax.set_xticklabels(cls_infos, fontsize=10)
     ymax = max(cls_values) * 1.2
     ax.set_ylim((0, ymax))
-    ax.set_ylabel('Throughput (predictions/sec)')
-    ax.set_title('Prediction Throughput for different estimators (%d '
-                 'features)' % configuration['n_features'])
+    ax.set_ylabel("Throughput (predictions/sec)")
+    ax.set_title(
+        "Prediction Throughput for different estimators (%d features)"
+        % configuration["n_features"]
+    )
     plt.show()
 
 
@@ -272,33 +297,43 @@ def plot_benchmark_throughput(throughputs, configuration):
 # #############################################################################
 # Benchmark bulk/atomic prediction speed for various regressors
 configuration = {
-    'n_train': int(1e3),
-    'n_test': int(1e2),
-    'n_features': int(1e2),
-    'estimators': [
-        {'name': 'Linear Model',
-         'instance': SGDRegressor(penalty='elasticnet', alpha=0.01,
-                                  l1_ratio=0.25, tol=1e-4),
-         'complexity_label': 'non-zero coefficients',
-         'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)},
-        {'name': 'RandomForest',
-         'instance': RandomForestRegressor(),
-         'complexity_label': 'estimators',
-         'complexity_computer': lambda clf: clf.n_estimators},
-        {'name': 'SVR',
-         'instance': SVR(kernel='rbf'),
-         'complexity_label': 'support vectors',
-         'complexity_computer': lambda clf: len(clf.support_vectors_)},
-    ]
+    "n_train": int(1e3),
+    "n_test": int(1e2),
+    "n_features": int(1e2),
+    "estimators": [
+        {
+            "name": "Linear Model",
+            "instance": SGDRegressor(
+                penalty="elasticnet", alpha=0.01, l1_ratio=0.25, tol=1e-4
+            ),
+            "complexity_label": "non-zero coefficients",
+            "complexity_computer": lambda clf: np.count_nonzero(clf.coef_),
+        },
+        {
+            "name": "RandomForest",
+            "instance": RandomForestRegressor(),
+            "complexity_label": "estimators",
+            "complexity_computer": lambda clf: clf.n_estimators,
+        },
+        {
+            "name": "SVR",
+            "instance": SVR(kernel="rbf"),
+            "complexity_label": "support vectors",
+            "complexity_computer": lambda clf: len(clf.support_vectors_),
+        },
+    ],
 }
 benchmark(configuration)
 
 # benchmark n_features influence on prediction speed
 percentile = 90
-percentiles = n_feature_influence({'ridge': Ridge()},
-                                  configuration['n_train'],
-                                  configuration['n_test'],
-                                  [100, 250, 500], percentile)
+percentiles = n_feature_influence(
+    {"ridge": Ridge()},
+    configuration["n_train"],
+    configuration["n_test"],
+    [100, 250, 500],
+    percentile,
+)
 plot_n_features_influence(percentiles, percentile)
 
 # benchmark throughput
diff --git a/examples/applications/plot_species_distribution_modeling.py b/examples/applications/plot_species_distribution_modeling.py
index 4216f4fda0c2f..6dac08fe1942c 100644
--- a/examples/applications/plot_species_distribution_modeling.py
+++ b/examples/applications/plot_species_distribution_modeling.py
@@ -53,6 +53,7 @@
 # otherwise, we'll improvise later...
 try:
     from mpl_toolkits.basemap import Basemap
+
     basemap = True
 except ImportError:
     basemap = False
@@ -93,31 +94,34 @@ def create_species_bunch(species_name, train, test, coverages, xgrid, ygrid):
     This will use the test/train record arrays to extract the
     data specific to the given species name.
     """
-    bunch = Bunch(name=' '.join(species_name.split("_")[:2]))
-    species_name = species_name.encode('ascii')
+    bunch = Bunch(name=" ".join(species_name.split("_")[:2]))
+    species_name = species_name.encode("ascii")
     points = dict(test=test, train=train)
 
     for label, pts in points.items():
         # choose points associated with the desired species
-        pts = pts[pts['species'] == species_name]
-        bunch['pts_%s' % label] = pts
+        pts = pts[pts["species"] == species_name]
+        bunch["pts_%s" % label] = pts
 
         # determine coverage values for each of the training & testing points
-        ix = np.searchsorted(xgrid, pts['dd long'])
-        iy = np.searchsorted(ygrid, pts['dd lat'])
-        bunch['cov_%s' % label] = coverages[:, -iy, ix].T
+        ix = np.searchsorted(xgrid, pts["dd long"])
+        iy = np.searchsorted(ygrid, pts["dd lat"])
+        bunch["cov_%s" % label] = coverages[:, -iy, ix].T
 
     return bunch
 
 
-def plot_species_distribution(species=("bradypus_variegatus_0",
-                                       "microryzomys_minutus_0")):
+def plot_species_distribution(
+    species=("bradypus_variegatus_0", "microryzomys_minutus_0")
+):
     """
     Plot the species distribution.
     """
     if len(species) > 2:
-        print("Note: when more than two species are provided,"
-              " only the first two will be used")
+        print(
+            "Note: when more than two species are provided,"
+            " only the first two will be used"
+        )
 
     t0 = time()
 
@@ -131,19 +135,19 @@ def plot_species_distribution(species=("bradypus_variegatus_0",
     X, Y = np.meshgrid(xgrid, ygrid[::-1])
 
     # create a bunch for each species
-    BV_bunch = create_species_bunch(species[0],
-                                    data.train, data.test,
-                                    data.coverages, xgrid, ygrid)
-    MM_bunch = create_species_bunch(species[1],
-                                    data.train, data.test,
-                                    data.coverages, xgrid, ygrid)
+    BV_bunch = create_species_bunch(
+        species[0], data.train, data.test, data.coverages, xgrid, ygrid
+    )
+    MM_bunch = create_species_bunch(
+        species[1], data.train, data.test, data.coverages, xgrid, ygrid
+    )
 
     # background points (grid coordinates) for evaluation
     np.random.seed(13)
-    background_points = np.c_[np.random.randint(low=0, high=data.Ny,
-                                                size=10000),
-                              np.random.randint(low=0, high=data.Nx,
-                                                size=10000)].T
+    background_points = np.c_[
+        np.random.randint(low=0, high=data.Ny, size=10000),
+        np.random.randint(low=0, high=data.Nx, size=10000),
+    ].T
 
     # We'll make use of the fact that coverages[6] has measurements at all
     # land points.  This will help us decide between land and water.
@@ -160,7 +164,7 @@ def plot_species_distribution(species=("bradypus_variegatus_0",
         train_cover_std = (species.cov_train - mean) / std
 
         # Fit OneClassSVM
-        print(" - fit OneClassSVM ... ", end='')
+        print(" - fit OneClassSVM ... ", end="")
         clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
         clf.fit(train_cover_std)
         print("done.")
@@ -169,16 +173,21 @@ def plot_species_distribution(species=("bradypus_variegatus_0",
         plt.subplot(1, 2, i + 1)
         if basemap:
             print(" - plot coastlines using basemap")
-            m = Basemap(projection='cyl', llcrnrlat=Y.min(),
-                        urcrnrlat=Y.max(), llcrnrlon=X.min(),
-                        urcrnrlon=X.max(), resolution='c')
+            m = Basemap(
+                projection="cyl",
+                llcrnrlat=Y.min(),
+                urcrnrlat=Y.max(),
+                llcrnrlon=X.min(),
+                urcrnrlon=X.max(),
+                resolution="c",
+            )
             m.drawcoastlines()
             m.drawcountries()
         else:
             print(" - plot coastlines from coverage")
-            plt.contour(X, Y, land_reference,
-                        levels=[-9998], colors="k",
-                        linestyles="solid")
+            plt.contour(
+                X, Y, land_reference, levels=[-9998], colors="k", linestyles="solid"
+            )
             plt.xticks([])
             plt.yticks([])
 
@@ -200,18 +209,28 @@ def plot_species_distribution(species=("bradypus_variegatus_0",
 
         # plot contours of the prediction
         plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
-        plt.colorbar(format='%.2f')
+        plt.colorbar(format="%.2f")
 
         # scatter training/testing points
-        plt.scatter(species.pts_train['dd long'], species.pts_train['dd lat'],
-                    s=2 ** 2, c='black',
-                    marker='^', label='train')
-        plt.scatter(species.pts_test['dd long'], species.pts_test['dd lat'],
-                    s=2 ** 2, c='black',
-                    marker='x', label='test')
+        plt.scatter(
+            species.pts_train["dd long"],
+            species.pts_train["dd lat"],
+            s=2 ** 2,
+            c="black",
+            marker="^",
+            label="train",
+        )
+        plt.scatter(
+            species.pts_test["dd long"],
+            species.pts_test["dd lat"],
+            s=2 ** 2,
+            c="black",
+            marker="x",
+            label="test",
+        )
         plt.legend()
         plt.title(species.name)
-        plt.axis('equal')
+        plt.axis("equal")
 
         # Compute AUC with regards to background points
         pred_background = Z[background_points[0], background_points[1]]
diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py
index fb6f9b2ec27d8..5116d8939de5d 100644
--- a/examples/applications/plot_stock_market.py
+++ b/examples/applications/plot_stock_market.py
@@ -85,62 +85,63 @@
 # alphavantage.co ones.
 
 symbol_dict = {
-    'TOT': 'Total',
-    'XOM': 'Exxon',
-    'CVX': 'Chevron',
-    'COP': 'ConocoPhillips',
-    'VLO': 'Valero Energy',
-    'MSFT': 'Microsoft',
-    'IBM': 'IBM',
-    'TWX': 'Time Warner',
-    'CMCSA': 'Comcast',
-    'CVC': 'Cablevision',
-    'YHOO': 'Yahoo',
-    'DELL': 'Dell',
-    'HPQ': 'HP',
-    'AMZN': 'Amazon',
-    'TM': 'Toyota',
-    'CAJ': 'Canon',
-    'SNE': 'Sony',
-    'F': 'Ford',
-    'HMC': 'Honda',
-    'NAV': 'Navistar',
-    'NOC': 'Northrop Grumman',
-    'BA': 'Boeing',
-    'KO': 'Coca Cola',
-    'MMM': '3M',
-    'MCD': 'McDonald\'s',
-    'PEP': 'Pepsi',
-    'K': 'Kellogg',
-    'UN': 'Unilever',
-    'MAR': 'Marriott',
-    'PG': 'Procter Gamble',
-    'CL': 'Colgate-Palmolive',
-    'GE': 'General Electrics',
-    'WFC': 'Wells Fargo',
-    'JPM': 'JPMorgan Chase',
-    'AIG': 'AIG',
-    'AXP': 'American express',
-    'BAC': 'Bank of America',
-    'GS': 'Goldman Sachs',
-    'AAPL': 'Apple',
-    'SAP': 'SAP',
-    'CSCO': 'Cisco',
-    'TXN': 'Texas Instruments',
-    'XRX': 'Xerox',
-    'WMT': 'Wal-Mart',
-    'HD': 'Home Depot',
-    'GSK': 'GlaxoSmithKline',
-    'PFE': 'Pfizer',
-    'SNY': 'Sanofi-Aventis',
-    'NVS': 'Novartis',
-    'KMB': 'Kimberly-Clark',
-    'R': 'Ryder',
-    'GD': 'General Dynamics',
-    'RTN': 'Raytheon',
-    'CVS': 'CVS',
-    'CAT': 'Caterpillar',
-    'DD': 'DuPont de Nemours'}
+    "TOT": "Total",
+    "XOM": "Exxon",
+    "CVX": "Chevron",
+    "COP": "ConocoPhillips",
+    "VLO": "Valero Energy",
+    "MSFT": "Microsoft",
+    "IBM": "IBM",
+    "TWX": "Time Warner",
+    "CMCSA": "Comcast",
+    "CVC": "Cablevision",
+    "YHOO": "Yahoo",
+    "DELL": "Dell",
+    "HPQ": "HP",
+    "AMZN": "Amazon",
+    "TM": "Toyota",
+    "CAJ": "Canon",
+    "SNE": "Sony",
+    "F": "Ford",
+    "HMC": "Honda",
+    "NAV": "Navistar",
+    "NOC": "Northrop Grumman",
+    "BA": "Boeing",
+    "KO": "Coca Cola",
+    "MMM": "3M",
+    "MCD": "McDonald's",
+    "PEP": "Pepsi",
+    "K": "Kellogg",
+    "UN": "Unilever",
+    "MAR": "Marriott",
+    "PG": "Procter Gamble",
+    "CL": "Colgate-Palmolive",
+    "GE": "General Electrics",
+    "WFC": "Wells Fargo",
+    "JPM": "JPMorgan Chase",
+    "AIG": "AIG",
+    "AXP": "American express",
+    "BAC": "Bank of America",
+    "GS": "Goldman Sachs",
+    "AAPL": "Apple",
+    "SAP": "SAP",
+    "CSCO": "Cisco",
+    "TXN": "Texas Instruments",
+    "XRX": "Xerox",
+    "WMT": "Wal-Mart",
+    "HD": "Home Depot",
+    "GSK": "GlaxoSmithKline",
+    "PFE": "Pfizer",
+    "SNY": "Sanofi-Aventis",
+    "NVS": "Novartis",
+    "KMB": "Kimberly-Clark",
+    "R": "Ryder",
+    "GD": "General Dynamics",
+    "RTN": "Raytheon",
+    "CVS": "CVS",
+    "CAT": "Caterpillar",
+    "DD": "DuPont de Nemours",
+}
 
 
 symbols, names = np.array(sorted(symbol_dict.items())).T
@@ -148,13 +149,15 @@
 quotes = []
 
 for symbol in symbols:
-    print('Fetching quote history for %r' % symbol, file=sys.stderr)
-    url = ('https://raw.githubusercontent.com/scikit-learn/examples-data/'
-           'master/financial-data/{}.csv')
+    print("Fetching quote history for %r" % symbol, file=sys.stderr)
+    url = (
+        "https://raw.githubusercontent.com/scikit-learn/examples-data/"
+        "master/financial-data/{}.csv"
+    )
     quotes.append(pd.read_csv(url.format(symbol)))
 
-close_prices = np.vstack([q['close'] for q in quotes])
-open_prices = np.vstack([q['open'] for q in quotes])
+close_prices = np.vstack([q["close"] for q in quotes])
+open_prices = np.vstack([q["open"] for q in quotes])
 
 # The daily variations of the quotes are what carry most information
 variation = close_prices - open_prices
@@ -173,12 +176,11 @@
 # #############################################################################
 # Cluster using affinity propagation
 
-_, labels = cluster.affinity_propagation(edge_model.covariance_,
-                                         random_state=0)
+_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
 n_labels = labels.max()
 
 for i in range(n_labels + 1):
-    print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
+    print("Cluster %i: %s" % ((i + 1), ", ".join(names[labels == i])))
 
 # #############################################################################
 # Find a low-dimension embedding for visualization: find the best position of
@@ -188,46 +190,48 @@
 # initiated with random vectors that we don't control). In addition, we
 # use a large number of neighbors to capture the large-scale structure.
 node_position_model = manifold.LocallyLinearEmbedding(
-    n_components=2, eigen_solver='dense', n_neighbors=6)
+    n_components=2, eigen_solver="dense", n_neighbors=6
+)
 
 embedding = node_position_model.fit_transform(X.T).T
 
 # #############################################################################
 # Visualization
-plt.figure(1, facecolor='w', figsize=(10, 8))
+plt.figure(1, facecolor="w", figsize=(10, 8))
 plt.clf()
-ax = plt.axes([0., 0., 1., 1.])
-plt.axis('off')
+ax = plt.axes([0.0, 0.0, 1.0, 1.0])
+plt.axis("off")
 
 # Display a graph of the partial correlations
 partial_correlations = edge_model.precision_.copy()
 d = 1 / np.sqrt(np.diag(partial_correlations))
 partial_correlations *= d
 partial_correlations *= d[:, np.newaxis]
-non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
+non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02
 
 # Plot the nodes using the coordinates of our embedding
-plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
-            cmap=plt.cm.nipy_spectral)
+plt.scatter(
+    embedding[0], embedding[1], s=100 * d ** 2, c=labels, cmap=plt.cm.nipy_spectral
+)
 
 # Plot the edges
 start_idx, end_idx = np.where(non_zero)
 # a sequence of (*line0*, *line1*, *line2*), where::
 #            linen = (x0, y0), (x1, y1), ... (xm, ym)
-segments = [[embedding[:, start], embedding[:, stop]]
-            for start, stop in zip(start_idx, end_idx)]
+segments = [
+    [embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx)
+]
 values = np.abs(partial_correlations[non_zero])
-lc = LineCollection(segments,
-                    zorder=0, cmap=plt.cm.hot_r,
-                    norm=plt.Normalize(0, .7 * values.max()))
+lc = LineCollection(
+    segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, 0.7 * values.max())
+)
 lc.set_array(values)
 lc.set_linewidths(15 * values)
 ax.add_collection(lc)
 
 # Add a label to each node. The challenge here is that we want to
 # position the labels to avoid overlap with other labels
-for index, (name, label, (x, y)) in enumerate(
-        zip(names, labels, embedding.T)):
+for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)):
 
     dx = x - embedding[0]
     dx[index] = 1
@@ -236,27 +240,38 @@
     this_dx = dx[np.argmin(np.abs(dy))]
     this_dy = dy[np.argmin(np.abs(dx))]
     if this_dx > 0:
-        horizontalalignment = 'left'
-        x = x + .002
+        horizontalalignment = "left"
+        x = x + 0.002
     else:
-        horizontalalignment = 'right'
-        x = x - .002
+        horizontalalignment = "right"
+        x = x - 0.002
     if this_dy > 0:
-        verticalalignment = 'bottom'
-        y = y + .002
+        verticalalignment = "bottom"
+        y = y + 0.002
     else:
-        verticalalignment = 'top'
-        y = y - .002
-    plt.text(x, y, name, size=10,
-             horizontalalignment=horizontalalignment,
-             verticalalignment=verticalalignment,
-             bbox=dict(facecolor='w',
-                       edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
-                       alpha=.6))
-
-plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
-         embedding[0].max() + .10 * embedding[0].ptp(),)
-plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
-         embedding[1].max() + .03 * embedding[1].ptp())
+        verticalalignment = "top"
+        y = y - 0.002
+    plt.text(
+        x,
+        y,
+        name,
+        size=10,
+        horizontalalignment=horizontalalignment,
+        verticalalignment=verticalalignment,
+        bbox=dict(
+            facecolor="w",
+            edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
+            alpha=0.6,
+        ),
+    )
+
+plt.xlim(
+    embedding[0].min() - 0.15 * embedding[0].ptp(),
+    embedding[0].max() + 0.10 * embedding[0].ptp(),
+)
+plt.ylim(
+    embedding[1].min() - 0.03 * embedding[1].ptp(),
+    embedding[1].max() + 0.03 * embedding[1].ptp(),
+)
 
 plt.show()
diff --git a/examples/applications/plot_tomography_l1_reconstruction.py b/examples/applications/plot_tomography_l1_reconstruction.py
index 0c218c1b0fa4a..8e96dbff3dafb 100644
--- a/examples/applications/plot_tomography_l1_reconstruction.py
+++ b/examples/applications/plot_tomography_l1_reconstruction.py
@@ -57,14 +57,14 @@ def _weights(x, dx=1, orig=0):
 
 def _generate_center_coordinates(l_x):
     X, Y = np.mgrid[:l_x, :l_x].astype(np.float64)
-    center = l_x / 2.
+    center = l_x / 2.0
     X += 0.5 - center
     Y += 0.5 - center
     return X, Y
 
 
 def build_projection_operator(l_x, n_dir):
-    """ Compute the tomography design matrix.
+    """Compute the tomography design matrix.
 
     Parameters
     ----------
@@ -83,8 +83,7 @@ def build_projection_operator(l_x, n_dir):
     angles = np.linspace(0, np.pi, n_dir, endpoint=False)
     data_inds, weights, camera_inds = [], [], []
     data_unravel_indices = np.arange(l_x ** 2)
-    data_unravel_indices = np.hstack((data_unravel_indices,
-                                      data_unravel_indices))
+    data_unravel_indices = np.hstack((data_unravel_indices, data_unravel_indices))
     for i, angle in enumerate(angles):
         Xrot = np.cos(angle) * X - np.sin(angle) * Y
         inds, w = _weights(Xrot, dx=1, orig=X.min())
@@ -97,11 +96,11 @@ def build_projection_operator(l_x, n_dir):
 
 
 def generate_synthetic_data():
-    """ Synthetic binary data """
+    """Synthetic binary data"""
     rs = np.random.RandomState(0)
     n_pts = 36
     x, y = np.ogrid[0:l, 0:l]
-    mask_outer = (x - l / 2.) ** 2 + (y - l / 2.) ** 2 < (l / 2.) ** 2
+    mask_outer = (x - l / 2.0) ** 2 + (y - l / 2.0) ** 2 < (l / 2.0) ** 2
     mask = np.zeros((l, l))
     points = l * rs.rand(2, n_pts)
     mask[(points[0]).astype(int), (points[1]).astype(int)] = 1
@@ -131,19 +130,18 @@ def generate_synthetic_data():
 
 plt.figure(figsize=(8, 3.3))
 plt.subplot(131)
-plt.imshow(data, cmap=plt.cm.gray, interpolation='nearest')
-plt.axis('off')
-plt.title('original image')
+plt.imshow(data, cmap=plt.cm.gray, interpolation="nearest")
+plt.axis("off")
+plt.title("original image")
 plt.subplot(132)
-plt.imshow(rec_l2, cmap=plt.cm.gray, interpolation='nearest')
-plt.title('L2 penalization')
-plt.axis('off')
+plt.imshow(rec_l2, cmap=plt.cm.gray, interpolation="nearest")
+plt.title("L2 penalization")
+plt.axis("off")
 plt.subplot(133)
-plt.imshow(rec_l1, cmap=plt.cm.gray, interpolation='nearest')
-plt.title('L1 penalization')
-plt.axis('off')
+plt.imshow(rec_l1, cmap=plt.cm.gray, interpolation="nearest")
+plt.title("L1 penalization")
+plt.axis("off")
 
-plt.subplots_adjust(hspace=0.01, wspace=0.01, top=1, bottom=0, left=0,
-                    right=1)
+plt.subplots_adjust(hspace=0.01, wspace=0.01, top=1, bottom=0, left=0, right=1)
 
 plt.show()
diff --git a/examples/applications/plot_topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py
index c677fa3b6650a..48b69d710226b 100644
--- a/examples/applications/plot_topics_extraction_with_nmf_lda.py
+++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py
@@ -43,17 +43,16 @@ def plot_top_words(model, feature_names, n_top_words, title):
     fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
     axes = axes.flatten()
     for topic_idx, topic in enumerate(model.components_):
-        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
+        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
         top_features = [feature_names[i] for i in top_features_ind]
         weights = topic[top_features_ind]
 
         ax = axes[topic_idx]
         ax.barh(top_features, weights, height=0.7)
-        ax.set_title(f'Topic {topic_idx +1}',
-                     fontdict={'fontsize': 30})
+        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
         ax.invert_yaxis()
-        ax.tick_params(axis='both', which='major', labelsize=20)
-        for i in 'top right left'.split():
+        ax.tick_params(axis="both", which="major", labelsize=20)
+        for i in "top right left".split():
             ax.spines[i].set_visible(False)
         fig.suptitle(title, fontsize=40)
 
@@ -68,69 +67,91 @@ def plot_top_words(model, feature_names, n_top_words, title):
 
 print("Loading dataset...")
 t0 = time()
-data, _ = fetch_20newsgroups(shuffle=True, random_state=1,
-                             remove=('headers', 'footers', 'quotes'),
-                             return_X_y=True)
+data, _ = fetch_20newsgroups(
+    shuffle=True,
+    random_state=1,
+    remove=("headers", "footers", "quotes"),
+    return_X_y=True,
+)
 data_samples = data[:n_samples]
 print("done in %0.3fs." % (time() - t0))
 
 # Use tf-idf features for NMF.
 print("Extracting tf-idf features for NMF...")
-tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
-                                   max_features=n_features,
-                                   stop_words='english')
+tfidf_vectorizer = TfidfVectorizer(
+    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
+)
 t0 = time()
 tfidf = tfidf_vectorizer.fit_transform(data_samples)
 print("done in %0.3fs." % (time() - t0))
 
 # Use tf (raw term count) features for LDA.
 print("Extracting tf features for LDA...")
-tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
-                                max_features=n_features,
-                                stop_words='english')
+tf_vectorizer = CountVectorizer(
+    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
+)
 t0 = time()
 tf = tf_vectorizer.fit_transform(data_samples)
 print("done in %0.3fs." % (time() - t0))
 print()
 
 # Fit the NMF model
-print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
-      "n_samples=%d and n_features=%d..."
-      % (n_samples, n_features))
+print(
+    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
+    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
+)
 t0 = time()
-nmf = NMF(n_components=n_components, random_state=1,
-          alpha=.1, l1_ratio=.5).fit(tfidf)
+nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, l1_ratio=0.5).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
 
 
 tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
-plot_top_words(nmf, tfidf_feature_names, n_top_words,
-               'Topics in NMF model (Frobenius norm)')
+plot_top_words(
+    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
+)
 
 # Fit the NMF model
-print('\n' * 2, "Fitting the NMF model (generalized Kullback-Leibler "
-      "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
-      % (n_samples, n_features))
+print(
+    "\n" * 2,
+    "Fitting the NMF model (generalized Kullback-Leibler "
+    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
+    % (n_samples, n_features),
+)
 t0 = time()
-nmf = NMF(n_components=n_components, random_state=1,
-          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
-          l1_ratio=.5).fit(tfidf)
+nmf = NMF(
+    n_components=n_components,
+    random_state=1,
+    beta_loss="kullback-leibler",
+    solver="mu",
+    max_iter=1000,
+    alpha=0.1,
+    l1_ratio=0.5,
+).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
 
 tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
-plot_top_words(nmf, tfidf_feature_names, n_top_words,
-               'Topics in NMF model (generalized Kullback-Leibler divergence)')
-
-print('\n' * 2, "Fitting LDA models with tf features, "
-      "n_samples=%d and n_features=%d..."
-      % (n_samples, n_features))
-lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
-                                learning_method='online',
-                                learning_offset=50.,
-                                random_state=0)
+plot_top_words(
+    nmf,
+    tfidf_feature_names,
+    n_top_words,
+    "Topics in NMF model (generalized Kullback-Leibler divergence)",
+)
+
+print(
+    "\n" * 2,
+    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
+    % (n_samples, n_features),
+)
+lda = LatentDirichletAllocation(
+    n_components=n_components,
+    max_iter=5,
+    learning_method="online",
+    learning_offset=50.0,
+    random_state=0,
+)
 t0 = time()
 lda.fit(tf)
 print("done in %0.3fs." % (time() - t0))
 
 tf_feature_names = tf_vectorizer.get_feature_names_out()
-plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')
+plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")
diff --git a/examples/applications/svm_gui.py b/examples/applications/svm_gui.py
index d9065b0fe8db4..f480b36ad1d94 100644
--- a/examples/applications/svm_gui.py
+++ b/examples/applications/svm_gui.py
@@ -21,14 +21,16 @@
 # License: BSD 3 clause
 
 import matplotlib
-matplotlib.use('TkAgg')
+
+matplotlib.use("TkAgg")
 from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
+
 try:
     from matplotlib.backends.backend_tkagg import NavigationToolbar2Tk
 except ImportError:
     # NavigationToolbar2TkAgg was deprecated in matplotlib 2.2
     from matplotlib.backends.backend_tkagg import (
-        NavigationToolbar2TkAgg as NavigationToolbar2Tk
+        NavigationToolbar2TkAgg as NavigationToolbar2Tk,
     )
 from matplotlib.figure import Figure
 from matplotlib.contour import ContourSet
@@ -58,12 +60,12 @@ def __init__(self):
         self.surface_type = 0
 
     def changed(self, event):
-        """Notify the observers. """
+        """Notify the observers."""
         for observer in self.observers:
             observer.update(event, self)
 
     def add_observer(self, observer):
-        """Register an observer. """
+        """Register an observer."""
         self.observers.append(observer)
 
     def set_surface(self, surface):
@@ -96,14 +98,23 @@ def fit(self):
         degree = int(self.degree.get())
         kernel_map = {0: "linear", 1: "rbf", 2: "poly"}
         if len(np.unique(y)) == 1:
-            clf = svm.OneClassSVM(kernel=kernel_map[self.kernel.get()],
-                                  gamma=gamma, coef0=coef0, degree=degree)
+            clf = svm.OneClassSVM(
+                kernel=kernel_map[self.kernel.get()],
+                gamma=gamma,
+                coef0=coef0,
+                degree=degree,
+            )
             clf.fit(X)
         else:
-            clf = svm.SVC(kernel=kernel_map[self.kernel.get()], C=C,
-                          gamma=gamma, coef0=coef0, degree=degree)
+            clf = svm.SVC(
+                kernel=kernel_map[self.kernel.get()],
+                C=C,
+                gamma=gamma,
+                coef0=coef0,
+                degree=degree,
+            )
             clf.fit(X, y)
-        if hasattr(clf, 'score'):
+        if hasattr(clf, "score"):
             print("Accuracy:", clf.score(X, y) * 100)
         X1, X2, Z = self.decision_surface(clf)
         self.model.clf = clf
@@ -134,13 +145,13 @@ def add_example(self, x, y, label):
         self.refit()
 
     def refit(self):
-        """Refit the model if already fitted. """
+        """Refit the model if already fitted."""
         if self.fitted:
             self.fit()
 
 
 class View:
-    """Test docstring. """
+    """Test docstring."""
 
     def __init__(self, root, controller):
         f = Figure()
@@ -157,7 +168,7 @@ def __init__(self, root, controller):
             canvas.show()
         canvas.get_tk_widget().pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)
         canvas._tkcanvas.pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)
-        canvas.mpl_connect('button_press_event', self.onclick)
+        canvas.mpl_connect("button_press_event", self.onclick)
         toolbar = NavigationToolbar2Tk(canvas, root)
         toolbar.update()
         self.controllbar = ControllBar(root, controller)
@@ -184,9 +195,9 @@ def onclick(self, event):
     def update_example(self, model, idx):
         x, y, l = model.data[idx]
         if l == 1:
-            color = 'w'
+            color = "w"
         elif l == -1:
-            color = 'k'
+            color = "k"
         self.ax.plot([x], [y], "%so" % color, scalex=0.0, scaley=0.0)
 
     def update(self, event, model):
@@ -227,25 +238,33 @@ def plot_support_vectors(self, support_vectors):
         """Plot the support vectors by placing circles over the
         corresponding data points and adds the circle collection
         to the contours list."""
-        cs = self.ax.scatter(support_vectors[:, 0], support_vectors[:, 1],
-                             s=80, edgecolors="k", facecolors="none")
+        cs = self.ax.scatter(
+            support_vectors[:, 0],
+            support_vectors[:, 1],
+            s=80,
+            edgecolors="k",
+            facecolors="none",
+        )
         self.contours.append(cs)
 
     def plot_decision_surface(self, surface, type):
         X1, X2, Z = surface
         if type == 0:
             levels = [-1.0, 0.0, 1.0]
-            linestyles = ['dashed', 'solid', 'dashed']
-            colors = 'k'
-            self.contours.append(self.ax.contour(X1, X2, Z, levels,
-                                                 colors=colors,
-                                                 linestyles=linestyles))
+            linestyles = ["dashed", "solid", "dashed"]
+            colors = "k"
+            self.contours.append(
+                self.ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
+            )
         elif type == 1:
-            self.contours.append(self.ax.contourf(X1, X2, Z, 10,
-                                                  cmap=matplotlib.cm.bone,
-                                                  origin='lower', alpha=0.85))
-            self.contours.append(self.ax.contour(X1, X2, Z, [0.0], colors='k',
-                                                 linestyles=['solid']))
+            self.contours.append(
+                self.ax.contourf(
+                    X1, X2, Z, 10, cmap=matplotlib.cm.bone, origin="lower", alpha=0.85
+                )
+            )
+            self.contours.append(
+                self.ax.contour(X1, X2, Z, [0.0], colors="k", linestyles=["solid"])
+            )
         else:
             raise ValueError("surface type unknown")
 
@@ -254,12 +273,27 @@ class ControllBar:
     def __init__(self, root, controller):
         fm = Tk.Frame(root)
         kernel_group = Tk.Frame(fm)
-        Tk.Radiobutton(kernel_group, text="Linear", variable=controller.kernel,
-                       value=0, command=controller.refit).pack(anchor=Tk.W)
-        Tk.Radiobutton(kernel_group, text="RBF", variable=controller.kernel,
-                       value=1, command=controller.refit).pack(anchor=Tk.W)
-        Tk.Radiobutton(kernel_group, text="Poly", variable=controller.kernel,
-                       value=2, command=controller.refit).pack(anchor=Tk.W)
+        Tk.Radiobutton(
+            kernel_group,
+            text="Linear",
+            variable=controller.kernel,
+            value=0,
+            command=controller.refit,
+        ).pack(anchor=Tk.W)
+        Tk.Radiobutton(
+            kernel_group,
+            text="RBF",
+            variable=controller.kernel,
+            value=1,
+            command=controller.refit,
+        ).pack(anchor=Tk.W)
+        Tk.Radiobutton(
+            kernel_group,
+            text="Poly",
+            variable=controller.kernel,
+            value=2,
+            command=controller.refit,
+        ).pack(anchor=Tk.W)
         kernel_group.pack(side=Tk.LEFT)
 
         valbox = Tk.Frame(fm)
@@ -267,8 +301,7 @@ def __init__(self, root, controller):
         controller.complexity.set("1.0")
         c = Tk.Frame(valbox)
         Tk.Label(c, text="C:", anchor="e", width=7).pack(side=Tk.LEFT)
-        Tk.Entry(c, width=6, textvariable=controller.complexity).pack(
-            side=Tk.LEFT)
+        Tk.Entry(c, width=6, textvariable=controller.complexity).pack(side=Tk.LEFT)
         c.pack()
 
         controller.gamma = Tk.StringVar()
@@ -294,29 +327,42 @@ def __init__(self, root, controller):
         valbox.pack(side=Tk.LEFT)
 
         cmap_group = Tk.Frame(fm)
-        Tk.Radiobutton(cmap_group, text="Hyperplanes",
-                       variable=controller.surface_type, value=0,
-                       command=controller.refit).pack(anchor=Tk.W)
-        Tk.Radiobutton(cmap_group, text="Surface",
-                       variable=controller.surface_type, value=1,
-                       command=controller.refit).pack(anchor=Tk.W)
+        Tk.Radiobutton(
+            cmap_group,
+            text="Hyperplanes",
+            variable=controller.surface_type,
+            value=0,
+            command=controller.refit,
+        ).pack(anchor=Tk.W)
+        Tk.Radiobutton(
+            cmap_group,
+            text="Surface",
+            variable=controller.surface_type,
+            value=1,
+            command=controller.refit,
+        ).pack(anchor=Tk.W)
 
         cmap_group.pack(side=Tk.LEFT)
 
-        train_button = Tk.Button(fm, text='Fit', width=5,
-                                 command=controller.fit)
+        train_button = Tk.Button(fm, text="Fit", width=5, command=controller.fit)
         train_button.pack()
         fm.pack(side=Tk.LEFT)
-        Tk.Button(fm, text='Clear', width=5,
-                  command=controller.clear_data).pack(side=Tk.LEFT)
+        Tk.Button(fm, text="Clear", width=5, command=controller.clear_data).pack(
+            side=Tk.LEFT
+        )
 
 
 def get_parser():
     from optparse import OptionParser
+
     op = OptionParser()
-    op.add_option("--output",
-                  action="store", type="str", dest="output",
-                  help="Path where to dump data.")
+    op.add_option(
+        "--output",
+        action="store",
+        type="str",
+        dest="output",
+        help="Path where to dump data.",
+    )
     return op
 
 
diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py
index 1a148497af1b4..1d86076431ed8 100644
--- a/examples/applications/wikipedia_principal_eigenvector.py
+++ b/examples/applications/wikipedia_principal_eigenvector.py
@@ -65,7 +65,7 @@
     if not os.path.exists(filename):
         print("Downloading data from '%s', please wait..." % url)
         opener = urlopen(url)
-        open(filename, 'wb').write(opener.read())
+        open(filename, "wb").write(opener.read())
         print()
 
 
@@ -163,7 +163,8 @@ def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):
 
 # stop after 5M links to make it possible to work in RAM
 X, redirects, index_map = get_adjacency_matrix(
-    redirects_filename, page_links_filename, limit=5000000)
+    redirects_filename, page_links_filename, limit=5000000
+)
 names = {i: name for name, i in index_map.items()}
 
 print("Computing the principal singular vectors using randomized_svd")
@@ -195,16 +196,17 @@ def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
 
     print("Normalizing the graph")
     for i in incoming_counts.nonzero()[0]:
-        X.data[X.indptr[i]:X.indptr[i + 1]] *= 1.0 / incoming_counts[i]
-    dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0),
-                                 1.0 / n, 0)).ravel()
+        X.data[X.indptr[i] : X.indptr[i + 1]] *= 1.0 / incoming_counts[i]
+    dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0), 1.0 / n, 0)).ravel()
 
-    scores = np.full(n, 1. / n, dtype=np.float32)  # initial guess
+    scores = np.full(n, 1.0 / n, dtype=np.float32)  # initial guess
     for i in range(max_iter):
         print("power iteration #%d" % i)
         prev_scores = scores
-        scores = (alpha * (scores * X + np.dot(dangle, prev_scores))
-                  + (1 - alpha) * prev_scores.sum() / n)
+        scores = (
+            alpha * (scores * X + np.dot(dangle, prev_scores))
+            + (1 - alpha) * prev_scores.sum() / n
+        )
         # check convergence: normalized l_inf norm
         scores_max = np.abs(scores).max()
         if scores_max == 0.0:
diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py
index c01807e345928..eb2c0c8dafb50 100644
--- a/examples/bicluster/plot_bicluster_newsgroups.py
+++ b/examples/bicluster/plot_bicluster_newsgroups.py
@@ -38,7 +38,7 @@
 
 
 def number_normalizer(tokens):
-    """ Map all numeric tokens to a placeholder.
+    """Map all numeric tokens to a placeholder.
 
     For many applications, tokens that begin with a number are not directly
     useful, but the fact that such a token exists can be relevant.  By applying
@@ -54,22 +54,35 @@ def build_tokenizer(self):
 
 
 # exclude 'comp.os.ms-windows.misc'
-categories = ['alt.atheism', 'comp.graphics',
-              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
-              'comp.windows.x', 'misc.forsale', 'rec.autos',
-              'rec.motorcycles', 'rec.sport.baseball',
-              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
-              'sci.med', 'sci.space', 'soc.religion.christian',
-              'talk.politics.guns', 'talk.politics.mideast',
-              'talk.politics.misc', 'talk.religion.misc']
+categories = [
+    "alt.atheism",
+    "comp.graphics",
+    "comp.sys.ibm.pc.hardware",
+    "comp.sys.mac.hardware",
+    "comp.windows.x",
+    "misc.forsale",
+    "rec.autos",
+    "rec.motorcycles",
+    "rec.sport.baseball",
+    "rec.sport.hockey",
+    "sci.crypt",
+    "sci.electronics",
+    "sci.med",
+    "sci.space",
+    "soc.religion.christian",
+    "talk.politics.guns",
+    "talk.politics.mideast",
+    "talk.politics.misc",
+    "talk.religion.misc",
+]
 newsgroups = fetch_20newsgroups(categories=categories)
 y_true = newsgroups.target
 
-vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
-cocluster = SpectralCoclustering(n_clusters=len(categories),
-                                 svd_method='arpack', random_state=0)
-kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
-                         random_state=0)
+vectorizer = NumberNormalizingVectorizer(stop_words="english", min_df=5)
+cocluster = SpectralCoclustering(
+    n_clusters=len(categories), svd_method="arpack", random_state=0
+)
+kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000, random_state=0)
 
 print("Vectorizing...")
 X = vectorizer.fit_transform(newsgroups.data)
@@ -78,16 +91,20 @@ def build_tokenizer(self):
 start_time = time()
 cocluster.fit(X)
 y_cocluster = cocluster.row_labels_
-print("Done in {:.2f}s. V-measure: {:.4f}".format(
-    time() - start_time,
-    v_measure_score(y_cocluster, y_true)))
+print(
+    "Done in {:.2f}s. V-measure: {:.4f}".format(
+        time() - start_time, v_measure_score(y_cocluster, y_true)
+    )
+)
 
 print("MiniBatchKMeans...")
 start_time = time()
 y_kmeans = kmeans.fit_predict(X)
-print("Done in {:.2f}s. V-measure: {:.4f}".format(
-    time() - start_time,
-    v_measure_score(y_kmeans, y_true)))
+print(
+    "Done in {:.2f}s. V-measure: {:.4f}".format(
+        time() - start_time, v_measure_score(y_kmeans, y_true)
+    )
+)
 
 feature_names = vectorizer.get_feature_names_out()
 document_names = list(newsgroups.target_names[i] for i in newsgroups.target)
@@ -97,14 +114,14 @@ def bicluster_ncut(i):
     rows, cols = cocluster.get_indices(i)
     if not (np.any(rows) and np.any(cols)):
         import sys
+
         return sys.float_info.max
     row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]
     col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]
     # Note: the following is identical to X[rows[:, np.newaxis],
     # cols].sum() but much faster in scipy <= 0.16
     weight = X[rows][:, cols].sum()
-    cut = (X[row_complement][:, cols].sum() +
-           X[rows][:, col_complement].sum())
+    cut = X[row_complement][:, cols].sum() + X[rows][:, col_complement].sum()
     return cut / weight
 
 
@@ -116,8 +133,7 @@ def most_common(d):
     return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
 
 
-bicluster_ncuts = list(bicluster_ncut(i)
-                       for i in range(len(newsgroups.target_names)))
+bicluster_ncuts = list(bicluster_ncut(i) for i in range(len(newsgroups.target_names)))
 best_idx = np.argsort(bicluster_ncuts)[:5]
 
 print()
@@ -133,20 +149,24 @@ def most_common(d):
     counter = defaultdict(int)
     for i in cluster_docs:
         counter[document_names[i]] += 1
-    cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name)
-                           for name, c in most_common(counter)[:3])
+    cat_string = ", ".join(
+        "{:.0f}% {}".format(float(c) / n_rows * 100, name)
+        for name, c in most_common(counter)[:3]
+    )
 
     # words
     out_of_cluster_docs = cocluster.row_labels_ != cluster
     out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
     word_col = X[:, cluster_words]
-    word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
-                           word_col[out_of_cluster_docs, :].sum(axis=0))
+    word_scores = np.array(
+        word_col[cluster_docs, :].sum(axis=0)
+        - word_col[out_of_cluster_docs, :].sum(axis=0)
+    )
     word_scores = word_scores.ravel()
-    important_words = list(feature_names[cluster_words[i]]
-                           for i in word_scores.argsort()[:-11:-1])
+    important_words = list(
+        feature_names[cluster_words[i]] for i in word_scores.argsort()[:-11:-1]
+    )
 
-    print("bicluster {} : {} documents, {} words".format(
-        idx, n_rows, n_cols))
+    print("bicluster {} : {} documents, {} words".format(idx, n_rows, n_cols))
     print("categories   : {}".format(cat_string))
-    print("words        : {}\n".format(', '.join(important_words)))
+    print("words        : {}\n".format(", ".join(important_words)))
diff --git a/examples/bicluster/plot_spectral_biclustering.py b/examples/bicluster/plot_spectral_biclustering.py
index abc63879a8420..754853749a784 100644
--- a/examples/bicluster/plot_spectral_biclustering.py
+++ b/examples/bicluster/plot_spectral_biclustering.py
@@ -30,8 +30,8 @@
 
 n_clusters = (4, 3)
 data, rows, columns = make_checkerboard(
-    shape=(300, 300), n_clusters=n_clusters, noise=10,
-    shuffle=False, random_state=0)
+    shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=0
+)
 
 plt.matshow(data, cmap=plt.cm.Blues)
 plt.title("Original dataset")
@@ -45,11 +45,9 @@
 plt.matshow(data, cmap=plt.cm.Blues)
 plt.title("Shuffled dataset")
 
-model = SpectralBiclustering(n_clusters=n_clusters, method='log',
-                             random_state=0)
+model = SpectralBiclustering(n_clusters=n_clusters, method="log", random_state=0)
 model.fit(data)
-score = consensus_score(model.biclusters_,
-                        (rows[:, row_idx], columns[:, col_idx]))
+score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))
 
 print("consensus score: {:.1f}".format(score))
 
@@ -59,9 +57,10 @@
 plt.matshow(fit_data, cmap=plt.cm.Blues)
 plt.title("After biclustering; rearranged to show biclusters")
 
-plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
-                     np.sort(model.column_labels_) + 1),
-            cmap=plt.cm.Blues)
+plt.matshow(
+    np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1),
+    cmap=plt.cm.Blues,
+)
 plt.title("Checkerboard structure of rearranged data")
 
 plt.show()
diff --git a/examples/bicluster/plot_spectral_coclustering.py b/examples/bicluster/plot_spectral_coclustering.py
index 0681d51e0bfd3..26494bf4f1c4f 100644
--- a/examples/bicluster/plot_spectral_coclustering.py
+++ b/examples/bicluster/plot_spectral_coclustering.py
@@ -27,8 +27,8 @@
 from sklearn.metrics import consensus_score
 
 data, rows, columns = make_biclusters(
-    shape=(300, 300), n_clusters=5, noise=5,
-    shuffle=False, random_state=0)
+    shape=(300, 300), n_clusters=5, noise=5, shuffle=False, random_state=0
+)
 
 plt.matshow(data, cmap=plt.cm.Blues)
 plt.title("Original dataset")
@@ -44,8 +44,7 @@
 
 model = SpectralCoclustering(n_clusters=5, random_state=0)
 model.fit(data)
-score = consensus_score(model.biclusters_,
-                        (rows[:, row_idx], columns[:, col_idx]))
+score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))
 
 print("consensus score: {:.3f}".format(score))
 
diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py
index e8d8d5184a178..c1e1acea0c7c9 100644
--- a/examples/calibration/plot_calibration.py
+++ b/examples/calibration/plot_calibration.py
@@ -47,16 +47,16 @@
 # half positive samples and half negative samples. Probability in this
 # blob is therefore 0.5.
 centers = [(-5, -5), (0, 0), (5, 5)]
-X, y = make_blobs(n_samples=n_samples, centers=centers, shuffle=False,
-                  random_state=42)
+X, y = make_blobs(n_samples=n_samples, centers=centers, shuffle=False, random_state=42)
 
-y[:n_samples // 2] = 0
-y[n_samples // 2:] = 1
+y[: n_samples // 2] = 0
+y[n_samples // 2 :] = 1
 sample_weight = np.random.RandomState(42).rand(y.shape[0])
 
 # split train, test for calibration
-X_train, X_test, y_train, y_test, sw_train, sw_test = \
-    train_test_split(X, y, sample_weight, test_size=0.9, random_state=42)
+X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
+    X, y, sample_weight, test_size=0.9, random_state=42
+)
 
 # Gaussian Naive-Bayes with no calibration
 clf = GaussianNB()
@@ -64,12 +64,12 @@
 prob_pos_clf = clf.predict_proba(X_test)[:, 1]
 
 # Gaussian Naive-Bayes with isotonic calibration
-clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic')
+clf_isotonic = CalibratedClassifierCV(clf, cv=2, method="isotonic")
 clf_isotonic.fit(X_train, y_train, sample_weight=sw_train)
 prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1]
 
 # Gaussian Naive-Bayes with sigmoid calibration
-clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
+clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method="sigmoid")
 clf_sigmoid.fit(X_train, y_train, sample_weight=sw_train)
 prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1]
 
@@ -78,12 +78,10 @@
 clf_score = brier_score_loss(y_test, prob_pos_clf, sample_weight=sw_test)
 print("No calibration: %1.3f" % clf_score)
 
-clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic,
-                                      sample_weight=sw_test)
+clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic, sample_weight=sw_test)
 print("With isotonic calibration: %1.3f" % clf_isotonic_score)
 
-clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid,
-                                     sample_weight=sw_test)
+clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sample_weight=sw_test)
 print("With sigmoid calibration: %1.3f" % clf_sigmoid_score)
 
 # #############################################################################
@@ -94,26 +92,42 @@
 for this_y, color in zip(y_unique, colors):
     this_X = X_train[y_train == this_y]
     this_sw = sw_train[y_train == this_y]
-    plt.scatter(this_X[:, 0], this_X[:, 1], s=this_sw * 50,
-                c=color[np.newaxis, :],
-                alpha=0.5, edgecolor='k',
-                label="Class %s" % this_y)
+    plt.scatter(
+        this_X[:, 0],
+        this_X[:, 1],
+        s=this_sw * 50,
+        c=color[np.newaxis, :],
+        alpha=0.5,
+        edgecolor="k",
+        label="Class %s" % this_y,
+    )
 plt.legend(loc="best")
 plt.title("Data")
 
 plt.figure()
-order = np.lexsort((prob_pos_clf, ))
-plt.plot(prob_pos_clf[order], 'r', label='No calibration (%1.3f)' % clf_score)
-plt.plot(prob_pos_isotonic[order], 'g', linewidth=3,
-         label='Isotonic calibration (%1.3f)' % clf_isotonic_score)
-plt.plot(prob_pos_sigmoid[order], 'b', linewidth=3,
-         label='Sigmoid calibration (%1.3f)' % clf_sigmoid_score)
-plt.plot(np.linspace(0, y_test.size, 51)[1::2],
-         y_test[order].reshape(25, -1).mean(1),
-         'k', linewidth=3, label=r'Empirical')
+order = np.lexsort((prob_pos_clf,))
+plt.plot(prob_pos_clf[order], "r", label="No calibration (%1.3f)" % clf_score)
+plt.plot(
+    prob_pos_isotonic[order],
+    "g",
+    linewidth=3,
+    label="Isotonic calibration (%1.3f)" % clf_isotonic_score,
+)
+plt.plot(
+    prob_pos_sigmoid[order],
+    "b",
+    linewidth=3,
+    label="Sigmoid calibration (%1.3f)" % clf_sigmoid_score,
+)
+plt.plot(
+    np.linspace(0, y_test.size, 51)[1::2],
+    y_test[order].reshape(25, -1).mean(1),
+    "k",
+    linewidth=3,
+    label=r"Empirical",
+)
 plt.ylim([-0.05, 1.05])
-plt.xlabel("Instances sorted according to predicted probability "
-           "(uncalibrated GNB)")
+plt.xlabel("Instances sorted according to predicted probability (uncalibrated GNB)")
 plt.ylabel("P(y=1)")
 plt.legend(loc="upper left")
 plt.title("Gaussian naive Bayes probabilities")
diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py
index d4bfda5a3a55d..c52cff1a858b0 100644
--- a/examples/calibration/plot_calibration_curve.py
+++ b/examples/calibration/plot_calibration_curve.py
@@ -29,11 +29,13 @@
 from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
 
-X, y = make_classification(n_samples=100_000, n_features=20, n_informative=2,
-                           n_redundant=10, random_state=42)
+X, y = make_classification(
+    n_samples=100_000, n_features=20, n_informative=2, n_redundant=10, random_state=42
+)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99,
-                                                    random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.99, random_state=42
+)
 
 # %%
 # Calibration curves
@@ -62,33 +64,40 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.naive_bayes import GaussianNB
 
-lr = LogisticRegression(C=1.)
+lr = LogisticRegression(C=1.0)
 gnb = GaussianNB()
-gnb_isotonic = CalibratedClassifierCV(gnb, cv=2, method='isotonic')
-gnb_sigmoid = CalibratedClassifierCV(gnb, cv=2, method='sigmoid')
+gnb_isotonic = CalibratedClassifierCV(gnb, cv=2, method="isotonic")
+gnb_sigmoid = CalibratedClassifierCV(gnb, cv=2, method="sigmoid")
 
-clf_list = [(lr, 'Logistic'),
-            (gnb, 'Naive Bayes'),
-            (gnb_isotonic, 'Naive Bayes + Isotonic'),
-            (gnb_sigmoid, 'Naive Bayes + Sigmoid')]
+clf_list = [
+    (lr, "Logistic"),
+    (gnb, "Naive Bayes"),
+    (gnb_isotonic, "Naive Bayes + Isotonic"),
+    (gnb_sigmoid, "Naive Bayes + Sigmoid"),
+]
 
 # %%
 fig = plt.figure(figsize=(10, 10))
 gs = GridSpec(4, 2)
-colors = plt.cm.get_cmap('Dark2')
+colors = plt.cm.get_cmap("Dark2")
 
 ax_calibration_curve = fig.add_subplot(gs[:2, :2])
 calibration_displays = {}
 for i, (clf, name) in enumerate(clf_list):
     clf.fit(X_train, y_train)
     display = CalibrationDisplay.from_estimator(
-        clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve,
-        color=colors(i)
+        clf,
+        X_test,
+        y_test,
+        n_bins=10,
+        name=name,
+        ax=ax_calibration_curve,
+        color=colors(i),
     )
     calibration_displays[name] = display
 
 ax_calibration_curve.grid()
-ax_calibration_curve.set_title('Calibration plots (Naive Bayes)')
+ax_calibration_curve.set_title("Calibration plots (Naive Bayes)")
 
 # Add histogram
 grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
@@ -97,8 +106,11 @@
     ax = fig.add_subplot(gs[row, col])
 
     ax.hist(
-        calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name,
-        color=colors(i)
+        calibration_displays[name].y_prob,
+        range=(0, 1),
+        bins=10,
+        label=name,
+        color=colors(i),
     )
     ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")
 
@@ -128,8 +140,14 @@
 
 import pandas as pd
 
-from sklearn.metrics import (precision_score, recall_score, f1_score,
-                             brier_score_loss, log_loss, roc_auc_score)
+from sklearn.metrics import (
+    precision_score,
+    recall_score,
+    f1_score,
+    brier_score_loss,
+    log_loss,
+    roc_auc_score,
+)
 
 scores = defaultdict(list)
 for i, (clf, name) in enumerate(clf_list):
@@ -204,15 +222,17 @@ def predict_proba(self, X):
 
 # %%
 
-lr = LogisticRegression(C=1.)
+lr = LogisticRegression(C=1.0)
 svc = NaivelyCalibratedLinearSVC(max_iter=10_000)
-svc_isotonic = CalibratedClassifierCV(svc, cv=2, method='isotonic')
-svc_sigmoid = CalibratedClassifierCV(svc, cv=2, method='sigmoid')
+svc_isotonic = CalibratedClassifierCV(svc, cv=2, method="isotonic")
+svc_sigmoid = CalibratedClassifierCV(svc, cv=2, method="sigmoid")
 
-clf_list = [(lr, 'Logistic'),
-            (svc, 'SVC'),
-            (svc_isotonic, 'SVC + Isotonic'),
-            (svc_sigmoid, 'SVC + Sigmoid')]
+clf_list = [
+    (lr, "Logistic"),
+    (svc, "SVC"),
+    (svc_isotonic, "SVC + Isotonic"),
+    (svc_sigmoid, "SVC + Sigmoid"),
+]
 
 # %%
 fig = plt.figure(figsize=(10, 10))
@@ -223,13 +243,18 @@ def predict_proba(self, X):
 for i, (clf, name) in enumerate(clf_list):
     clf.fit(X_train, y_train)
     display = CalibrationDisplay.from_estimator(
-        clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve,
-        color=colors(i)
+        clf,
+        X_test,
+        y_test,
+        n_bins=10,
+        name=name,
+        ax=ax_calibration_curve,
+        color=colors(i),
     )
     calibration_displays[name] = display
 
 ax_calibration_curve.grid()
-ax_calibration_curve.set_title('Calibration plots (SVC)')
+ax_calibration_curve.set_title("Calibration plots (SVC)")
 
 # Add histogram
 grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
@@ -238,8 +263,11 @@ def predict_proba(self, X):
     ax = fig.add_subplot(gs[row, col])
 
     ax.hist(
-        calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name,
-        color=colors(i)
+        calibration_displays[name].y_prob,
+        range=(0, 1),
+        bins=10,
+        label=name,
+        color=colors(i),
     )
     ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")
 
diff --git a/examples/calibration/plot_calibration_multiclass.py b/examples/calibration/plot_calibration_multiclass.py
index ef1a53056009d..a8713dfc1f849 100644
--- a/examples/calibration/plot_calibration_multiclass.py
+++ b/examples/calibration/plot_calibration_multiclass.py
@@ -34,8 +34,9 @@ class of an instance (red: class 1, green: class 2, blue: class 3).
 
 np.random.seed(0)
 
-X, y = make_blobs(n_samples=2000, n_features=2, centers=3, random_state=42,
-                  cluster_std=5.0)
+X, y = make_blobs(
+    n_samples=2000, n_features=2, centers=3, random_state=42, cluster_std=5.0
+)
 X_train, y_train = X[:600], y[:600]
 X_valid, y_valid = X[600:1000], y[600:1000]
 X_train_valid, y_train_valid = X[:1000], y[:1000]
@@ -82,58 +83,96 @@ class of an instance (red: class 1, green: class 2, blue: class 3).
 cal_clf_probs = cal_clf.predict_proba(X_test)
 # Plot arrows
 for i in range(clf_probs.shape[0]):
-    plt.arrow(clf_probs[i, 0], clf_probs[i, 1],
-              cal_clf_probs[i, 0] - clf_probs[i, 0],
-              cal_clf_probs[i, 1] - clf_probs[i, 1],
-              color=colors[y_test[i]], head_width=1e-2)
+    plt.arrow(
+        clf_probs[i, 0],
+        clf_probs[i, 1],
+        cal_clf_probs[i, 0] - clf_probs[i, 0],
+        cal_clf_probs[i, 1] - clf_probs[i, 1],
+        color=colors[y_test[i]],
+        head_width=1e-2,
+    )
 
 # Plot perfect predictions, at each vertex
-plt.plot([1.0], [0.0], 'ro', ms=20, label="Class 1")
-plt.plot([0.0], [1.0], 'go', ms=20, label="Class 2")
-plt.plot([0.0], [0.0], 'bo', ms=20, label="Class 3")
+plt.plot([1.0], [0.0], "ro", ms=20, label="Class 1")
+plt.plot([0.0], [1.0], "go", ms=20, label="Class 2")
+plt.plot([0.0], [0.0], "bo", ms=20, label="Class 3")
 
 # Plot boundaries of unit simplex
-plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], 'k', label="Simplex")
+plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], "k", label="Simplex")
 
 # Annotate points 6 points around the simplex, and mid point inside simplex
-plt.annotate(r'($\frac{1}{3}$, $\frac{1}{3}$, $\frac{1}{3}$)',
-             xy=(1.0/3, 1.0/3), xytext=(1.0/3, .23), xycoords='data',
-             arrowprops=dict(facecolor='black', shrink=0.05),
-             horizontalalignment='center', verticalalignment='center')
-plt.plot([1.0/3], [1.0/3], 'ko', ms=5)
-plt.annotate(r'($\frac{1}{2}$, $0$, $\frac{1}{2}$)',
-             xy=(.5, .0), xytext=(.5, .1), xycoords='data',
-             arrowprops=dict(facecolor='black', shrink=0.05),
-             horizontalalignment='center', verticalalignment='center')
-plt.annotate(r'($0$, $\frac{1}{2}$, $\frac{1}{2}$)',
-             xy=(.0, .5), xytext=(.1, .5), xycoords='data',
-             arrowprops=dict(facecolor='black', shrink=0.05),
-             horizontalalignment='center', verticalalignment='center')
-plt.annotate(r'($\frac{1}{2}$, $\frac{1}{2}$, $0$)',
-             xy=(.5, .5), xytext=(.6, .6), xycoords='data',
-             arrowprops=dict(facecolor='black', shrink=0.05),
-             horizontalalignment='center', verticalalignment='center')
-plt.annotate(r'($0$, $0$, $1$)',
-             xy=(0, 0), xytext=(.1, .1), xycoords='data',
-             arrowprops=dict(facecolor='black', shrink=0.05),
-             horizontalalignment='center', verticalalignment='center')
-plt.annotate(r'($1$, $0$, $0$)',
-             xy=(1, 0), xytext=(1, .1), xycoords='data',
-             arrowprops=dict(facecolor='black', shrink=0.05),
-             horizontalalignment='center', verticalalignment='center')
-plt.annotate(r'($0$, $1$, $0$)',
-             xy=(0, 1), xytext=(.1, 1), xycoords='data',
-             arrowprops=dict(facecolor='black', shrink=0.05),
-             horizontalalignment='center', verticalalignment='center')
+plt.annotate(
+    r"($\frac{1}{3}$, $\frac{1}{3}$, $\frac{1}{3}$)",
+    xy=(1.0 / 3, 1.0 / 3),
+    xytext=(1.0 / 3, 0.23),
+    xycoords="data",
+    arrowprops=dict(facecolor="black", shrink=0.05),
+    horizontalalignment="center",
+    verticalalignment="center",
+)
+plt.plot([1.0 / 3], [1.0 / 3], "ko", ms=5)
+plt.annotate(
+    r"($\frac{1}{2}$, $0$, $\frac{1}{2}$)",
+    xy=(0.5, 0.0),
+    xytext=(0.5, 0.1),
+    xycoords="data",
+    arrowprops=dict(facecolor="black", shrink=0.05),
+    horizontalalignment="center",
+    verticalalignment="center",
+)
+plt.annotate(
+    r"($0$, $\frac{1}{2}$, $\frac{1}{2}$)",
+    xy=(0.0, 0.5),
+    xytext=(0.1, 0.5),
+    xycoords="data",
+    arrowprops=dict(facecolor="black", shrink=0.05),
+    horizontalalignment="center",
+    verticalalignment="center",
+)
+plt.annotate(
+    r"($\frac{1}{2}$, $\frac{1}{2}$, $0$)",
+    xy=(0.5, 0.5),
+    xytext=(0.6, 0.6),
+    xycoords="data",
+    arrowprops=dict(facecolor="black", shrink=0.05),
+    horizontalalignment="center",
+    verticalalignment="center",
+)
+plt.annotate(
+    r"($0$, $0$, $1$)",
+    xy=(0, 0),
+    xytext=(0.1, 0.1),
+    xycoords="data",
+    arrowprops=dict(facecolor="black", shrink=0.05),
+    horizontalalignment="center",
+    verticalalignment="center",
+)
+plt.annotate(
+    r"($1$, $0$, $0$)",
+    xy=(1, 0),
+    xytext=(1, 0.1),
+    xycoords="data",
+    arrowprops=dict(facecolor="black", shrink=0.05),
+    horizontalalignment="center",
+    verticalalignment="center",
+)
+plt.annotate(
+    r"($0$, $1$, $0$)",
+    xy=(0, 1),
+    xytext=(0.1, 1),
+    xycoords="data",
+    arrowprops=dict(facecolor="black", shrink=0.05),
+    horizontalalignment="center",
+    verticalalignment="center",
+)
 # Add grid
 plt.grid(False)
 for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
-    plt.plot([0, x], [x, 0], 'k', alpha=0.2)
-    plt.plot([0, 0 + (1-x)/2], [x, x + (1-x)/2], 'k', alpha=0.2)
-    plt.plot([x, x + (1-x)/2], [0, 0 + (1-x)/2], 'k', alpha=0.2)
+    plt.plot([0, x], [x, 0], "k", alpha=0.2)
+    plt.plot([0, 0 + (1 - x) / 2], [x, x + (1 - x) / 2], "k", alpha=0.2)
+    plt.plot([x, x + (1 - x) / 2], [0, 0 + (1 - x) / 2], "k", alpha=0.2)
 
-plt.title("Change of predicted probabilities on test samples "
-          "after sigmoid calibration")
+plt.title("Change of predicted probabilities on test samples after sigmoid calibration")
 plt.xlabel("Probability class 1")
 plt.ylabel("Probability class 2")
 plt.xlim(-0.05, 1.05)
@@ -193,9 +232,12 @@ class of an instance (red: class 1, green: class 2, blue: class 3).
 
 # Use the three class-wise calibrators to compute calibrated probabilities
 calibrated_classifier = cal_clf.calibrated_classifiers_[0]
-prediction = np.vstack([calibrator.predict(this_p)
-                        for calibrator, this_p in
-                        zip(calibrated_classifier.calibrators, p.T)]).T
+prediction = np.vstack(
+    [
+        calibrator.predict(this_p)
+        for calibrator, this_p in zip(calibrated_classifier.calibrators, p.T)
+    ]
+).T
 
 # Re-normalize the calibrated predictions to make sure they stay inside the
 # simplex. This same renormalization step is performed internally by the
@@ -204,18 +246,23 @@ class of an instance (red: class 1, green: class 2, blue: class 3).
 
 # Plot changes in predicted probabilities induced by the calibrators
 for i in range(prediction.shape[0]):
-    plt.arrow(p[i, 0], p[i, 1],
-              prediction[i, 0] - p[i, 0], prediction[i, 1] - p[i, 1],
-              head_width=1e-2, color=colors[np.argmax(p[i])])
+    plt.arrow(
+        p[i, 0],
+        p[i, 1],
+        prediction[i, 0] - p[i, 0],
+        prediction[i, 1] - p[i, 1],
+        head_width=1e-2,
+        color=colors[np.argmax(p[i])],
+    )
 
 # Plot the boundaries of the unit simplex
-plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], 'k', label="Simplex")
+plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], "k", label="Simplex")
 
 plt.grid(False)
 for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
-    plt.plot([0, x], [x, 0], 'k', alpha=0.2)
-    plt.plot([0, 0 + (1-x)/2], [x, x + (1-x)/2], 'k', alpha=0.2)
-    plt.plot([x, x + (1-x)/2], [0, 0 + (1-x)/2], 'k', alpha=0.2)
+    plt.plot([0, x], [x, 0], "k", alpha=0.2)
+    plt.plot([0, 0 + (1 - x) / 2], [x, x + (1 - x) / 2], "k", alpha=0.2)
+    plt.plot([x, x + (1 - x) / 2], [0, 0 + (1 - x) / 2], "k", alpha=0.2)
 
 plt.title("Learned sigmoid calibration map")
 plt.xlabel("Probability class 1")
diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index 7ee4eaf4da7df..f866e45e0ba2b 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -32,13 +32,15 @@
 from sklearn.model_selection import train_test_split
 
 X, y = make_classification(
-  n_samples=100_000, n_features=20, n_informative=2, n_redundant=2,
-  random_state=42
+    n_samples=100_000, n_features=20, n_informative=2, n_redundant=2, random_state=42
 )
 
 train_samples = 100  # Samples used for training the models
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, shuffle=False, test_size=100_000 - train_samples,
+    X,
+    y,
+    shuffle=False,
+    test_size=100_000 - train_samples,
 )
 
 # %%
@@ -92,10 +94,12 @@ def predict_proba(self, X):
 svc = NaivelyCalibratedLinearSVC(C=1.0)
 rfc = RandomForestClassifier()
 
-clf_list = [(lr, 'Logistic'),
-            (gnb, 'Naive Bayes'),
-            (svc, 'SVC'),
-            (rfc, 'Random forest')]
+clf_list = [
+    (lr, "Logistic"),
+    (gnb, "Naive Bayes"),
+    (svc, "SVC"),
+    (rfc, "Random forest"),
+]
 
 # %%
 
@@ -104,20 +108,25 @@ def predict_proba(self, X):
 
 fig = plt.figure(figsize=(10, 10))
 gs = GridSpec(4, 2)
-colors = plt.cm.get_cmap('Dark2')
+colors = plt.cm.get_cmap("Dark2")
 
 ax_calibration_curve = fig.add_subplot(gs[:2, :2])
 calibration_displays = {}
 for i, (clf, name) in enumerate(clf_list):
     clf.fit(X_train, y_train)
     display = CalibrationDisplay.from_estimator(
-        clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve,
-        color=colors(i)
+        clf,
+        X_test,
+        y_test,
+        n_bins=10,
+        name=name,
+        ax=ax_calibration_curve,
+        color=colors(i),
     )
     calibration_displays[name] = display
 
 ax_calibration_curve.grid()
-ax_calibration_curve.set_title('Calibration plots')
+ax_calibration_curve.set_title("Calibration plots")
 
 # Add histogram
 grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
@@ -126,8 +135,11 @@ def predict_proba(self, X):
     ax = fig.add_subplot(gs[row, col])
 
     ax.hist(
-        calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name,
-        color=colors(i)
+        calibration_displays[name].y_prob,
+        range=(0, 1),
+        bins=10,
+        label=name,
+        color=colors(i),
     )
     ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")
 
diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py
index ea4df9e6fb583..f88c749cb2b40 100644
--- a/examples/classification/plot_classification_probability.py
+++ b/examples/classification/plot_classification_probability.py
@@ -41,27 +41,23 @@
 
 # Create different classifiers.
 classifiers = {
-    'L1 logistic': LogisticRegression(C=C, penalty='l1',
-                                      solver='saga',
-                                      multi_class='multinomial',
-                                      max_iter=10000),
-    'L2 logistic (Multinomial)': LogisticRegression(C=C, penalty='l2',
-                                                    solver='saga',
-                                                    multi_class='multinomial',
-                                                    max_iter=10000),
-    'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2',
-                                            solver='saga',
-                                            multi_class='ovr',
-                                            max_iter=10000),
-    'Linear SVC': SVC(kernel='linear', C=C, probability=True,
-                      random_state=0),
-    'GPC': GaussianProcessClassifier(kernel)
+    "L1 logistic": LogisticRegression(
+        C=C, penalty="l1", solver="saga", multi_class="multinomial", max_iter=10000
+    ),
+    "L2 logistic (Multinomial)": LogisticRegression(
+        C=C, penalty="l2", solver="saga", multi_class="multinomial", max_iter=10000
+    ),
+    "L2 logistic (OvR)": LogisticRegression(
+        C=C, penalty="l2", solver="saga", multi_class="ovr", max_iter=10000
+    ),
+    "Linear SVC": SVC(kernel="linear", C=C, probability=True, random_state=0),
+    "GPC": GaussianProcessClassifier(kernel),
 }
 
 n_classifiers = len(classifiers)
 
 plt.figure(figsize=(3 * 2, n_classifiers * 2))
-plt.subplots_adjust(bottom=.2, top=.95)
+plt.subplots_adjust(bottom=0.2, top=0.95)
 
 xx = np.linspace(3, 9, 100)
 yy = np.linspace(1, 5, 100).T
@@ -83,16 +79,17 @@
         plt.title("Class %d" % k)
         if k == 0:
             plt.ylabel(name)
-        imshow_handle = plt.imshow(probas[:, k].reshape((100, 100)),
-                                   extent=(3, 9, 1, 5), origin='lower')
+        imshow_handle = plt.imshow(
+            probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin="lower"
+        )
         plt.xticks(())
         plt.yticks(())
-        idx = (y_pred == k)
+        idx = y_pred == k
         if idx.any():
-            plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='w', edgecolor='k')
+            plt.scatter(X[idx, 0], X[idx, 1], marker="o", c="w", edgecolor="k")
 
 ax = plt.axes([0.15, 0.04, 0.7, 0.05])
 plt.title("Probability")
-plt.colorbar(imshow_handle, cax=ax, orientation='horizontal')
+plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")
 
 plt.show()
diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py
index 83019e821dae5..1b38b7427b6c6 100644
--- a/examples/classification/plot_classifier_comparison.py
+++ b/examples/classification/plot_classifier_comparison.py
@@ -44,11 +44,20 @@
 from sklearn.naive_bayes import GaussianNB
 from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
 
-h = .02  # step size in the mesh
-
-names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
-         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
-         "Naive Bayes", "QDA"]
+h = 0.02  # step size in the mesh
+
+names = [
+    "Nearest Neighbors",
+    "Linear SVM",
+    "RBF SVM",
+    "Gaussian Process",
+    "Decision Tree",
+    "Random Forest",
+    "Neural Net",
+    "AdaBoost",
+    "Naive Bayes",
+    "QDA",
+]
 
 classifiers = [
     KNeighborsClassifier(3),
@@ -60,18 +69,21 @@
     MLPClassifier(alpha=1, max_iter=1000),
     AdaBoostClassifier(),
     GaussianNB(),
-    QuadraticDiscriminantAnalysis()]
+    QuadraticDiscriminantAnalysis(),
+]
 
-X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
-                           random_state=1, n_clusters_per_class=1)
+X, y = make_classification(
+    n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1
+)
 rng = np.random.RandomState(2)
 X += 2 * rng.uniform(size=X.shape)
 linearly_separable = (X, y)
 
-datasets = [make_moons(noise=0.3, random_state=0),
-            make_circles(noise=0.2, factor=0.5, random_state=1),
-            linearly_separable
-            ]
+datasets = [
+    make_moons(noise=0.3, random_state=0),
+    make_circles(noise=0.2, factor=0.5, random_state=1),
+    linearly_separable,
+]
 
 figure = plt.figure(figsize=(27, 9))
 i = 1
@@ -80,26 +92,26 @@
     # preprocess dataset, split into training and test part
     X, y = ds
     X = StandardScaler().fit_transform(X)
-    X_train, X_test, y_train, y_test = \
-        train_test_split(X, y, test_size=.4, random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.4, random_state=42
+    )
 
-    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
-    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
-    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                         np.arange(y_min, y_max, h))
+    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
+    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
     # just plot the dataset first
     cm = plt.cm.RdBu
-    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+    cm_bright = ListedColormap(["#FF0000", "#0000FF"])
     ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
     if ds_cnt == 0:
         ax.set_title("Input data")
     # Plot the training points
-    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
-               edgecolors='k')
+    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
     # Plot the testing points
-    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
-               edgecolors='k')
+    ax.scatter(
+        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
+    )
     ax.set_xlim(xx.min(), xx.max())
     ax.set_ylim(yy.min(), yy.max())
     ax.set_xticks(())
@@ -121,14 +133,21 @@
 
         # Put the result into a color plot
         Z = Z.reshape(xx.shape)
-        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
 
         # Plot the training points
-        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
-                   edgecolors='k')
+        ax.scatter(
+            X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
+        )
         # Plot the testing points
-        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
-                   edgecolors='k', alpha=0.6)
+        ax.scatter(
+            X_test[:, 0],
+            X_test[:, 1],
+            c=y_test,
+            cmap=cm_bright,
+            edgecolors="k",
+            alpha=0.6,
+        )
 
         ax.set_xlim(xx.min(), xx.max())
         ax.set_ylim(yy.min(), yy.max())
@@ -136,8 +155,13 @@
         ax.set_yticks(())
         if ds_cnt == 0:
             ax.set_title(name)
-        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
-                size=15, horizontalalignment='right')
+        ax.text(
+            xx.max() - 0.3,
+            yy.min() + 0.3,
+            ("%.2f" % score).lstrip("0"),
+            size=15,
+            horizontalalignment="right",
+        )
         i += 1
 
 plt.tight_layout()
diff --git a/examples/classification/plot_digits_classification.py b/examples/classification/plot_digits_classification.py
index 8cb61df2c9736..1c4f150c37374 100644
--- a/examples/classification/plot_digits_classification.py
+++ b/examples/classification/plot_digits_classification.py
@@ -38,8 +38,8 @@
 _, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3))
 for ax, image, label in zip(axes, digits.images, digits.target):
     ax.set_axis_off()
-    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
-    ax.set_title('Training: %i' % label)
+    ax.imshow(image, cmap=plt.cm.gray_r, interpolation="nearest")
+    ax.set_title("Training: %i" % label)
 
 ###############################################################################
 # Classification
@@ -65,7 +65,8 @@
 
 # Split data into 50% train and 50% test subsets
 X_train, X_test, y_train, y_test = train_test_split(
-    data, digits.target, test_size=0.5, shuffle=False)
+    data, digits.target, test_size=0.5, shuffle=False
+)
 
 # Learn the digits on the train subset
 clf.fit(X_train, y_train)
@@ -81,15 +82,17 @@
 for ax, image, prediction in zip(axes, X_test, predicted):
     ax.set_axis_off()
     image = image.reshape(8, 8)
-    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
-    ax.set_title(f'Prediction: {prediction}')
+    ax.imshow(image, cmap=plt.cm.gray_r, interpolation="nearest")
+    ax.set_title(f"Prediction: {prediction}")
 
 ###############################################################################
 # :func:`~sklearn.metrics.classification_report` builds a text report showing
 # the main classification metrics.
 
-print(f"Classification report for classifier {clf}:\n"
-      f"{metrics.classification_report(y_test, predicted)}\n")
+print(
+    f"Classification report for classifier {clf}:\n"
+    f"{metrics.classification_report(y_test, predicted)}\n"
+)
 
 ###############################################################################
 # We can also plot a :ref:`confusion matrix <confusion_matrix>` of the
diff --git a/examples/classification/plot_lda.py b/examples/classification/plot_lda.py
index ad16e7b0d2efa..856f2e206e9c9 100644
--- a/examples/classification/plot_lda.py
+++ b/examples/classification/plot_lda.py
@@ -45,13 +45,12 @@ def generate_data(n_samples, n_features):
     for _ in range(n_averages):
         X, y = generate_data(n_train, n_features)
 
-        clf1 = LinearDiscriminantAnalysis(solver='lsqr',
-                                          shrinkage='auto').fit(X, y)
-        clf2 = LinearDiscriminantAnalysis(solver='lsqr',
-                                          shrinkage=None).fit(X, y)
+        clf1 = LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto").fit(X, y)
+        clf2 = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=None).fit(X, y)
         oa = OAS(store_precision=False, assume_centered=False)
-        clf3 = LinearDiscriminantAnalysis(solver='lsqr',
-                                          covariance_estimator=oa).fit(X, y)
+        clf3 = LinearDiscriminantAnalysis(solver="lsqr", covariance_estimator=oa).fit(
+            X, y
+        )
 
         X, y = generate_data(n_test, n_features)
         score_clf1 += clf1.score(X, y)
@@ -64,18 +63,37 @@ def generate_data(n_samples, n_features):
 
 features_samples_ratio = np.array(n_features_range) / n_train
 
-plt.plot(features_samples_ratio, acc_clf1, linewidth=2,
-         label="Linear Discriminant Analysis with Ledoit Wolf", color='navy')
-plt.plot(features_samples_ratio, acc_clf2, linewidth=2,
-         label="Linear Discriminant Analysis", color='gold')
-plt.plot(features_samples_ratio, acc_clf3, linewidth=2,
-         label="Linear Discriminant Analysis with OAS", color='red')
-
-plt.xlabel('n_features / n_samples')
-plt.ylabel('Classification accuracy')
-
-plt.legend(loc=3, prop={'size': 12})
-plt.suptitle('Linear Discriminant Analysis vs. ' + '\n'
-             + 'Shrinkage Linear Discriminant Analysis vs. ' + '\n'
-             + 'OAS Linear Discriminant Analysis (1 discriminative feature)')
+plt.plot(
+    features_samples_ratio,
+    acc_clf1,
+    linewidth=2,
+    label="Linear Discriminant Analysis with Ledoit Wolf",
+    color="navy",
+)
+plt.plot(
+    features_samples_ratio,
+    acc_clf2,
+    linewidth=2,
+    label="Linear Discriminant Analysis",
+    color="gold",
+)
+plt.plot(
+    features_samples_ratio,
+    acc_clf3,
+    linewidth=2,
+    label="Linear Discriminant Analysis with OAS",
+    color="red",
+)
+
+plt.xlabel("n_features / n_samples")
+plt.ylabel("Classification accuracy")
+
+plt.legend(loc=3, prop={"size": 12})
+plt.suptitle(
+    "Linear Discriminant Analysis vs. "
+    + "\n"
+    + "Shrinkage Linear Discriminant Analysis vs. "
+    + "\n"
+    + "OAS Linear Discriminant Analysis (1 discriminative feature)"
+)
 plt.show()
diff --git a/examples/classification/plot_lda_qda.py b/examples/classification/plot_lda_qda.py
index d02adb03e6028..951f6b8c4cd29 100644
--- a/examples/classification/plot_lda_qda.py
+++ b/examples/classification/plot_lda_qda.py
@@ -23,33 +23,40 @@ class has its own standard deviation with QDA.
 # #############################################################################
 # Colormap
 cmap = colors.LinearSegmentedColormap(
-    'red_blue_classes',
-    {'red': [(0, 1, 1), (1, 0.7, 0.7)],
-     'green': [(0, 0.7, 0.7), (1, 0.7, 0.7)],
-     'blue': [(0, 0.7, 0.7), (1, 1, 1)]})
+    "red_blue_classes",
+    {
+        "red": [(0, 1, 1), (1, 0.7, 0.7)],
+        "green": [(0, 0.7, 0.7), (1, 0.7, 0.7)],
+        "blue": [(0, 0.7, 0.7), (1, 1, 1)],
+    },
+)
 plt.cm.register_cmap(cmap=cmap)
 
 
 # #############################################################################
 # Generate datasets
 def dataset_fixed_cov():
-    '''Generate 2 Gaussians samples with the same covariance matrix'''
+    """Generate 2 Gaussians samples with the same covariance matrix"""
     n, dim = 300, 2
     np.random.seed(0)
-    C = np.array([[0., -0.23], [0.83, .23]])
-    X = np.r_[np.dot(np.random.randn(n, dim), C),
-              np.dot(np.random.randn(n, dim), C) + np.array([1, 1])]
+    C = np.array([[0.0, -0.23], [0.83, 0.23]])
+    X = np.r_[
+        np.dot(np.random.randn(n, dim), C),
+        np.dot(np.random.randn(n, dim), C) + np.array([1, 1]),
+    ]
     y = np.hstack((np.zeros(n), np.ones(n)))
     return X, y
 
 
 def dataset_cov():
-    '''Generate 2 Gaussians samples with different covariance matrices'''
+    """Generate 2 Gaussians samples with different covariance matrices"""
     n, dim = 300, 2
     np.random.seed(0)
-    C = np.array([[0., -1.], [2.5, .7]]) * 2.
-    X = np.r_[np.dot(np.random.randn(n, dim), C),
-              np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4])]
+    C = np.array([[0.0, -1.0], [2.5, 0.7]]) * 2.0
+    X = np.r_[
+        np.dot(np.random.randn(n, dim), C),
+        np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4]),
+    ]
     y = np.hstack((np.zeros(n), np.ones(n)))
     return X, y
 
@@ -59,46 +66,58 @@ def dataset_cov():
 def plot_data(lda, X, y, y_pred, fig_index):
     splot = plt.subplot(2, 2, fig_index)
     if fig_index == 1:
-        plt.title('Linear Discriminant Analysis')
-        plt.ylabel('Data with\n fixed covariance')
+        plt.title("Linear Discriminant Analysis")
+        plt.ylabel("Data with\n fixed covariance")
     elif fig_index == 2:
-        plt.title('Quadratic Discriminant Analysis')
+        plt.title("Quadratic Discriminant Analysis")
     elif fig_index == 3:
-        plt.ylabel('Data with\n varying covariances')
+        plt.ylabel("Data with\n varying covariances")
 
-    tp = (y == y_pred)  # True Positive
+    tp = y == y_pred  # True Positive
     tp0, tp1 = tp[y == 0], tp[y == 1]
     X0, X1 = X[y == 0], X[y == 1]
     X0_tp, X0_fp = X0[tp0], X0[~tp0]
     X1_tp, X1_fp = X1[tp1], X1[~tp1]
 
     # class 0: dots
-    plt.scatter(X0_tp[:, 0], X0_tp[:, 1], marker='.', color='red')
-    plt.scatter(X0_fp[:, 0], X0_fp[:, 1], marker='x',
-                s=20, color='#990000')  # dark red
+    plt.scatter(X0_tp[:, 0], X0_tp[:, 1], marker=".", color="red")
+    plt.scatter(X0_fp[:, 0], X0_fp[:, 1], marker="x", s=20, color="#990000")  # dark red
 
     # class 1: dots
-    plt.scatter(X1_tp[:, 0], X1_tp[:, 1], marker='.', color='blue')
-    plt.scatter(X1_fp[:, 0], X1_fp[:, 1], marker='x',
-                s=20, color='#000099')  # dark blue
+    plt.scatter(X1_tp[:, 0], X1_tp[:, 1], marker=".", color="blue")
+    plt.scatter(
+        X1_fp[:, 0], X1_fp[:, 1], marker="x", s=20, color="#000099"
+    )  # dark blue
 
     # class 0 and 1 : areas
     nx, ny = 200, 100
     x_min, x_max = plt.xlim()
     y_min, y_max = plt.ylim()
-    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
-                         np.linspace(y_min, y_max, ny))
+    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx), np.linspace(y_min, y_max, ny))
     Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
     Z = Z[:, 1].reshape(xx.shape)
-    plt.pcolormesh(xx, yy, Z, cmap='red_blue_classes',
-                   norm=colors.Normalize(0., 1.), zorder=0)
-    plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='white')
+    plt.pcolormesh(
+        xx, yy, Z, cmap="red_blue_classes", norm=colors.Normalize(0.0, 1.0), zorder=0
+    )
+    plt.contour(xx, yy, Z, [0.5], linewidths=2.0, colors="white")
 
     # means
-    plt.plot(lda.means_[0][0], lda.means_[0][1],
-             '*', color='yellow', markersize=15, markeredgecolor='grey')
-    plt.plot(lda.means_[1][0], lda.means_[1][1],
-             '*', color='yellow', markersize=15, markeredgecolor='grey')
+    plt.plot(
+        lda.means_[0][0],
+        lda.means_[0][1],
+        "*",
+        color="yellow",
+        markersize=15,
+        markeredgecolor="grey",
+    )
+    plt.plot(
+        lda.means_[1][0],
+        lda.means_[1][1],
+        "*",
+        color="yellow",
+        markersize=15,
+        markeredgecolor="grey",
+    )
 
     return splot
 
@@ -109,9 +128,15 @@ def plot_ellipse(splot, mean, cov, color):
     angle = np.arctan(u[1] / u[0])
     angle = 180 * angle / np.pi  # convert to degrees
     # filled Gaussian at 2 standard deviation
-    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
-                              180 + angle, facecolor=color,
-                              edgecolor='black', linewidth=2)
+    ell = mpl.patches.Ellipse(
+        mean,
+        2 * v[0] ** 0.5,
+        2 * v[1] ** 0.5,
+        180 + angle,
+        facecolor=color,
+        edgecolor="black",
+        linewidth=2,
+    )
     ell.set_clip_box(splot.bbox)
     ell.set_alpha(0.2)
     splot.add_artist(ell)
@@ -120,32 +145,35 @@ def plot_ellipse(splot, mean, cov, color):
 
 
 def plot_lda_cov(lda, splot):
-    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
-    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')
+    plot_ellipse(splot, lda.means_[0], lda.covariance_, "red")
+    plot_ellipse(splot, lda.means_[1], lda.covariance_, "blue")
 
 
 def plot_qda_cov(qda, splot):
-    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red')
-    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue')
+    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], "red")
+    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], "blue")
 
 
-plt.figure(figsize=(10, 8), facecolor='white')
-plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis',
-             y=0.98, fontsize=15)
+plt.figure(figsize=(10, 8), facecolor="white")
+plt.suptitle(
+    "Linear Discriminant Analysis vs Quadratic Discriminant Analysis",
+    y=0.98,
+    fontsize=15,
+)
 for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
     # Linear Discriminant Analysis
     lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
     y_pred = lda.fit(X, y).predict(X)
     splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
     plot_lda_cov(lda, splot)
-    plt.axis('tight')
+    plt.axis("tight")
 
     # Quadratic Discriminant Analysis
     qda = QuadraticDiscriminantAnalysis(store_covariance=True)
     y_pred = qda.fit(X, y).predict(X)
     splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
     plot_qda_cov(qda, splot)
-    plt.axis('tight')
+    plt.axis("tight")
 plt.tight_layout()
 plt.subplots_adjust(top=0.92)
 plt.show()
diff --git a/examples/cluster/plot_adjusted_for_chance_measures.py b/examples/cluster/plot_adjusted_for_chance_measures.py
index c84266378bb3f..0b77144ef5256 100644
--- a/examples/cluster/plot_adjusted_for_chance_measures.py
+++ b/examples/cluster/plot_adjusted_for_chance_measures.py
@@ -31,8 +31,9 @@
 from sklearn import metrics
 
 
-def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
-                             fixed_n_classes=None, n_runs=5, seed=42):
+def uniform_labelings_scores(
+    score_func, n_samples, n_clusters_range, fixed_n_classes=None, n_runs=5, seed=42
+):
     """Compute score for 2 random uniform cluster labelings.
 
     Both random labelings have the same number of clusters for each value
@@ -77,20 +78,24 @@ def ami_score(U, V):
 plots = []
 names = []
 for score_func in score_funcs:
-    print("Computing %s for %d values of n_clusters and n_samples=%d"
-          % (score_func.__name__, len(n_clusters_range), n_samples))
+    print(
+        "Computing %s for %d values of n_clusters and n_samples=%d"
+        % (score_func.__name__, len(n_clusters_range), n_samples)
+    )
 
     t0 = time()
     scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)
     print("done in %0.3fs" % (time() - t0))
-    plots.append(plt.errorbar(
-        n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0])
+    plots.append(
+        plt.errorbar(n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0]
+    )
     names.append(score_func.__name__)
 
-plt.title("Clustering measures for 2 random uniform labelings\n"
-          "with equal number of clusters")
-plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)
-plt.ylabel('Score value')
+plt.title(
+    "Clustering measures for 2 random uniform labelings\nwith equal number of clusters"
+)
+plt.xlabel("Number of clusters (Number of samples is fixed to %d)" % n_samples)
+plt.ylabel("Score value")
 plt.legend(plots, names)
 plt.ylim(bottom=-0.05, top=1.05)
 
@@ -107,21 +112,27 @@ def ami_score(U, V):
 plots = []
 names = []
 for score_func in score_funcs:
-    print("Computing %s for %d values of n_clusters and n_samples=%d"
-          % (score_func.__name__, len(n_clusters_range), n_samples))
+    print(
+        "Computing %s for %d values of n_clusters and n_samples=%d"
+        % (score_func.__name__, len(n_clusters_range), n_samples)
+    )
 
     t0 = time()
-    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range,
-                                      fixed_n_classes=n_classes)
+    scores = uniform_labelings_scores(
+        score_func, n_samples, n_clusters_range, fixed_n_classes=n_classes
+    )
     print("done in %0.3fs" % (time() - t0))
-    plots.append(plt.errorbar(
-        n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0])
+    plots.append(
+        plt.errorbar(n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0]
+    )
     names.append(score_func.__name__)
 
-plt.title("Clustering measures for random uniform labeling\n"
-          "against reference assignment with %d classes" % n_classes)
-plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)
-plt.ylabel('Score value')
+plt.title(
+    "Clustering measures for random uniform labeling\n"
+    "against reference assignment with %d classes" % n_classes
+)
+plt.xlabel("Number of clusters (Number of samples is fixed to %d)" % n_samples)
+plt.ylabel("Score value")
 plt.ylim(bottom=-0.05, top=1.05)
 plt.legend(plots, names)
 plt.show()
diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py
index 101a60b74ec06..799d8d2d949b7 100644
--- a/examples/cluster/plot_affinity_propagation.py
+++ b/examples/cluster/plot_affinity_propagation.py
@@ -17,8 +17,9 @@
 # #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
-X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
-                            random_state=0)
+X, labels_true = make_blobs(
+    n_samples=300, centers=centers, cluster_std=0.5, random_state=0
+)
 
 # #############################################################################
 # Compute Affinity Propagation
@@ -28,35 +29,44 @@
 
 n_clusters_ = len(cluster_centers_indices)
 
-print('Estimated number of clusters: %d' % n_clusters_)
+print("Estimated number of clusters: %d" % n_clusters_)
 print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
 print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
 print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
-print("Adjusted Rand Index: %0.3f"
-      % metrics.adjusted_rand_score(labels_true, labels))
-print("Adjusted Mutual Information: %0.3f"
-      % metrics.adjusted_mutual_info_score(labels_true, labels))
-print("Silhouette Coefficient: %0.3f"
-      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
+print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
+print(
+    "Adjusted Mutual Information: %0.3f"
+    % metrics.adjusted_mutual_info_score(labels_true, labels)
+)
+print(
+    "Silhouette Coefficient: %0.3f"
+    % metrics.silhouette_score(X, labels, metric="sqeuclidean")
+)
 
 # #############################################################################
 # Plot result
 import matplotlib.pyplot as plt
 from itertools import cycle
 
-plt.close('all')
+plt.close("all")
 plt.figure(1)
 plt.clf()
 
-colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
+colors = cycle("bgrcmykbgrcmykbgrcmykbgrcmyk")
 for k, col in zip(range(n_clusters_), colors):
     class_members = labels == k
     cluster_center = X[cluster_centers_indices[k]]
-    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
-    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
-             markeredgecolor='k', markersize=14)
+    plt.plot(X[class_members, 0], X[class_members, 1], col + ".")
+    plt.plot(
+        cluster_center[0],
+        cluster_center[1],
+        "o",
+        markerfacecolor=col,
+        markeredgecolor="k",
+        markersize=14,
+    )
     for x in X[class_members]:
         plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
 
-plt.title('Estimated number of clusters: %d' % n_clusters_)
+plt.title("Estimated number of clusters: %d" % n_clusters_)
 plt.show()
diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py
index 5b846591bee70..bc3adbf376bb6 100644
--- a/examples/cluster/plot_agglomerative_clustering.py
+++ b/examples/cluster/plot_agglomerative_clustering.py
@@ -40,7 +40,7 @@
 
 
 X = np.concatenate((x, y))
-X += .7 * np.random.randn(2, n_samples)
+X += 0.7 * np.random.randn(2, n_samples)
 X = X.T
 
 # Create a graph capturing local connectivity. Larger number of neighbors
@@ -53,28 +53,28 @@
 for connectivity in (None, knn_graph):
     for n_clusters in (30, 3):
         plt.figure(figsize=(10, 4))
-        for index, linkage in enumerate(('average',
-                                         'complete',
-                                         'ward',
-                                         'single')):
+        for index, linkage in enumerate(("average", "complete", "ward", "single")):
             plt.subplot(1, 4, index + 1)
-            model = AgglomerativeClustering(linkage=linkage,
-                                            connectivity=connectivity,
-                                            n_clusters=n_clusters)
+            model = AgglomerativeClustering(
+                linkage=linkage, connectivity=connectivity, n_clusters=n_clusters
+            )
             t0 = time.time()
             model.fit(X)
             elapsed_time = time.time() - t0
-            plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
-                        cmap=plt.cm.nipy_spectral)
-            plt.title('linkage=%s\n(time %.2fs)' % (linkage, elapsed_time),
-                      fontdict=dict(verticalalignment='top'))
-            plt.axis('equal')
-            plt.axis('off')
+            plt.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.nipy_spectral)
+            plt.title(
+                "linkage=%s\n(time %.2fs)" % (linkage, elapsed_time),
+                fontdict=dict(verticalalignment="top"),
+            )
+            plt.axis("equal")
+            plt.axis("off")
 
-            plt.subplots_adjust(bottom=0, top=.83, wspace=0,
-                                left=0, right=1)
-            plt.suptitle('n_cluster=%i, connectivity=%r' %
-                         (n_clusters, connectivity is not None), size=17)
+            plt.subplots_adjust(bottom=0, top=0.83, wspace=0, left=0, right=1)
+            plt.suptitle(
+                "n_cluster=%i, connectivity=%r"
+                % (n_clusters, connectivity is not None),
+                size=17,
+            )
 
 
 plt.show()
diff --git a/examples/cluster/plot_agglomerative_clustering_metrics.py b/examples/cluster/plot_agglomerative_clustering_metrics.py
index 704cf08106055..4901403a4d54b 100644
--- a/examples/cluster/plot_agglomerative_clustering_metrics.py
+++ b/examples/cluster/plot_agglomerative_clustering_metrics.py
@@ -54,17 +54,21 @@ def sqr(x):
 
 X = list()
 y = list()
-for i, (phi, a) in enumerate([(.5, .15), (.5, .6), (.3, .2)]):
+for i, (phi, a) in enumerate([(0.5, 0.15), (0.5, 0.6), (0.3, 0.2)]):
     for _ in range(30):
-        phase_noise = .01 * np.random.normal()
-        amplitude_noise = .04 * np.random.normal()
+        phase_noise = 0.01 * np.random.normal()
+        amplitude_noise = 0.04 * np.random.normal()
         additional_noise = 1 - 2 * np.random.rand(n_features)
         # Make the noise sparse
-        additional_noise[np.abs(additional_noise) < .997] = 0
-
-        X.append(12 * ((a + amplitude_noise)
-                 * (sqr(6 * (t + phi + phase_noise)))
-                 + additional_noise))
+        additional_noise[np.abs(additional_noise) < 0.997] = 0
+
+        X.append(
+            12
+            * (
+                (a + amplitude_noise) * (sqr(6 * (t + phi + phase_noise)))
+                + additional_noise
+            )
+        )
         y.append(i)
 
 X = np.array(X)
@@ -72,20 +76,19 @@ def sqr(x):
 
 n_clusters = 3
 
-labels = ('Waveform 1', 'Waveform 2', 'Waveform 3')
+labels = ("Waveform 1", "Waveform 2", "Waveform 3")
 
 # Plot the ground-truth labelling
 plt.figure()
 plt.axes([0, 0, 1, 1])
-for l, c, n in zip(range(n_clusters), 'rgb',
-                   labels):
-    lines = plt.plot(X[y == l].T, c=c, alpha=.5)
+for l, c, n in zip(range(n_clusters), "rgb", labels):
+    lines = plt.plot(X[y == l].T, c=c, alpha=0.5)
     lines[0].set_label(n)
 
-plt.legend(loc='best')
+plt.legend(loc="best")
 
-plt.axis('tight')
-plt.axis('off')
+plt.axis("tight")
+plt.axis("off")
 plt.suptitle("Ground truth", size=20)
 
 
@@ -95,17 +98,21 @@ def sqr(x):
     plt.figure(figsize=(5, 4.5))
     for i in range(n_clusters):
         for j in range(n_clusters):
-            avg_dist[i, j] = pairwise_distances(X[y == i], X[y == j],
-                                                metric=metric).mean()
+            avg_dist[i, j] = pairwise_distances(
+                X[y == i], X[y == j], metric=metric
+            ).mean()
     avg_dist /= avg_dist.max()
     for i in range(n_clusters):
         for j in range(n_clusters):
-            plt.text(i, j, '%5.3f' % avg_dist[i, j],
-                     verticalalignment='center',
-                     horizontalalignment='center')
-
-    plt.imshow(avg_dist, interpolation='nearest', cmap=plt.cm.gnuplot2,
-               vmin=0)
+            plt.text(
+                i,
+                j,
+                "%5.3f" % avg_dist[i, j],
+                verticalalignment="center",
+                horizontalalignment="center",
+            )
+
+    plt.imshow(avg_dist, interpolation="nearest", cmap=plt.cm.gnuplot2, vmin=0)
     plt.xticks(range(n_clusters), labels, rotation=45)
     plt.yticks(range(n_clusters), labels)
     plt.colorbar()
@@ -115,15 +122,16 @@ def sqr(x):
 
 # Plot clustering results
 for index, metric in enumerate(["cosine", "euclidean", "cityblock"]):
-    model = AgglomerativeClustering(n_clusters=n_clusters,
-                                    linkage="average", affinity=metric)
+    model = AgglomerativeClustering(
+        n_clusters=n_clusters, linkage="average", affinity=metric
+    )
     model.fit(X)
     plt.figure()
     plt.axes([0, 0, 1, 1])
-    for l, c in zip(np.arange(model.n_clusters), 'rgbk'):
-        plt.plot(X[model.labels_ == l].T, c=c, alpha=.5)
-    plt.axis('tight')
-    plt.axis('off')
+    for l, c in zip(np.arange(model.n_clusters), "rgbk"):
+        plt.plot(X[model.labels_ == l].T, c=c, alpha=0.5)
+    plt.axis("tight")
+    plt.axis("off")
     plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20)
 
 
diff --git a/examples/cluster/plot_agglomerative_dendrogram.py b/examples/cluster/plot_agglomerative_dendrogram.py
index 401d28803b946..94ede3c2451c0 100644
--- a/examples/cluster/plot_agglomerative_dendrogram.py
+++ b/examples/cluster/plot_agglomerative_dendrogram.py
@@ -31,8 +31,9 @@ def plot_dendrogram(model, **kwargs):
                 current_count += counts[child_idx - n_samples]
         counts[i] = current_count
 
-    linkage_matrix = np.column_stack([model.children_, model.distances_,
-                                      counts]).astype(float)
+    linkage_matrix = np.column_stack(
+        [model.children_, model.distances_, counts]
+    ).astype(float)
 
     # Plot the corresponding dendrogram
     dendrogram(linkage_matrix, **kwargs)
@@ -45,8 +46,8 @@ def plot_dendrogram(model, **kwargs):
 model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
 
 model = model.fit(X)
-plt.title('Hierarchical Clustering Dendrogram')
+plt.title("Hierarchical Clustering Dendrogram")
 # plot the top three levels of the dendrogram
-plot_dendrogram(model, truncate_mode='level', p=3)
+plot_dendrogram(model, truncate_mode="level", p=3)
 plt.xlabel("Number of points in node (or index of point if no parenthesis).")
 plt.show()
diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py
index c4648ee5bd795..67c554c4469f2 100644
--- a/examples/cluster/plot_birch_vs_minibatchkmeans.py
+++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py
@@ -33,8 +33,7 @@
 xx = np.linspace(-22, 22, 10)
 yy = np.linspace(-22, 22, 10)
 xx, yy = np.meshgrid(xx, yy)
-n_centres = np.hstack((np.ravel(xx)[:, np.newaxis],
-                       np.ravel(yy)[:, np.newaxis]))
+n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis]))
 
 # Generate blobs to do a comparison between MiniBatchKMeans and BIRCH.
 X, y = make_blobs(n_samples=100000, centers=n_centres, random_state=0)
@@ -47,16 +46,17 @@
 
 # Compute clustering with BIRCH with and without the final clustering step
 # and plot.
-birch_models = [Birch(threshold=1.7, n_clusters=None),
-                Birch(threshold=1.7, n_clusters=100)]
-final_step = ['without global clustering', 'with global clustering']
+birch_models = [
+    Birch(threshold=1.7, n_clusters=None),
+    Birch(threshold=1.7, n_clusters=100),
+]
+final_step = ["without global clustering", "with global clustering"]
 
 for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)):
     t = time()
     birch_model.fit(X)
     time_ = time() - t
-    print("BIRCH %s as the final step took %0.2f seconds" % (
-          info, (time() - t)))
+    print("BIRCH %s as the final step took %0.2f seconds" % (info, (time() - t)))
 
     # Plot result
     labels = birch_model.labels_
@@ -67,20 +67,24 @@
     ax = fig.add_subplot(1, 3, ind + 1)
     for this_centroid, k, col in zip(centroids, range(n_clusters), colors_):
         mask = labels == k
-        ax.scatter(X[mask, 0], X[mask, 1],
-                   c='w', edgecolor=col, marker='.', alpha=0.5)
+        ax.scatter(X[mask, 0], X[mask, 1], c="w", edgecolor=col, marker=".", alpha=0.5)
         if birch_model.n_clusters is None:
-            ax.scatter(this_centroid[0], this_centroid[1], marker='+',
-                       c='k', s=25)
+            ax.scatter(this_centroid[0], this_centroid[1], marker="+", c="k", s=25)
     ax.set_ylim([-25, 25])
     ax.set_xlim([-25, 25])
     ax.set_autoscaley_on(False)
-    ax.set_title('BIRCH %s' % info)
+    ax.set_title("BIRCH %s" % info)
 
 # Compute clustering with MiniBatchKMeans.
-mbk = MiniBatchKMeans(init='k-means++', n_clusters=100, batch_size=100,
-                      n_init=10, max_no_improvement=10, verbose=0,
-                      random_state=0)
+mbk = MiniBatchKMeans(
+    init="k-means++",
+    n_clusters=100,
+    batch_size=100,
+    n_init=10,
+    max_no_improvement=10,
+    verbose=0,
+    random_state=0,
+)
 t0 = time()
 mbk.fit(X)
 t_mini_batch = time() - t0
@@ -88,13 +92,10 @@
 mbk_means_labels_unique = np.unique(mbk.labels_)
 
 ax = fig.add_subplot(1, 3, 3)
-for this_centroid, k, col in zip(mbk.cluster_centers_,
-                                 range(n_clusters), colors_):
+for this_centroid, k, col in zip(mbk.cluster_centers_, range(n_clusters), colors_):
     mask = mbk.labels_ == k
-    ax.scatter(X[mask, 0], X[mask, 1], marker='.',
-               c='w', edgecolor=col, alpha=0.5)
-    ax.scatter(this_centroid[0], this_centroid[1], marker='+',
-               c='k', s=25)
+    ax.scatter(X[mask, 0], X[mask, 1], marker=".", c="w", edgecolor=col, alpha=0.5)
+    ax.scatter(this_centroid[0], this_centroid[1], marker="+", c="k", s=25)
 ax.set_xlim([-25, 25])
 ax.set_ylim([-25, 25])
 ax.set_title("MiniBatchKMeans")
diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py
index 4025d9e8b8591..43b9a7e333e45 100644
--- a/examples/cluster/plot_cluster_comparison.py
+++ b/examples/cluster/plot_cluster_comparison.py
@@ -42,9 +42,8 @@
 # of the algorithms, but not too big to avoid too long running times
 # ============
 n_samples = 1500
-noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
-                                      noise=.05)
-noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
+noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
+noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
 blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
 no_structure = np.random.rand(n_samples, 2), None
 
@@ -56,40 +55,68 @@
 aniso = (X_aniso, y)
 
 # blobs with varied variances
-varied = datasets.make_blobs(n_samples=n_samples,
-                             cluster_std=[1.0, 2.5, 0.5],
-                             random_state=random_state)
+varied = datasets.make_blobs(
+    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
+)
 
 # ============
 # Set up cluster parameters
 # ============
 plt.figure(figsize=(9 * 2 + 3, 13))
-plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.95, wspace=.05,
-                    hspace=.01)
+plt.subplots_adjust(
+    left=0.02, right=0.98, bottom=0.001, top=0.95, wspace=0.05, hspace=0.01
+)
 
 plot_num = 1
 
-default_base = {'quantile': .3,
-                'eps': .3,
-                'damping': .9,
-                'preference': -200,
-                'n_neighbors': 10,
-                'n_clusters': 3,
-                'min_samples': 20,
-                'xi': 0.05,
-                'min_cluster_size': 0.1}
+default_base = {
+    "quantile": 0.3,
+    "eps": 0.3,
+    "damping": 0.9,
+    "preference": -200,
+    "n_neighbors": 10,
+    "n_clusters": 3,
+    "min_samples": 20,
+    "xi": 0.05,
+    "min_cluster_size": 0.1,
+}
 
 datasets = [
-    (noisy_circles, {'damping': .77, 'preference': -240,
-                     'quantile': .2, 'n_clusters': 2,
-                     'min_samples': 20, 'xi': 0.25}),
-    (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
-    (varied, {'eps': .18, 'n_neighbors': 2,
-              'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
-    (aniso, {'eps': .15, 'n_neighbors': 2,
-             'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
+    (
+        noisy_circles,
+        {
+            "damping": 0.77,
+            "preference": -240,
+            "quantile": 0.2,
+            "n_clusters": 2,
+            "min_samples": 20,
+            "xi": 0.25,
+        },
+    ),
+    (noisy_moons, {"damping": 0.75, "preference": -220, "n_clusters": 2}),
+    (
+        varied,
+        {
+            "eps": 0.18,
+            "n_neighbors": 2,
+            "min_samples": 5,
+            "xi": 0.035,
+            "min_cluster_size": 0.2,
+        },
+    ),
+    (
+        aniso,
+        {
+            "eps": 0.15,
+            "n_neighbors": 2,
+            "min_samples": 20,
+            "xi": 0.1,
+            "min_cluster_size": 0.2,
+        },
+    ),
     (blobs, {}),
-    (no_structure, {})]
+    (no_structure, {}),
+]
 
 for i_dataset, (dataset, algo_params) in enumerate(datasets):
     # update parameters with dataset-specific values
@@ -102,11 +129,12 @@
     X = StandardScaler().fit_transform(X)
 
     # estimate bandwidth for mean shift
-    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
+    bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"])
 
     # connectivity matrix for structured Ward
     connectivity = kneighbors_graph(
-        X, n_neighbors=params['n_neighbors'], include_self=False)
+        X, n_neighbors=params["n_neighbors"], include_self=False
+    )
     # make connectivity symmetric
     connectivity = 0.5 * (connectivity + connectivity.T)
 
@@ -114,38 +142,46 @@
     # Create cluster objects
     # ============
     ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
-    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
+    two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
     ward = cluster.AgglomerativeClustering(
-        n_clusters=params['n_clusters'], linkage='ward',
-        connectivity=connectivity)
+        n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
+    )
     spectral = cluster.SpectralClustering(
-        n_clusters=params['n_clusters'], eigen_solver='arpack',
-        affinity="nearest_neighbors")
-    dbscan = cluster.DBSCAN(eps=params['eps'])
-    optics = cluster.OPTICS(min_samples=params['min_samples'],
-                            xi=params['xi'],
-                            min_cluster_size=params['min_cluster_size'])
+        n_clusters=params["n_clusters"],
+        eigen_solver="arpack",
+        affinity="nearest_neighbors",
+    )
+    dbscan = cluster.DBSCAN(eps=params["eps"])
+    optics = cluster.OPTICS(
+        min_samples=params["min_samples"],
+        xi=params["xi"],
+        min_cluster_size=params["min_cluster_size"],
+    )
     affinity_propagation = cluster.AffinityPropagation(
-        damping=params['damping'], preference=params['preference'],
-        random_state=0)
+        damping=params["damping"], preference=params["preference"], random_state=0
+    )
     average_linkage = cluster.AgglomerativeClustering(
-        linkage="average", affinity="cityblock",
-        n_clusters=params['n_clusters'], connectivity=connectivity)
-    birch = cluster.Birch(n_clusters=params['n_clusters'])
+        linkage="average",
+        affinity="cityblock",
+        n_clusters=params["n_clusters"],
+        connectivity=connectivity,
+    )
+    birch = cluster.Birch(n_clusters=params["n_clusters"])
     gmm = mixture.GaussianMixture(
-        n_components=params['n_clusters'], covariance_type='full')
+        n_components=params["n_clusters"], covariance_type="full"
+    )
 
     clustering_algorithms = (
-        ('MiniBatch\nKMeans', two_means),
-        ('Affinity\nPropagation', affinity_propagation),
-        ('MeanShift', ms),
-        ('Spectral\nClustering', spectral),
-        ('Ward', ward),
-        ('Agglomerative\nClustering', average_linkage),
-        ('DBSCAN', dbscan),
-        ('OPTICS', optics),
-        ('BIRCH', birch),
-        ('Gaussian\nMixture', gmm)
+        ("MiniBatch\nKMeans", two_means),
+        ("Affinity\nPropagation", affinity_propagation),
+        ("MeanShift", ms),
+        ("Spectral\nClustering", spectral),
+        ("Ward", ward),
+        ("Agglomerative\nClustering", average_linkage),
+        ("DBSCAN", dbscan),
+        ("OPTICS", optics),
+        ("BIRCH", birch),
+        ("Gaussian\nMixture", gmm),
     )
 
     for name, algorithm in clustering_algorithms:
@@ -155,19 +191,21 @@
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore",
-                message="the number of connected components of the " +
-                "connectivity matrix is [0-9]{1,2}" +
-                " > 1. Completing it to avoid stopping the tree early.",
-                category=UserWarning)
+                message="the number of connected components of the "
+                + "connectivity matrix is [0-9]{1,2}"
+                + " > 1. Completing it to avoid stopping the tree early.",
+                category=UserWarning,
+            )
             warnings.filterwarnings(
                 "ignore",
-                message="Graph is not fully connected, spectral embedding" +
-                " may not work as expected.",
-                category=UserWarning)
+                message="Graph is not fully connected, spectral embedding"
+                + " may not work as expected.",
+                category=UserWarning,
+            )
             algorithm.fit(X)
 
         t1 = time.time()
-        if hasattr(algorithm, 'labels_'):
+        if hasattr(algorithm, "labels_"):
             y_pred = algorithm.labels_.astype(int)
         else:
             y_pred = algorithm.predict(X)
@@ -176,10 +214,26 @@
         if i_dataset == 0:
             plt.title(name, size=18)
 
-        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
-                                             '#f781bf', '#a65628', '#984ea3',
-                                             '#999999', '#e41a1c', '#dede00']),
-                                      int(max(y_pred) + 1))))
+        colors = np.array(
+            list(
+                islice(
+                    cycle(
+                        [
+                            "#377eb8",
+                            "#ff7f00",
+                            "#4daf4a",
+                            "#f781bf",
+                            "#a65628",
+                            "#984ea3",
+                            "#999999",
+                            "#e41a1c",
+                            "#dede00",
+                        ]
+                    ),
+                    int(max(y_pred) + 1),
+                )
+            )
+        )
         # add black color for outliers (if any)
         colors = np.append(colors, ["#000000"])
         plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
@@ -188,9 +242,14 @@
         plt.ylim(-2.5, 2.5)
         plt.xticks(())
         plt.yticks(())
-        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
-                 transform=plt.gca().transAxes, size=15,
-                 horizontalalignment='right')
+        plt.text(
+            0.99,
+            0.01,
+            ("%.2fs" % (t1 - t0)).lstrip("0"),
+            transform=plt.gca().transAxes,
+            size=15,
+            horizontalalignment="right",
+        )
         plot_num += 1
 
 plt.show()
diff --git a/examples/cluster/plot_cluster_iris.py b/examples/cluster/plot_cluster_iris.py
index f20d1a2374008..2e2844cf3fcd3 100755
--- a/examples/cluster/plot_cluster_iris.py
+++ b/examples/cluster/plot_cluster_iris.py
@@ -25,6 +25,7 @@
 
 import numpy as np
 import matplotlib.pyplot as plt
+
 # Though the following import is not directly being used, it is required
 # for 3D projection to work
 from mpl_toolkits.mplot3d import Axes3D
@@ -38,55 +39,56 @@
 X = iris.data
 y = iris.target
 
-estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
-              ('k_means_iris_3', KMeans(n_clusters=3)),
-              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1,
-                                               init='random'))]
+estimators = [
+    ("k_means_iris_8", KMeans(n_clusters=8)),
+    ("k_means_iris_3", KMeans(n_clusters=3)),
+    ("k_means_iris_bad_init", KMeans(n_clusters=3, n_init=1, init="random")),
+]
 
 fignum = 1
-titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization']
+titles = ["8 clusters", "3 clusters", "3 clusters, bad initialization"]
 for name, est in estimators:
     fig = plt.figure(fignum, figsize=(4, 3))
-    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
+    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
     est.fit(X)
     labels = est.labels_
 
-    ax.scatter(X[:, 3], X[:, 0], X[:, 2],
-               c=labels.astype(float), edgecolor='k')
+    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor="k")
 
     ax.w_xaxis.set_ticklabels([])
     ax.w_yaxis.set_ticklabels([])
     ax.w_zaxis.set_ticklabels([])
-    ax.set_xlabel('Petal width')
-    ax.set_ylabel('Sepal length')
-    ax.set_zlabel('Petal length')
+    ax.set_xlabel("Petal width")
+    ax.set_ylabel("Sepal length")
+    ax.set_zlabel("Petal length")
     ax.set_title(titles[fignum - 1])
     ax.dist = 12
     fignum = fignum + 1
 
 # Plot the ground truth
 fig = plt.figure(fignum, figsize=(4, 3))
-ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
-
-for name, label in [('Setosa', 0),
-                    ('Versicolour', 1),
-                    ('Virginica', 2)]:
-    ax.text3D(X[y == label, 3].mean(),
-              X[y == label, 0].mean(),
-              X[y == label, 2].mean() + 2, name,
-              horizontalalignment='center',
-              bbox=dict(alpha=.2, edgecolor='w', facecolor='w'))
+ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
+
+for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]:
+    ax.text3D(
+        X[y == label, 3].mean(),
+        X[y == label, 0].mean(),
+        X[y == label, 2].mean() + 2,
+        name,
+        horizontalalignment="center",
+        bbox=dict(alpha=0.2, edgecolor="w", facecolor="w"),
+    )
 # Reorder the labels to have colors matching the cluster results
 y = np.choose(y, [1, 2, 0]).astype(float)
-ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor='k')
+ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor="k")
 
 ax.w_xaxis.set_ticklabels([])
 ax.w_yaxis.set_ticklabels([])
 ax.w_zaxis.set_ticklabels([])
-ax.set_xlabel('Petal width')
-ax.set_ylabel('Sepal length')
-ax.set_zlabel('Petal length')
-ax.set_title('Ground Truth')
+ax.set_xlabel("Petal width")
+ax.set_ylabel("Sepal length")
+ax.set_zlabel("Petal length")
+ax.set_title("Ground Truth")
 ax.dist = 12
 
 fig.show()
diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py
index 9fb9b11be2753..09cd8974e3eab 100644
--- a/examples/cluster/plot_coin_segmentation.py
+++ b/examples/cluster/plot_coin_segmentation.py
@@ -36,8 +36,8 @@
 from sklearn.utils.fixes import parse_version
 
 # these were introduced in skimage-0.14
-if parse_version(skimage.__version__) >= parse_version('0.14'):
-    rescale_params = {'anti_aliasing': False, 'multichannel': False}
+if parse_version(skimage.__version__) >= parse_version("0.14"):
+    rescale_params = {"anti_aliasing": False, "multichannel": False}
 else:
     rescale_params = {}
 
@@ -48,8 +48,7 @@
 # Applying a Gaussian filter for smoothing prior to down-scaling
 # reduces aliasing artifacts.
 smoothened_coins = gaussian_filter(orig_coins, sigma=2)
-rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect",
-                         **rescale_params)
+rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect", **rescale_params)
 
 # Convert the image into a graph with the value of the gradient on the
 # edges.
@@ -69,21 +68,21 @@
 # %%
 # Visualize the resulting regions
 
-for assign_labels in ('kmeans', 'discretize'):
+for assign_labels in ("kmeans", "discretize"):
     t0 = time.time()
-    labels = spectral_clustering(graph, n_clusters=N_REGIONS,
-                                 assign_labels=assign_labels, random_state=42)
+    labels = spectral_clustering(
+        graph, n_clusters=N_REGIONS, assign_labels=assign_labels, random_state=42
+    )
     t1 = time.time()
     labels = labels.reshape(rescaled_coins.shape)
 
     plt.figure(figsize=(5, 5))
     plt.imshow(rescaled_coins, cmap=plt.cm.gray)
     for l in range(N_REGIONS):
-        plt.contour(labels == l,
-                    colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))])
+        plt.contour(labels == l, colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))])
     plt.xticks(())
     plt.yticks(())
-    title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0))
+    title = "Spectral clustering: %s, %.2fs" % (assign_labels, (t1 - t0))
     print(title)
     plt.title(title)
 plt.show()
diff --git a/examples/cluster/plot_coin_ward_segmentation.py b/examples/cluster/plot_coin_ward_segmentation.py
index 218e6fb4c8d86..b674700fb3726 100644
--- a/examples/cluster/plot_coin_ward_segmentation.py
+++ b/examples/cluster/plot_coin_ward_segmentation.py
@@ -30,8 +30,8 @@
 from sklearn.utils.fixes import parse_version
 
 # these were introduced in skimage-0.14
-if parse_version(skimage.__version__) >= parse_version('0.14'):
-    rescale_params = {'anti_aliasing': False, 'multichannel': False}
+if parse_version(skimage.__version__) >= parse_version("0.14"):
+    rescale_params = {"anti_aliasing": False, "multichannel": False}
 else:
     rescale_params = {}
 
@@ -43,8 +43,7 @@
 # Applying a Gaussian filter for smoothing prior to down-scaling
 # reduces aliasing artifacts.
 smoothened_coins = gaussian_filter(orig_coins, sigma=2)
-rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect",
-                         **rescale_params)
+rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect", **rescale_params)
 
 X = np.reshape(rescaled_coins, (-1, 1))
 
@@ -57,8 +56,9 @@
 print("Compute structured hierarchical clustering...")
 st = time.time()
 n_clusters = 27  # number of regions
-ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
-                               connectivity=connectivity)
+ward = AgglomerativeClustering(
+    n_clusters=n_clusters, linkage="ward", connectivity=connectivity
+)
 ward.fit(X)
 label = np.reshape(ward.labels_, rescaled_coins.shape)
 print("Elapsed time: ", time.time() - st)
@@ -70,8 +70,12 @@
 plt.figure(figsize=(5, 5))
 plt.imshow(rescaled_coins, cmap=plt.cm.gray)
 for l in range(n_clusters):
-    plt.contour(label == l,
-                colors=[plt.cm.nipy_spectral(l / float(n_clusters)), ])
+    plt.contour(
+        label == l,
+        colors=[
+            plt.cm.nipy_spectral(l / float(n_clusters)),
+        ],
+    )
 plt.xticks(())
 plt.yticks(())
 plt.show()
diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py
index 384e58f75e328..90b75a8cd1352 100644
--- a/examples/cluster/plot_color_quantization.py
+++ b/examples/cluster/plot_color_quantization.py
@@ -64,9 +64,7 @@
 codebook_random = shuffle(image_array, random_state=0, n_samples=n_colors)
 print("Predicting color indices on the full image (random)")
 t0 = time()
-labels_random = pairwise_distances_argmin(codebook_random,
-                                          image_array,
-                                          axis=0)
+labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)
 print(f"done in {time() - t0:0.3f}s.")
 
 
@@ -78,19 +76,19 @@ def recreate_image(codebook, labels, w, h):
 # Display all results, alongside original image
 plt.figure(1)
 plt.clf()
-plt.axis('off')
-plt.title('Original image (96,615 colors)')
+plt.axis("off")
+plt.title("Original image (96,615 colors)")
 plt.imshow(china)
 
 plt.figure(2)
 plt.clf()
-plt.axis('off')
-plt.title(f'Quantized image ({n_colors} colors, K-Means)')
+plt.axis("off")
+plt.title(f"Quantized image ({n_colors} colors, K-Means)")
 plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))
 
 plt.figure(3)
 plt.clf()
-plt.axis('off')
-plt.title(f'Quantized image ({n_colors} colors, Random)')
+plt.axis("off")
+plt.title(f"Quantized image ({n_colors} colors, Random)")
 plt.imshow(recreate_image(codebook_random, labels_random, w, h))
 plt.show()
diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py
index d7cfc3ec524b3..29d5f1b768210 100644
--- a/examples/cluster/plot_dbscan.py
+++ b/examples/cluster/plot_dbscan.py
@@ -20,8 +20,9 @@
 # #############################################################################
 # Generate sample data
 centers = [[1, 1], [-1, -1], [1, -1]]
-X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
-                            random_state=0)
+X, labels_true = make_blobs(
+    n_samples=750, centers=centers, cluster_std=0.4, random_state=0
+)
 
 X = StandardScaler().fit_transform(X)
 
@@ -36,17 +37,17 @@
 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
 n_noise_ = list(labels).count(-1)
 
-print('Estimated number of clusters: %d' % n_clusters_)
-print('Estimated number of noise points: %d' % n_noise_)
+print("Estimated number of clusters: %d" % n_clusters_)
+print("Estimated number of noise points: %d" % n_noise_)
 print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
 print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
 print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
-print("Adjusted Rand Index: %0.3f"
-      % metrics.adjusted_rand_score(labels_true, labels))
-print("Adjusted Mutual Information: %0.3f"
-      % metrics.adjusted_mutual_info_score(labels_true, labels))
-print("Silhouette Coefficient: %0.3f"
-      % metrics.silhouette_score(X, labels))
+print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
+print(
+    "Adjusted Mutual Information: %0.3f"
+    % metrics.adjusted_mutual_info_score(labels_true, labels)
+)
+print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
 
 # #############################################################################
 # Plot result
@@ -54,22 +55,33 @@
 
 # Black removed and is used for noise instead.
 unique_labels = set(labels)
-colors = [plt.cm.Spectral(each)
-          for each in np.linspace(0, 1, len(unique_labels))]
+colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
 for k, col in zip(unique_labels, colors):
     if k == -1:
         # Black used for noise.
         col = [0, 0, 0, 1]
 
-    class_member_mask = (labels == k)
+    class_member_mask = labels == k
 
     xy = X[class_member_mask & core_samples_mask]
-    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-             markeredgecolor='k', markersize=14)
+    plt.plot(
+        xy[:, 0],
+        xy[:, 1],
+        "o",
+        markerfacecolor=tuple(col),
+        markeredgecolor="k",
+        markersize=14,
+    )
 
     xy = X[class_member_mask & ~core_samples_mask]
-    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-             markeredgecolor='k', markersize=6)
-
-plt.title('Estimated number of clusters: %d' % n_clusters_)
+    plt.plot(
+        xy[:, 0],
+        xy[:, 1],
+        "o",
+        markerfacecolor=tuple(col),
+        markeredgecolor="k",
+        markersize=6,
+    )
+
+plt.title("Estimated number of clusters: %d" % n_clusters_)
 plt.show()
diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py
index ac144e7213cc5..242774550d063 100644
--- a/examples/cluster/plot_dict_face_patches.py
+++ b/examples/cluster/plot_dict_face_patches.py
@@ -36,7 +36,7 @@
 # #############################################################################
 # Learn the dictionary of images
 
-print('Learning the dictionary... ')
+print("Learning the dictionary... ")
 rng = np.random.RandomState(0)
 kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True)
 patch_size = (20, 20)
@@ -48,8 +48,7 @@
 index = 0
 for _ in range(6):
     for img in faces.images:
-        data = extract_patches_2d(img, patch_size, max_patches=50,
-                                  random_state=rng)
+        data = extract_patches_2d(img, patch_size, max_patches=50, random_state=rng)
         data = np.reshape(data, (len(data), -1))
         buffer.append(data)
         index += 1
@@ -60,25 +59,25 @@
             kmeans.partial_fit(data)
             buffer = []
         if index % 100 == 0:
-            print('Partial fit of %4i out of %i'
-                  % (index, 6 * len(faces.images)))
+            print("Partial fit of %4i out of %i" % (index, 6 * len(faces.images)))
 
 dt = time.time() - t0
-print('done in %.2fs.' % dt)
+print("done in %.2fs." % dt)
 
 # #############################################################################
 # Plot the results
 plt.figure(figsize=(4.2, 4))
 for i, patch in enumerate(kmeans.cluster_centers_):
     plt.subplot(9, 9, i + 1)
-    plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray,
-               interpolation='nearest')
+    plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray, interpolation="nearest")
     plt.xticks(())
     plt.yticks(())
 
 
-plt.suptitle('Patches of faces\nTrain time %.1fs on %d patches' %
-             (dt, 8 * len(faces.images)), fontsize=16)
+plt.suptitle(
+    "Patches of faces\nTrain time %.1fs on %d patches" % (dt, 8 * len(faces.images)),
+    fontsize=16,
+)
 plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
 
 plt.show()
diff --git a/examples/cluster/plot_digits_agglomeration.py b/examples/cluster/plot_digits_agglomeration.py
index ce8bd8daf3bf0..c6590c0e24771 100644
--- a/examples/cluster/plot_digits_agglomeration.py
+++ b/examples/cluster/plot_digits_agglomeration.py
@@ -26,8 +26,7 @@
 X = np.reshape(images, (len(images), -1))
 connectivity = grid_to_graph(*images[0].shape)
 
-agglo = cluster.FeatureAgglomeration(connectivity=connectivity,
-                                     n_clusters=32)
+agglo = cluster.FeatureAgglomeration(connectivity=connectivity, n_clusters=32)
 
 agglo.fit(X)
 X_reduced = agglo.transform(X)
@@ -36,26 +35,28 @@
 images_restored = np.reshape(X_restored, images.shape)
 plt.figure(1, figsize=(4, 3.5))
 plt.clf()
-plt.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91)
+plt.subplots_adjust(left=0.01, right=0.99, bottom=0.01, top=0.91)
 for i in range(4):
     plt.subplot(3, 4, i + 1)
-    plt.imshow(images[i], cmap=plt.cm.gray, vmax=16, interpolation='nearest')
+    plt.imshow(images[i], cmap=plt.cm.gray, vmax=16, interpolation="nearest")
     plt.xticks(())
     plt.yticks(())
     if i == 1:
-        plt.title('Original data')
+        plt.title("Original data")
     plt.subplot(3, 4, 4 + i + 1)
-    plt.imshow(images_restored[i], cmap=plt.cm.gray, vmax=16,
-               interpolation='nearest')
+    plt.imshow(images_restored[i], cmap=plt.cm.gray, vmax=16, interpolation="nearest")
     if i == 1:
-        plt.title('Agglomerated data')
+        plt.title("Agglomerated data")
     plt.xticks(())
     plt.yticks(())
 
 plt.subplot(3, 4, 10)
-plt.imshow(np.reshape(agglo.labels_, images[0].shape),
-           interpolation='nearest', cmap=plt.cm.nipy_spectral)
+plt.imshow(
+    np.reshape(agglo.labels_, images[0].shape),
+    interpolation="nearest",
+    cmap=plt.cm.nipy_spectral,
+)
 plt.xticks(())
 plt.yticks(())
-plt.title('Labels')
+plt.title("Labels")
 plt.show()
diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py
index 44cd821b58e19..c5d78e362fb38 100644
--- a/examples/cluster/plot_digits_linkage.py
+++ b/examples/cluster/plot_digits_linkage.py
@@ -41,9 +41,9 @@ def nudge_images(X, y):
     # methods, but we multiply the size of the dataset only by 2, as the
     # cost of the hierarchical clustering methods are strongly
     # super-linear in n_samples
-    shift = lambda x: ndimage.shift(x.reshape((8, 8)),
-                                    .3 * np.random.normal(size=2),
-                                    mode='constant').ravel()
+    shift = lambda x: ndimage.shift(
+        x.reshape((8, 8)), 0.3 * np.random.normal(size=2), mode="constant"
+    ).ravel()
     X = np.concatenate([X, np.apply_along_axis(shift, 1, X)])
     Y = np.concatenate([y, y], axis=0)
     return X, Y
@@ -60,15 +60,19 @@ def plot_clustering(X_red, labels, title=None):
 
     plt.figure(figsize=(6, 4))
     for i in range(X_red.shape[0]):
-        plt.text(X_red[i, 0], X_red[i, 1], str(y[i]),
-                 color=plt.cm.nipy_spectral(labels[i] / 10.),
-                 fontdict={'weight': 'bold', 'size': 9})
+        plt.text(
+            X_red[i, 0],
+            X_red[i, 1],
+            str(y[i]),
+            color=plt.cm.nipy_spectral(labels[i] / 10.0),
+            fontdict={"weight": "bold", "size": 9},
+        )
 
     plt.xticks([])
     plt.yticks([])
     if title is not None:
         plt.title(title, size=17)
-    plt.axis('off')
+    plt.axis("off")
     plt.tight_layout(rect=[0, 0.03, 1, 0.95])
 
 
@@ -80,7 +84,7 @@ def plot_clustering(X_red, labels, title=None):
 
 from sklearn.cluster import AgglomerativeClustering
 
-for linkage in ('ward', 'average', 'complete', 'single'):
+for linkage in ("ward", "average", "complete", "single"):
     clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
     t0 = time()
     clustering.fit(X_red)
diff --git a/examples/cluster/plot_face_compress.py b/examples/cluster/plot_face_compress.py
index 4eed00d623f9b..8b2e329ed257d 100644
--- a/examples/cluster/plot_face_compress.py
+++ b/examples/cluster/plot_face_compress.py
@@ -27,6 +27,7 @@
 
 try:  # SciPy >= 0.16 have face in misc
     from scipy.misc import face
+
     face = face(gray=True)
 except ImportError:
     face = sp.face(gray=True)
@@ -58,7 +59,7 @@
 # equal bins face
 regular_values = np.linspace(0, 256, n_clusters + 1)
 regular_labels = np.searchsorted(regular_values, face) - 1
-regular_values = .5 * (regular_values[1:] + regular_values[:-1])  # mean
+regular_values = 0.5 * (regular_values[1:] + regular_values[:-1])  # mean
 regular_face = np.choose(regular_labels.ravel(), regular_values, mode="clip")
 regular_face.shape = face.shape
 plt.figure(3, figsize=(3, 2.2))
@@ -67,15 +68,15 @@
 # histogram
 plt.figure(4, figsize=(3, 2.2))
 plt.clf()
-plt.axes([.01, .01, .98, .98])
-plt.hist(X, bins=256, color='.5', edgecolor='.5')
+plt.axes([0.01, 0.01, 0.98, 0.98])
+plt.hist(X, bins=256, color=".5", edgecolor=".5")
 plt.yticks(())
 plt.xticks(regular_values)
 values = np.sort(values)
 for center_1, center_2 in zip(values[:-1], values[1:]):
-    plt.axvline(.5 * (center_1 + center_2), color='b')
+    plt.axvline(0.5 * (center_1 + center_2), color="b")
 
 for center_1, center_2 in zip(regular_values[:-1], regular_values[1:]):
-    plt.axvline(.5 * (center_1 + center_2), color='b', linestyle='--')
+    plt.axvline(0.5 * (center_1 + center_2), color="b", linestyle="--")
 
 plt.show()
diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
index e148647446613..afb31751ce7a5 100644
--- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
+++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
@@ -39,13 +39,13 @@
 n_samples = 200
 size = 40  # image size
 roi_size = 15
-snr = 5.
+snr = 5.0
 np.random.seed(0)
 mask = np.ones([size, size], dtype=bool)
 
 coef = np.zeros((size, size))
-coef[0:roi_size, 0:roi_size] = -1.
-coef[-roi_size:, -roi_size:] = 1.
+coef[0:roi_size, 0:roi_size] = -1.0
+coef[-roi_size:, -roi_size:] = 1.0
 
 X = np.random.randn(n_samples, size ** 2)
 for x in X:  # smooth data
@@ -55,7 +55,7 @@
 
 y = np.dot(X, coef.ravel())
 noise = np.random.randn(y.shape[0])
-noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
+noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2)
 y += noise_coef * noise  # add noise
 
 # #############################################################################
@@ -67,11 +67,10 @@
 
 # Ward agglomeration followed by BayesianRidge
 connectivity = grid_to_graph(n_x=size, n_y=size)
-ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
-                            memory=mem)
-clf = Pipeline([('ward', ward), ('ridge', ridge)])
+ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)
+clf = Pipeline([("ward", ward), ("ridge", ridge)])
 # Select the optimal number of parcels with grid search
-clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
+clf = GridSearchCV(clf, {"ward__n_clusters": [10, 20, 30]}, n_jobs=1, cv=cv)
 clf.fit(X, y)  # set the best parameters
 coef_ = clf.best_estimator_.steps[-1][1].coef_
 coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
@@ -80,9 +79,9 @@
 # Anova univariate feature selection followed by BayesianRidge
 f_regression = mem.cache(feature_selection.f_regression)  # caching function
 anova = feature_selection.SelectPercentile(f_regression)
-clf = Pipeline([('anova', anova), ('ridge', ridge)])
+clf = Pipeline([("anova", anova), ("ridge", ridge)])
 # Select the optimal percentage of features with grid search
-clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
+clf = GridSearchCV(clf, {"anova__percentile": [5, 10, 20]}, cv=cv)
 clf.fit(X, y)  # set the best parameters
 coef_ = clf.best_estimator_.steps[-1][1].coef_
 coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
@@ -90,7 +89,7 @@
 
 # #############################################################################
 # Inverse the transformation to plot the results on an image
-plt.close('all')
+plt.close("all")
 plt.figure(figsize=(7.3, 2.7))
 plt.subplot(1, 3, 1)
 plt.imshow(coef, interpolation="nearest", cmap=plt.cm.RdBu_r)
diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py
index 2ff04d523855a..4360f89faee3c 100644
--- a/examples/cluster/plot_kmeans_assumptions.py
+++ b/examples/cluster/plot_kmeans_assumptions.py
@@ -43,9 +43,9 @@
 plt.title("Anisotropicly Distributed Blobs")
 
 # Different variance
-X_varied, y_varied = make_blobs(n_samples=n_samples,
-                                cluster_std=[1.0, 2.5, 0.5],
-                                random_state=random_state)
+X_varied, y_varied = make_blobs(
+    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
+)
 y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
 
 plt.subplot(223)
@@ -54,8 +54,7 @@
 
 # Unevenly sized blobs
 X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
-y_pred = KMeans(n_clusters=3,
-                random_state=random_state).fit_predict(X_filtered)
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered)
 
 plt.subplot(224)
 plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py
index 9fc0abebc4464..8190c6e2792e0 100644
--- a/examples/cluster/plot_kmeans_digits.py
+++ b/examples/cluster/plot_kmeans_digits.py
@@ -39,9 +39,7 @@
 data, labels = load_digits(return_X_y=True)
 (n_samples, n_features), n_digits = data.shape, np.unique(labels).size
 
-print(
-    f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
-)
+print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")
 
 # %%
 # Define our evaluation benchmark
@@ -95,13 +93,18 @@ def bench_k_means(kmeans, name, data, labels):
 
     # The silhouette score requires the full dataset
     results += [
-        metrics.silhouette_score(data, estimator[-1].labels_,
-                                 metric="euclidean", sample_size=300,)
+        metrics.silhouette_score(
+            data,
+            estimator[-1].labels_,
+            metric="euclidean",
+            sample_size=300,
+        )
     ]
 
     # Show the results
-    formatter_result = ("{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}"
-                        "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}")
+    formatter_result = (
+        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
+    )
     print(formatter_result.format(*results))
 
 
@@ -122,11 +125,10 @@ def bench_k_means(kmeans, name, data, labels):
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 
-print(82 * '_')
-print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
+print(82 * "_")
+print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette")
 
-kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4,
-                random_state=0)
+kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0)
 bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)
 
 kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
@@ -136,7 +138,7 @@ def bench_k_means(kmeans, name, data, labels):
 kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
 bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)
 
-print(82 * '_')
+print(82 * "_")
 
 # %%
 # Visualize the results on PCA-reduced data
@@ -153,7 +155,7 @@ def bench_k_means(kmeans, name, data, labels):
 kmeans.fit(reduced_data)
 
 # Step size of the mesh. Decrease to increase the quality of the VQ.
-h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
+h = 0.02  # point in the mesh [x_min, x_max]x[y_min, y_max].
 
 # Plot the decision boundary. For that, we will assign a color to each
 x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
@@ -167,17 +169,31 @@ def bench_k_means(kmeans, name, data, labels):
 Z = Z.reshape(xx.shape)
 plt.figure(1)
 plt.clf()
-plt.imshow(Z, interpolation="nearest",
-           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-           cmap=plt.cm.Paired, aspect="auto", origin="lower")
+plt.imshow(
+    Z,
+    interpolation="nearest",
+    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+    cmap=plt.cm.Paired,
+    aspect="auto",
+    origin="lower",
+)
 
-plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
+plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
 # Plot the centroids as a white X
 centroids = kmeans.cluster_centers_
-plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
-            color="w", zorder=10)
-plt.title("K-means clustering on the digits dataset (PCA-reduced data)\n"
-          "Centroids are marked with white cross")
+plt.scatter(
+    centroids[:, 0],
+    centroids[:, 1],
+    marker="x",
+    s=169,
+    linewidths=3,
+    color="w",
+    zorder=10,
+)
+plt.title(
+    "K-means clustering on the digits dataset (PCA-reduced data)\n"
+    "Centroids are marked with white cross"
+)
 plt.xlim(x_min, x_max)
 plt.ylim(y_min, y_max)
 plt.xticks(())
diff --git a/examples/cluster/plot_kmeans_plusplus.py b/examples/cluster/plot_kmeans_plusplus.py
index d9821db2d452e..8afbb62dfdda4 100644
--- a/examples/cluster/plot_kmeans_plusplus.py
+++ b/examples/cluster/plot_kmeans_plusplus.py
@@ -19,26 +19,23 @@
 n_samples = 4000
 n_components = 4
 
-X, y_true = make_blobs(n_samples=n_samples,
-                       centers=n_components,
-                       cluster_std=0.60,
-                       random_state=0)
+X, y_true = make_blobs(
+    n_samples=n_samples, centers=n_components, cluster_std=0.60, random_state=0
+)
 X = X[:, ::-1]
 
 # Calculate seeds from kmeans++
-centers_init, indices = kmeans_plusplus(X, n_clusters=4,
-                                        random_state=0)
+centers_init, indices = kmeans_plusplus(X, n_clusters=4, random_state=0)
 
 # Plot init seeds along side sample data
 plt.figure(1)
-colors = ['#4EACC5', '#FF9C34', '#4E9A06', 'm']
+colors = ["#4EACC5", "#FF9C34", "#4E9A06", "m"]
 
 for k, col in enumerate(colors):
     cluster_data = y_true == k
-    plt.scatter(X[cluster_data, 0], X[cluster_data, 1],
-                c=col, marker='.', s=10)
+    plt.scatter(X[cluster_data, 0], X[cluster_data, 1], c=col, marker=".", s=10)
 
-plt.scatter(centers_init[:, 0], centers_init[:, 1], c='b', s=50)
+plt.scatter(centers_init[:, 0], centers_init[:, 1], c="b", s=50)
 plt.title("K-Means++ Initialization")
 plt.xticks([])
 plt.yticks([])
diff --git a/examples/cluster/plot_kmeans_silhouette_analysis.py b/examples/cluster/plot_kmeans_silhouette_analysis.py
index 8cd65cfb6a865..63efe0c406fd8 100644
--- a/examples/cluster/plot_kmeans_silhouette_analysis.py
+++ b/examples/cluster/plot_kmeans_silhouette_analysis.py
@@ -42,13 +42,15 @@
 # Generating the sample data from make_blobs
 # This particular setting has one distinct cluster and 3 clusters placed close
 # together.
-X, y = make_blobs(n_samples=500,
-                  n_features=2,
-                  centers=4,
-                  cluster_std=1,
-                  center_box=(-10.0, 10.0),
-                  shuffle=True,
-                  random_state=1)  # For reproducibility
+X, y = make_blobs(
+    n_samples=500,
+    n_features=2,
+    centers=4,
+    cluster_std=1,
+    center_box=(-10.0, 10.0),
+    shuffle=True,
+    random_state=1,
+)  # For reproducibility
 
 range_n_clusters = [2, 3, 4, 5, 6]
 
@@ -74,8 +76,12 @@
     # This gives a perspective into the density and separation of the formed
     # clusters
     silhouette_avg = silhouette_score(X, cluster_labels)
-    print("For n_clusters =", n_clusters,
-          "The average silhouette_score is :", silhouette_avg)
+    print(
+        "For n_clusters =",
+        n_clusters,
+        "The average silhouette_score is :",
+        silhouette_avg,
+    )
 
     # Compute the silhouette scores for each sample
     sample_silhouette_values = silhouette_samples(X, cluster_labels)
@@ -84,8 +90,7 @@
     for i in range(n_clusters):
         # Aggregate the silhouette scores for samples belonging to
         # cluster i, and sort them
-        ith_cluster_silhouette_values = \
-            sample_silhouette_values[cluster_labels == i]
+        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
 
         ith_cluster_silhouette_values.sort()
 
@@ -93,9 +98,14 @@
         y_upper = y_lower + size_cluster_i
 
         color = cm.nipy_spectral(float(i) / n_clusters)
-        ax1.fill_betweenx(np.arange(y_lower, y_upper),
-                          0, ith_cluster_silhouette_values,
-                          facecolor=color, edgecolor=color, alpha=0.7)
+        ax1.fill_betweenx(
+            np.arange(y_lower, y_upper),
+            0,
+            ith_cluster_silhouette_values,
+            facecolor=color,
+            edgecolor=color,
+            alpha=0.7,
+        )
 
         # Label the silhouette plots with their cluster numbers at the middle
         ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
@@ -115,25 +125,35 @@
 
     # 2nd Plot showing the actual clusters formed
     colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
-    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
-                c=colors, edgecolor='k')
+    ax2.scatter(
+        X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
+    )
 
     # Labeling the clusters
     centers = clusterer.cluster_centers_
     # Draw white circles at cluster centers
-    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
-                c="white", alpha=1, s=200, edgecolor='k')
+    ax2.scatter(
+        centers[:, 0],
+        centers[:, 1],
+        marker="o",
+        c="white",
+        alpha=1,
+        s=200,
+        edgecolor="k",
+    )
 
     for i, c in enumerate(centers):
-        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
-                    s=50, edgecolor='k')
+        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
 
     ax2.set_title("The visualization of the clustered data.")
     ax2.set_xlabel("Feature space for the 1st feature")
     ax2.set_ylabel("Feature space for the 2nd feature")
 
-    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
-                  "with n_clusters = %d" % n_clusters),
-                 fontsize=14, fontweight='bold')
+    plt.suptitle(
+        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
+        % n_clusters,
+        fontsize=14,
+        fontweight="bold",
+    )
 
 plt.show()
diff --git a/examples/cluster/plot_kmeans_stability_low_dim_dense.py b/examples/cluster/plot_kmeans_stability_low_dim_dense.py
index a6771f6350135..dd9f32d01485d 100644
--- a/examples/cluster/plot_kmeans_stability_low_dim_dense.py
+++ b/examples/cluster/plot_kmeans_stability_low_dim_dense.py
@@ -54,19 +54,18 @@
 
 def make_data(random_state, n_samples_per_center, grid_size, scale):
     random_state = check_random_state(random_state)
-    centers = np.array([[i, j]
-                        for i in range(grid_size)
-                        for j in range(grid_size)])
+    centers = np.array([[i, j] for i in range(grid_size) for j in range(grid_size)])
     n_clusters_true, n_features = centers.shape
 
     noise = random_state.normal(
-        scale=scale, size=(n_samples_per_center, centers.shape[1]))
+        scale=scale, size=(n_samples_per_center, centers.shape[1])
+    )
 
     X = np.concatenate([c + noise for c in centers])
-    y = np.concatenate([[i] * n_samples_per_center
-                        for i in range(n_clusters_true)])
+    y = np.concatenate([[i] * n_samples_per_center for i in range(n_clusters_true)])
     return shuffle(X, y, random_state=random_state)
 
+
 # Part 1: Quantitative evaluation of various init methods
 
 
@@ -75,10 +74,10 @@ def make_data(random_state, n_samples_per_center, grid_size, scale):
 legends = []
 
 cases = [
-    (KMeans, 'k-means++', {}),
-    (KMeans, 'random', {}),
-    (MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}),
-    (MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}),
+    (KMeans, "k-means++", {}),
+    (KMeans, "random", {}),
+    (MiniBatchKMeans, "k-means++", {"max_no_improvement": 3}),
+    (MiniBatchKMeans, "random", {"max_no_improvement": 3, "init_size": 500}),
 ]
 
 for factory, init, params in cases:
@@ -88,33 +87,46 @@ def make_data(random_state, n_samples_per_center, grid_size, scale):
     for run_id in range(n_runs):
         X, y = make_data(run_id, n_samples_per_center, grid_size, scale)
         for i, n_init in enumerate(n_init_range):
-            km = factory(n_clusters=n_clusters, init=init, random_state=run_id,
-                         n_init=n_init, **params).fit(X)
+            km = factory(
+                n_clusters=n_clusters,
+                init=init,
+                random_state=run_id,
+                n_init=n_init,
+                **params,
+            ).fit(X)
             inertia[i, run_id] = km.inertia_
     p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))
     plots.append(p[0])
     legends.append("%s with %s init" % (factory.__name__, init))
 
-plt.xlabel('n_init')
-plt.ylabel('inertia')
+plt.xlabel("n_init")
+plt.ylabel("inertia")
 plt.legend(plots, legends)
 plt.title("Mean inertia for various k-means init across %d runs" % n_runs)
 
 # Part 2: Qualitative visual inspection of the convergence
 
 X, y = make_data(random_state, n_samples_per_center, grid_size, scale)
-km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,
-                     random_state=random_state).fit(X)
+km = MiniBatchKMeans(
+    n_clusters=n_clusters, init="random", n_init=1, random_state=random_state
+).fit(X)
 
 plt.figure()
 for k in range(n_clusters):
     my_members = km.labels_ == k
     color = cm.nipy_spectral(float(k) / n_clusters, 1)
-    plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)
+    plt.plot(X[my_members, 0], X[my_members, 1], "o", marker=".", c=color)
     cluster_center = km.cluster_centers_[k]
-    plt.plot(cluster_center[0], cluster_center[1], 'o',
-             markerfacecolor=color, markeredgecolor='k', markersize=6)
-    plt.title("Example cluster allocation with a single random init\n"
-              "with MiniBatchKMeans")
+    plt.plot(
+        cluster_center[0],
+        cluster_center[1],
+        "o",
+        markerfacecolor=color,
+        markeredgecolor="k",
+        markersize=6,
+    )
+    plt.title(
+        "Example cluster allocation with a single random init\nwith MiniBatchKMeans"
+    )
 
 plt.show()
diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py
index 390aa1fe889d2..0a4667855f2ec 100644
--- a/examples/cluster/plot_linkage_comparison.py
+++ b/examples/cluster/plot_linkage_comparison.py
@@ -40,9 +40,8 @@
 # of the algorithms, but not too big to avoid too long running times
 
 n_samples = 1500
-noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
-                                      noise=.05)
-noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
+noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
+noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
 blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
 no_structure = np.random.rand(n_samples, 2), None
 
@@ -54,30 +53,31 @@
 aniso = (X_aniso, y)
 
 # blobs with varied variances
-varied = datasets.make_blobs(n_samples=n_samples,
-                             cluster_std=[1.0, 2.5, 0.5],
-                             random_state=random_state)
+varied = datasets.make_blobs(
+    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
+)
 
 # %%
 # Run the clustering and plot
 
 # Set up cluster parameters
 plt.figure(figsize=(9 * 1.3 + 2, 14.5))
-plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
-                    hspace=.01)
+plt.subplots_adjust(
+    left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01
+)
 
 plot_num = 1
 
-default_base = {'n_neighbors': 10,
-                'n_clusters': 3}
+default_base = {"n_neighbors": 10, "n_clusters": 3}
 
 datasets = [
-    (noisy_circles, {'n_clusters': 2}),
-    (noisy_moons, {'n_clusters': 2}),
-    (varied, {'n_neighbors': 2}),
-    (aniso, {'n_neighbors': 2}),
+    (noisy_circles, {"n_clusters": 2}),
+    (noisy_moons, {"n_clusters": 2}),
+    (varied, {"n_neighbors": 2}),
+    (aniso, {"n_neighbors": 2}),
     (blobs, {}),
-    (no_structure, {})]
+    (no_structure, {}),
+]
 
 for i_dataset, (dataset, algo_params) in enumerate(datasets):
     # update parameters with dataset-specific values
@@ -93,19 +93,23 @@
     # Create cluster objects
     # ============
     ward = cluster.AgglomerativeClustering(
-        n_clusters=params['n_clusters'], linkage='ward')
+        n_clusters=params["n_clusters"], linkage="ward"
+    )
     complete = cluster.AgglomerativeClustering(
-        n_clusters=params['n_clusters'], linkage='complete')
+        n_clusters=params["n_clusters"], linkage="complete"
+    )
     average = cluster.AgglomerativeClustering(
-        n_clusters=params['n_clusters'], linkage='average')
+        n_clusters=params["n_clusters"], linkage="average"
+    )
     single = cluster.AgglomerativeClustering(
-        n_clusters=params['n_clusters'], linkage='single')
+        n_clusters=params["n_clusters"], linkage="single"
+    )
 
     clustering_algorithms = (
-        ('Single Linkage', single),
-        ('Average Linkage', average),
-        ('Complete Linkage', complete),
-        ('Ward Linkage', ward),
+        ("Single Linkage", single),
+        ("Average Linkage", average),
+        ("Complete Linkage", complete),
+        ("Ward Linkage", ward),
     )
 
     for name, algorithm in clustering_algorithms:
@@ -115,14 +119,15 @@
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore",
-                message="the number of connected components of the " +
-                "connectivity matrix is [0-9]{1,2}" +
-                " > 1. Completing it to avoid stopping the tree early.",
-                category=UserWarning)
+                message="the number of connected components of the "
+                + "connectivity matrix is [0-9]{1,2}"
+                + " > 1. Completing it to avoid stopping the tree early.",
+                category=UserWarning,
+            )
             algorithm.fit(X)
 
         t1 = time.time()
-        if hasattr(algorithm, 'labels_'):
+        if hasattr(algorithm, "labels_"):
             y_pred = algorithm.labels_.astype(int)
         else:
             y_pred = algorithm.predict(X)
@@ -131,19 +136,40 @@
         if i_dataset == 0:
             plt.title(name, size=18)
 
-        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
-                                             '#f781bf', '#a65628', '#984ea3',
-                                             '#999999', '#e41a1c', '#dede00']),
-                                      int(max(y_pred) + 1))))
+        colors = np.array(
+            list(
+                islice(
+                    cycle(
+                        [
+                            "#377eb8",
+                            "#ff7f00",
+                            "#4daf4a",
+                            "#f781bf",
+                            "#a65628",
+                            "#984ea3",
+                            "#999999",
+                            "#e41a1c",
+                            "#dede00",
+                        ]
+                    ),
+                    int(max(y_pred) + 1),
+                )
+            )
+        )
         plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
 
         plt.xlim(-2.5, 2.5)
         plt.ylim(-2.5, 2.5)
         plt.xticks(())
         plt.yticks(())
-        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
-                 transform=plt.gca().transAxes, size=15,
-                 horizontalalignment='right')
+        plt.text(
+            0.99,
+            0.01,
+            ("%.2fs" % (t1 - t0)).lstrip("0"),
+            transform=plt.gca().transAxes,
+            size=15,
+            horizontalalignment="right",
+        )
         plot_num += 1
 
 plt.show()
diff --git a/examples/cluster/plot_mean_shift.py b/examples/cluster/plot_mean_shift.py
index dce7b35503232..53e052ed71d36 100644
--- a/examples/cluster/plot_mean_shift.py
+++ b/examples/cluster/plot_mean_shift.py
@@ -45,12 +45,18 @@
 plt.figure(1)
 plt.clf()
 
-colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
+colors = cycle("bgrcmykbgrcmykbgrcmykbgrcmyk")
 for k, col in zip(range(n_clusters_), colors):
     my_members = labels == k
     cluster_center = cluster_centers[k]
-    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
-    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
-             markeredgecolor='k', markersize=14)
-plt.title('Estimated number of clusters: %d' % n_clusters_)
+    plt.plot(X[my_members, 0], X[my_members, 1], col + ".")
+    plt.plot(
+        cluster_center[0],
+        cluster_center[1],
+        "o",
+        markerfacecolor=col,
+        markeredgecolor="k",
+        markersize=14,
+    )
+plt.title("Estimated number of clusters: %d" % n_clusters_)
 plt.show()
diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py
index e8f78556f80d2..99b5311c897d0 100644
--- a/examples/cluster/plot_mini_batch_kmeans.py
+++ b/examples/cluster/plot_mini_batch_kmeans.py
@@ -35,7 +35,7 @@
 # #############################################################################
 # Compute clustering with Means
 
-k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
+k_means = KMeans(init="k-means++", n_clusters=3, n_init=10)
 t0 = time.time()
 k_means.fit(X)
 t_batch = time.time() - t0
@@ -43,8 +43,14 @@
 # #############################################################################
 # Compute clustering with MiniBatchKMeans
 
-mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
-                      n_init=10, max_no_improvement=10, verbose=0)
+mbk = MiniBatchKMeans(
+    init="k-means++",
+    n_clusters=3,
+    batch_size=batch_size,
+    n_init=10,
+    max_no_improvement=10,
+    verbose=0,
+)
 t0 = time.time()
 mbk.fit(X)
 t_mini_batch = time.time() - t0
@@ -54,14 +60,13 @@
 
 fig = plt.figure(figsize=(8, 3))
 fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
-colors = ['#4EACC5', '#FF9C34', '#4E9A06']
+colors = ["#4EACC5", "#FF9C34", "#4E9A06"]
 
 # We want to have the same colors for the same cluster from the
 # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
 # closest one.
 k_means_cluster_centers = k_means.cluster_centers_
-order = pairwise_distances_argmin(k_means.cluster_centers_,
-                                  mbk.cluster_centers_)
+order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)
 mbk_means_cluster_centers = mbk.cluster_centers_[order]
 
 k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
@@ -72,44 +77,50 @@
 for k, col in zip(range(n_clusters), colors):
     my_members = k_means_labels == k
     cluster_center = k_means_cluster_centers[k]
-    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
-            markerfacecolor=col, marker='.')
-    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
-            markeredgecolor='k', markersize=6)
-ax.set_title('KMeans')
+    ax.plot(X[my_members, 0], X[my_members, 1], "w", markerfacecolor=col, marker=".")
+    ax.plot(
+        cluster_center[0],
+        cluster_center[1],
+        "o",
+        markerfacecolor=col,
+        markeredgecolor="k",
+        markersize=6,
+    )
+ax.set_title("KMeans")
 ax.set_xticks(())
 ax.set_yticks(())
-plt.text(-3.5, 1.8,  'train time: %.2fs\ninertia: %f' % (
-    t_batch, k_means.inertia_))
+plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_batch, k_means.inertia_))
 
 # MiniBatchKMeans
 ax = fig.add_subplot(1, 3, 2)
 for k, col in zip(range(n_clusters), colors):
     my_members = mbk_means_labels == k
     cluster_center = mbk_means_cluster_centers[k]
-    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
-            markerfacecolor=col, marker='.')
-    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
-            markeredgecolor='k', markersize=6)
-ax.set_title('MiniBatchKMeans')
+    ax.plot(X[my_members, 0], X[my_members, 1], "w", markerfacecolor=col, marker=".")
+    ax.plot(
+        cluster_center[0],
+        cluster_center[1],
+        "o",
+        markerfacecolor=col,
+        markeredgecolor="k",
+        markersize=6,
+    )
+ax.set_title("MiniBatchKMeans")
 ax.set_xticks(())
 ax.set_yticks(())
-plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %
-         (t_mini_batch, mbk.inertia_))
+plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_))
 
 # Initialise the different array to all False
-different = (mbk_means_labels == 4)
+different = mbk_means_labels == 4
 ax = fig.add_subplot(1, 3, 3)
 
 for k in range(n_clusters):
-    different += ((k_means_labels == k) != (mbk_means_labels == k))
+    different += (k_means_labels == k) != (mbk_means_labels == k)
 
 identic = np.logical_not(different)
-ax.plot(X[identic, 0], X[identic, 1], 'w',
-        markerfacecolor='#bbbbbb', marker='.')
-ax.plot(X[different, 0], X[different, 1], 'w',
-        markerfacecolor='m', marker='.')
-ax.set_title('Difference')
+ax.plot(X[identic, 0], X[identic, 1], "w", markerfacecolor="#bbbbbb", marker=".")
+ax.plot(X[different, 0], X[different, 1], "w", markerfacecolor="m", marker=".")
+ax.set_title("Difference")
 ax.set_xticks(())
 ax.set_yticks(())
 
diff --git a/examples/cluster/plot_optics.py b/examples/cluster/plot_optics.py
index 211fb84aede30..4ca81be91de1b 100644
--- a/examples/cluster/plot_optics.py
+++ b/examples/cluster/plot_optics.py
@@ -30,25 +30,31 @@
 np.random.seed(0)
 n_points_per_cluster = 250
 
-C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
-C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
-C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
-C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
+C1 = [-5, -2] + 0.8 * np.random.randn(n_points_per_cluster, 2)
+C2 = [4, -1] + 0.1 * np.random.randn(n_points_per_cluster, 2)
+C3 = [1, -2] + 0.2 * np.random.randn(n_points_per_cluster, 2)
+C4 = [-2, 3] + 0.3 * np.random.randn(n_points_per_cluster, 2)
 C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
 C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
 X = np.vstack((C1, C2, C3, C4, C5, C6))
 
-clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
+clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05)
 
 # Run the fit
 clust.fit(X)
 
-labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
-                                   core_distances=clust.core_distances_,
-                                   ordering=clust.ordering_, eps=0.5)
-labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
-                                   core_distances=clust.core_distances_,
-                                   ordering=clust.ordering_, eps=2)
+labels_050 = cluster_optics_dbscan(
+    reachability=clust.reachability_,
+    core_distances=clust.core_distances_,
+    ordering=clust.ordering_,
+    eps=0.5,
+)
+labels_200 = cluster_optics_dbscan(
+    reachability=clust.reachability_,
+    core_distances=clust.core_distances_,
+    ordering=clust.ordering_,
+    eps=2,
+)
 
 space = np.arange(len(X))
 reachability = clust.reachability_[clust.ordering_]
@@ -62,40 +68,40 @@
 ax4 = plt.subplot(G[1, 2])
 
 # Reachability plot
-colors = ['g.', 'r.', 'b.', 'y.', 'c.']
+colors = ["g.", "r.", "b.", "y.", "c."]
 for klass, color in zip(range(0, 5), colors):
     Xk = space[labels == klass]
     Rk = reachability[labels == klass]
     ax1.plot(Xk, Rk, color, alpha=0.3)
-ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3)
-ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
-ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
-ax1.set_ylabel('Reachability (epsilon distance)')
-ax1.set_title('Reachability Plot')
+ax1.plot(space[labels == -1], reachability[labels == -1], "k.", alpha=0.3)
+ax1.plot(space, np.full_like(space, 2.0, dtype=float), "k-", alpha=0.5)
+ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5)
+ax1.set_ylabel("Reachability (epsilon distance)")
+ax1.set_title("Reachability Plot")
 
 # OPTICS
-colors = ['g.', 'r.', 'b.', 'y.', 'c.']
+colors = ["g.", "r.", "b.", "y.", "c."]
 for klass, color in zip(range(0, 5), colors):
     Xk = X[clust.labels_ == klass]
     ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
-ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k+', alpha=0.1)
-ax2.set_title('Automatic Clustering\nOPTICS')
+ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], "k+", alpha=0.1)
+ax2.set_title("Automatic Clustering\nOPTICS")
 
 # DBSCAN at 0.5
-colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c']
+colors = ["g", "greenyellow", "olive", "r", "b", "c"]
 for klass, color in zip(range(0, 6), colors):
     Xk = X[labels_050 == klass]
-    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker='.')
-ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1)
-ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')
+    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker=".")
+ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], "k+", alpha=0.1)
+ax3.set_title("Clustering at 0.5 epsilon cut\nDBSCAN")
 
 # DBSCAN at 2.
-colors = ['g.', 'm.', 'y.', 'c.']
+colors = ["g.", "m.", "y.", "c."]
 for klass, color in zip(range(0, 4), colors):
     Xk = X[labels_200 == klass]
     ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
-ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1)
-ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')
+ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], "k+", alpha=0.1)
+ax4.set_title("Clustering at 2.0 epsilon cut\nDBSCAN")
 
 plt.tight_layout()
 plt.show()
diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py
index a6980c5f271ef..f4709358b63a4 100644
--- a/examples/cluster/plot_segmentation_toy.py
+++ b/examples/cluster/plot_segmentation_toy.py
@@ -73,8 +73,8 @@
 
 # Force the solver to be arpack, since amg is numerically
 # unstable on this example
-labels = spectral_clustering(graph, n_clusters=4, eigen_solver='arpack')
-label_im = np.full(mask.shape, -1.)
+labels = spectral_clustering(graph, n_clusters=4, eigen_solver="arpack")
+label_im = np.full(mask.shape, -1.0)
 label_im[mask] = labels
 
 plt.matshow(img)
@@ -91,8 +91,8 @@
 graph = image.img_to_graph(img, mask=mask)
 graph.data = np.exp(-graph.data / graph.data.std())
 
-labels = spectral_clustering(graph, n_clusters=2, eigen_solver='arpack')
-label_im = np.full(mask.shape, -1.)
+labels = spectral_clustering(graph, n_clusters=2, eigen_solver="arpack")
+label_im = np.full(mask.shape, -1.0)
 label_im[mask] = labels
 
 plt.matshow(img)
diff --git a/examples/cluster/plot_ward_structured_vs_unstructured.py b/examples/cluster/plot_ward_structured_vs_unstructured.py
index 671bdd2735280..75a2aecb6fd3b 100644
--- a/examples/cluster/plot_ward_structured_vs_unstructured.py
+++ b/examples/cluster/plot_ward_structured_vs_unstructured.py
@@ -39,13 +39,13 @@
 noise = 0.05
 X, _ = make_swiss_roll(n_samples, noise=noise)
 # Make it thinner
-X[:, 1] *= .5
+X[:, 1] *= 0.5
 
 # #############################################################################
 # Compute clustering
 print("Compute unstructured hierarchical clustering...")
 st = time.time()
-ward = AgglomerativeClustering(n_clusters=6, linkage='ward').fit(X)
+ward = AgglomerativeClustering(n_clusters=6, linkage="ward").fit(X)
 elapsed_time = time.time() - st
 label = ward.labels_
 print("Elapsed time: %.2fs" % elapsed_time)
@@ -57,23 +57,30 @@
 ax = p3.Axes3D(fig)
 ax.view_init(7, -80)
 for l in np.unique(label):
-    ax.scatter(X[label == l, 0], X[label == l, 1], X[label == l, 2],
-               color=plt.cm.jet(float(l) / np.max(label + 1)),
-               s=20, edgecolor='k')
-plt.title('Without connectivity constraints (time %.2fs)' % elapsed_time)
+    ax.scatter(
+        X[label == l, 0],
+        X[label == l, 1],
+        X[label == l, 2],
+        color=plt.cm.jet(float(l) / np.max(label + 1)),
+        s=20,
+        edgecolor="k",
+    )
+plt.title("Without connectivity constraints (time %.2fs)" % elapsed_time)
 
 
 # #############################################################################
 # Define the structure A of the data. Here a 10 nearest neighbors
 from sklearn.neighbors import kneighbors_graph
+
 connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
 
 # #############################################################################
 # Compute clustering
 print("Compute structured hierarchical clustering...")
 st = time.time()
-ward = AgglomerativeClustering(n_clusters=6, connectivity=connectivity,
-                               linkage='ward').fit(X)
+ward = AgglomerativeClustering(
+    n_clusters=6, connectivity=connectivity, linkage="ward"
+).fit(X)
 elapsed_time = time.time() - st
 label = ward.labels_
 print("Elapsed time: %.2fs" % elapsed_time)
@@ -85,9 +92,14 @@
 ax = p3.Axes3D(fig)
 ax.view_init(7, -80)
 for l in np.unique(label):
-    ax.scatter(X[label == l, 0], X[label == l, 1], X[label == l, 2],
-               color=plt.cm.jet(float(l) / np.max(label + 1)),
-               s=20, edgecolor='k')
-plt.title('With connectivity constraints (time %.2fs)' % elapsed_time)
+    ax.scatter(
+        X[label == l, 0],
+        X[label == l, 1],
+        X[label == l, 2],
+        color=plt.cm.jet(float(l) / np.max(label + 1)),
+        s=20,
+        edgecolor="k",
+    )
+plt.title("With connectivity constraints (time %.2fs)" % elapsed_time)
 
 plt.show()
diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py
index f6144ed2c491b..81fd448b56d18 100644
--- a/examples/compose/plot_column_transformer.py
+++ b/examples/compose/plot_column_transformer.py
@@ -43,17 +43,21 @@
 # a specific date. We will only use posts from 2 categories to speed up running
 # time.
 
-categories = ['sci.med', 'sci.space']
-X_train, y_train = fetch_20newsgroups(random_state=1,
-                                      subset='train',
-                                      categories=categories,
-                                      remove=('footers', 'quotes'),
-                                      return_X_y=True)
-X_test, y_test = fetch_20newsgroups(random_state=1,
-                                    subset='test',
-                                    categories=categories,
-                                    remove=('footers', 'quotes'),
-                                    return_X_y=True)
+categories = ["sci.med", "sci.space"]
+X_train, y_train = fetch_20newsgroups(
+    random_state=1,
+    subset="train",
+    categories=categories,
+    remove=("footers", "quotes"),
+    return_X_y=True,
+)
+X_test, y_test = fetch_20newsgroups(
+    random_state=1,
+    subset="test",
+    categories=categories,
+    remove=("footers", "quotes"),
+    return_X_y=True,
+)
 
 ##############################################################################
 # Each feature comprises meta information about that post, such as the subject,
@@ -79,16 +83,16 @@ def subject_body_extractor(posts):
     features = np.empty(shape=(len(posts), 2), dtype=object)
     for i, text in enumerate(posts):
         # temporary variable `_` stores '\n\n'
-        headers, _, body = text.partition('\n\n')
+        headers, _, body = text.partition("\n\n")
         # store body text in second column
         features[i, 1] = body
 
-        prefix = 'Subject:'
-        sub = ''
+        prefix = "Subject:"
+        sub = ""
         # save text after 'Subject:' in first column
-        for line in headers.split('\n'):
+        for line in headers.split("\n"):
             if line.startswith(prefix):
-                sub = line[len(prefix):]
+                sub = line[len(prefix) :]
                 break
         features[i, 0] = sub
 
@@ -103,9 +107,7 @@ def subject_body_extractor(posts):
 
 
 def text_stats(posts):
-    return [{'length': len(text),
-             'num_sentences': text.count('.')}
-            for text in posts]
+    return [{"length": len(text), "num_sentences": text.count(".")} for text in posts]
 
 
 text_stats_transformer = FunctionTransformer(text_stats)
@@ -121,35 +123,59 @@ def text_stats(posts):
 # ``ColumnTransformer``. We combine them, with weights, then train a
 # classifier on the combined set of features.
 
-pipeline = Pipeline([
-    # Extract subject & body
-    ('subjectbody', subject_body_transformer),
-    # Use ColumnTransformer to combine the subject and body features
-    ('union', ColumnTransformer(
-        [
-            # bag-of-words for subject (col 0)
-            ('subject', TfidfVectorizer(min_df=50), 0),
-            # bag-of-words with decomposition for body (col 1)
-            ('body_bow', Pipeline([
-                ('tfidf', TfidfVectorizer()),
-                ('best', TruncatedSVD(n_components=50)),
-            ]), 1),
-            # Pipeline for pulling text stats from post's body
-            ('body_stats', Pipeline([
-                ('stats', text_stats_transformer),  # returns a list of dicts
-                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
-            ]), 1),
-        ],
-        # weight above ColumnTransformer features
-        transformer_weights={
-            'subject': 0.8,
-            'body_bow': 0.5,
-            'body_stats': 1.0,
-        }
-    )),
-    # Use a SVC classifier on the combined features
-    ('svc', LinearSVC(dual=False)),
-], verbose=True)
+pipeline = Pipeline(
+    [
+        # Extract subject & body
+        ("subjectbody", subject_body_transformer),
+        # Use ColumnTransformer to combine the subject and body features
+        (
+            "union",
+            ColumnTransformer(
+                [
+                    # bag-of-words for subject (col 0)
+                    ("subject", TfidfVectorizer(min_df=50), 0),
+                    # bag-of-words with decomposition for body (col 1)
+                    (
+                        "body_bow",
+                        Pipeline(
+                            [
+                                ("tfidf", TfidfVectorizer()),
+                                ("best", TruncatedSVD(n_components=50)),
+                            ]
+                        ),
+                        1,
+                    ),
+                    # Pipeline for pulling text stats from post's body
+                    (
+                        "body_stats",
+                        Pipeline(
+                            [
+                                (
+                                    "stats",
+                                    text_stats_transformer,
+                                ),  # returns a list of dicts
+                                (
+                                    "vect",
+                                    DictVectorizer(),
+                                ),  # list of dicts -> feature matrix
+                            ]
+                        ),
+                        1,
+                    ),
+                ],
+                # weight above ColumnTransformer features
+                transformer_weights={
+                    "subject": 0.8,
+                    "body_bow": 0.5,
+                    "body_stats": 1.0,
+                },
+            ),
+        ),
+        # Use a SVC classifier on the combined features
+        ("svc", LinearSVC(dual=False)),
+    ],
+    verbose=True,
+)
 
 ##############################################################################
 # Finally, we fit our pipeline on the training data and use it to predict
@@ -157,6 +183,4 @@ def text_stats(posts):
 
 pipeline.fit(X_train, y_train)
 y_pred = pipeline.predict(X_test)
-print('Classification report:\n\n{}'.format(
-    classification_report(y_test, y_pred))
-)
+print("Classification report:\n\n{}".format(classification_report(y_test, y_pred)))
diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py
index f51a55742926d..f6ea68105dc97 100644
--- a/examples/compose/plot_column_transformer_mixed_types.py
+++ b/examples/compose/plot_column_transformer_mixed_types.py
@@ -66,26 +66,28 @@
 # Note that ``pclass`` could either be treated as a categorical or numeric
 # feature.
 
-numeric_features = ['age', 'fare']
-numeric_transformer = Pipeline(steps=[
-    ('imputer', SimpleImputer(strategy='median')),
-    ('scaler', StandardScaler())])
+numeric_features = ["age", "fare"]
+numeric_transformer = Pipeline(
+    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
+)
 
-categorical_features = ['embarked', 'sex', 'pclass']
-categorical_transformer = OneHotEncoder(handle_unknown='ignore')
+categorical_features = ["embarked", "sex", "pclass"]
+categorical_transformer = OneHotEncoder(handle_unknown="ignore")
 
 preprocessor = ColumnTransformer(
     transformers=[
-        ('num', numeric_transformer, numeric_features),
-        ('cat', categorical_transformer, categorical_features)])
+        ("num", numeric_transformer, numeric_features),
+        ("cat", categorical_transformer, categorical_features),
+    ]
+)
 
 # Append classifier to preprocessing pipeline.
 # Now we have a full prediction pipeline.
-clf = Pipeline(steps=[('preprocessor', preprocessor),
-                      ('classifier', LogisticRegression())])
+clf = Pipeline(
+    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
+)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
-                                                    random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
 
 clf.fit(X_train, y_train)
 print("model score: %.3f" % clf.score(X_test, y_test))
@@ -97,7 +99,7 @@
 # representation of the estimator is displayed as follows:
 from sklearn import set_config
 
-set_config(display='diagram')
+set_config(display="diagram")
 clf
 
 # %%
@@ -110,7 +112,7 @@
 # First, let's only select a subset of columns to simplify our
 # example.
 
-subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']
+subset_feature = ["embarked", "sex", "pclass", "age", "fare"]
 X_train, X_test = X_train[subset_feature], X_test[subset_feature]
 
 # %%
@@ -134,12 +136,15 @@
 
 from sklearn.compose import make_column_selector as selector
 
-preprocessor = ColumnTransformer(transformers=[
-    ('num', numeric_transformer, selector(dtype_exclude="category")),
-    ('cat', categorical_transformer, selector(dtype_include="category"))
-])
-clf = Pipeline(steps=[('preprocessor', preprocessor),
-                      ('classifier', LogisticRegression())])
+preprocessor = ColumnTransformer(
+    transformers=[
+        ("num", numeric_transformer, selector(dtype_exclude="category")),
+        ("cat", categorical_transformer, selector(dtype_include="category")),
+    ]
+)
+clf = Pipeline(
+    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
+)
 
 
 clf.fit(X_train, y_train)
@@ -167,8 +172,8 @@
 # :class:`~sklearn.model_selection.GridSearchCV`.
 
 param_grid = {
-    'preprocessor__num__imputer__strategy': ['mean', 'median'],
-    'classifier__C': [0.1, 1.0, 10, 100],
+    "preprocessor__num__imputer__strategy": ["mean", "median"],
+    "classifier__C": [0.1, 1.0, 10, 100],
 }
 
 grid_search = GridSearchCV(clf, param_grid, cv=10)
@@ -193,15 +198,23 @@
 
 cv_results = pd.DataFrame(grid_search.cv_results_)
 cv_results = cv_results.sort_values("mean_test_score", ascending=False)
-cv_results[["mean_test_score", "std_test_score",
-            "param_preprocessor__num__imputer__strategy",
-            "param_classifier__C"
-            ]].head(5)
+cv_results[
+    [
+        "mean_test_score",
+        "std_test_score",
+        "param_preprocessor__num__imputer__strategy",
+        "param_classifier__C",
+    ]
+].head(5)
 
 # %%
 # The best hyper-parameters have be used to re-fit a final model on the full
 # training set. We can evaluate that final model on held out test data that was
 # not used for hyperparameter tuning.
 #
-print(("best logistic regression from grid search: %.3f"
-       % grid_search.score(X_test, y_test)))
+print(
+    (
+        "best logistic regression from grid search: %.3f"
+        % grid_search.score(X_test, y_test)
+    )
+)
diff --git a/examples/compose/plot_compare_reduction.py b/examples/compose/plot_compare_reduction.py
index 4e7d2c5900420..b31ac5d150998 100755
--- a/examples/compose/plot_compare_reduction.py
+++ b/examples/compose/plot_compare_reduction.py
@@ -38,51 +38,52 @@
 
 print(__doc__)
 
-pipe = Pipeline([
-    # the reduce_dim stage is populated by the param_grid
-    ('reduce_dim', 'passthrough'),
-    ('classify', LinearSVC(dual=False, max_iter=10000))
-])
+pipe = Pipeline(
+    [
+        # the reduce_dim stage is populated by the param_grid
+        ("reduce_dim", "passthrough"),
+        ("classify", LinearSVC(dual=False, max_iter=10000)),
+    ]
+)
 
 N_FEATURES_OPTIONS = [2, 4, 8]
 C_OPTIONS = [1, 10, 100, 1000]
 param_grid = [
     {
-        'reduce_dim': [PCA(iterated_power=7), NMF()],
-        'reduce_dim__n_components': N_FEATURES_OPTIONS,
-        'classify__C': C_OPTIONS
+        "reduce_dim": [PCA(iterated_power=7), NMF()],
+        "reduce_dim__n_components": N_FEATURES_OPTIONS,
+        "classify__C": C_OPTIONS,
     },
     {
-        'reduce_dim': [SelectKBest(chi2)],
-        'reduce_dim__k': N_FEATURES_OPTIONS,
-        'classify__C': C_OPTIONS
+        "reduce_dim": [SelectKBest(chi2)],
+        "reduce_dim__k": N_FEATURES_OPTIONS,
+        "classify__C": C_OPTIONS,
     },
 ]
-reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']
+reducer_labels = ["PCA", "NMF", "KBest(chi2)"]
 
 grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)
 X, y = load_digits(return_X_y=True)
 grid.fit(X, y)
 
-mean_scores = np.array(grid.cv_results_['mean_test_score'])
+mean_scores = np.array(grid.cv_results_["mean_test_score"])
 # scores are in the order of param_grid iteration, which is alphabetical
 mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
 # select score for best C
 mean_scores = mean_scores.max(axis=0)
-bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
-               (len(reducer_labels) + 1) + .5)
+bar_offsets = np.arange(len(N_FEATURES_OPTIONS)) * (len(reducer_labels) + 1) + 0.5
 
 plt.figure()
-COLORS = 'bgrcmyk'
+COLORS = "bgrcmyk"
 for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
     plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
 
 plt.title("Comparing feature reduction techniques")
-plt.xlabel('Reduced number of features')
+plt.xlabel("Reduced number of features")
 plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
-plt.ylabel('Digit classification accuracy')
+plt.ylabel("Digit classification accuracy")
 plt.ylim((0, 1))
-plt.legend(loc='upper left')
+plt.legend(loc="upper left")
 
 plt.show()
 
@@ -103,11 +104,12 @@
 from shutil import rmtree
 
 # Create a temporary folder to store the transformers of the pipeline
-location = 'cachedir'
+location = "cachedir"
 memory = Memory(location=location, verbose=10)
-cached_pipe = Pipeline([('reduce_dim', PCA()),
-                        ('classify', LinearSVC(dual=False, max_iter=10000))],
-                       memory=memory)
+cached_pipe = Pipeline(
+    [("reduce_dim", PCA()), ("classify", LinearSVC(dual=False, max_iter=10000))],
+    memory=memory,
+)
 
 # This time, a cached pipeline will be used within the grid search
 
diff --git a/examples/compose/plot_digits_pipe.py b/examples/compose/plot_digits_pipe.py
index 7c7ddf938a14f..8974ccc0e651e 100644
--- a/examples/compose/plot_digits_pipe.py
+++ b/examples/compose/plot_digits_pipe.py
@@ -36,14 +36,14 @@
 pca = PCA()
 # set the tolerance to a large value to make the example faster
 logistic = LogisticRegression(max_iter=10000, tol=0.1)
-pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
+pipe = Pipeline(steps=[("pca", pca), ("logistic", logistic)])
 
 X_digits, y_digits = datasets.load_digits(return_X_y=True)
 
 # Parameters of pipelines can be set using ‘__’ separated parameter names:
 param_grid = {
-    'pca__n_components': [5, 15, 30, 45, 64],
-    'logistic__C': np.logspace(-4, 4, 4),
+    "pca__n_components": [5, 15, 30, 45, 64],
+    "logistic__C": np.logspace(-4, 4, 4),
 }
 search = GridSearchCV(pipe, param_grid, n_jobs=-1)
 search.fit(X_digits, y_digits)
@@ -54,24 +54,30 @@
 pca.fit(X_digits)
 
 fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
-ax0.plot(np.arange(1, pca.n_components_ + 1),
-         pca.explained_variance_ratio_, '+', linewidth=2)
-ax0.set_ylabel('PCA explained variance ratio')
-
-ax0.axvline(search.best_estimator_.named_steps['pca'].n_components,
-            linestyle=':', label='n_components chosen')
+ax0.plot(
+    np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "+", linewidth=2
+)
+ax0.set_ylabel("PCA explained variance ratio")
+
+ax0.axvline(
+    search.best_estimator_.named_steps["pca"].n_components,
+    linestyle=":",
+    label="n_components chosen",
+)
 ax0.legend(prop=dict(size=12))
 
 # For each number of components, find the best classifier results
 results = pd.DataFrame(search.cv_results_)
-components_col = 'param_pca__n_components'
+components_col = "param_pca__n_components"
 best_clfs = results.groupby(components_col).apply(
-    lambda g: g.nlargest(1, 'mean_test_score'))
-
-best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
-               legend=False, ax=ax1)
-ax1.set_ylabel('Classification accuracy (val)')
-ax1.set_xlabel('n_components')
+    lambda g: g.nlargest(1, "mean_test_score")
+)
+
+best_clfs.plot(
+    x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
+)
+ax1.set_ylabel("Classification accuracy (val)")
+ax1.set_xlabel("n_components")
 
 plt.xlim(-1, 70)
 
diff --git a/examples/compose/plot_feature_union.py b/examples/compose/plot_feature_union.py
index c76548a2d376f..09e6e6919ab9a 100644
--- a/examples/compose/plot_feature_union.py
+++ b/examples/compose/plot_feature_union.py
@@ -50,9 +50,11 @@
 
 pipeline = Pipeline([("features", combined_features), ("svm", svm)])
 
-param_grid = dict(features__pca__n_components=[1, 2, 3],
-                  features__univ_select__k=[1, 2],
-                  svm__C=[0.1, 1, 10])
+param_grid = dict(
+    features__pca__n_components=[1, 2, 3],
+    features__univ_select__k=[1, 2],
+    svm__C=[0.1, 1, 10],
+)
 
 grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
 grid_search.fit(X, y)
diff --git a/examples/compose/plot_transformed_target.py b/examples/compose/plot_transformed_target.py
index eac10a926ee46..c7f64e265b4b9 100755
--- a/examples/compose/plot_transformed_target.py
+++ b/examples/compose/plot_transformed_target.py
@@ -32,10 +32,10 @@
 ##############################################################################
 
 # `normed` is being deprecated in favor of `density` in histograms
-if parse_version(matplotlib.__version__) >= parse_version('2.1'):
-    density_param = {'density': True}
+if parse_version(matplotlib.__version__) >= parse_version("2.1"):
+    density_param = {"density": True}
 else:
-    density_param = {'normed': True}
+    density_param = {"normed": True}
 
 # %%
 # A synthetic random regression dataset is generated. The targets ``y`` are
@@ -62,14 +62,14 @@
 
 ax0.hist(y, bins=100, **density_param)
 ax0.set_xlim([0, 2000])
-ax0.set_ylabel('Probability')
-ax0.set_xlabel('Target')
-ax0.set_title('Target distribution')
+ax0.set_ylabel("Probability")
+ax0.set_xlabel("Target")
+ax0.set_title("Target distribution")
 
 ax1.hist(y_trans, bins=100, **density_param)
-ax1.set_ylabel('Probability')
-ax1.set_xlabel('Target')
-ax1.set_title('Transformed target distribution')
+ax1.set_ylabel("Probability")
+ax1.set_xlabel("Target")
+ax1.set_title("Transformed target distribution")
 
 f.suptitle("Synthetic data", y=0.06, x=0.53)
 f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])
@@ -90,28 +90,36 @@
 y_pred = regr.predict(X_test)
 # Plot results
 ax0.scatter(y_test, y_pred)
-ax0.plot([0, 2000], [0, 2000], '--k')
-ax0.set_ylabel('Target predicted')
-ax0.set_xlabel('True Target')
-ax0.set_title('Ridge regression \n without target transformation')
-ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
-    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
+ax0.plot([0, 2000], [0, 2000], "--k")
+ax0.set_ylabel("Target predicted")
+ax0.set_xlabel("True Target")
+ax0.set_title("Ridge regression \n without target transformation")
+ax0.text(
+    100,
+    1750,
+    r"$R^2$=%.2f, MAE=%.2f"
+    % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)),
+)
 ax0.set_xlim([0, 2000])
 ax0.set_ylim([0, 2000])
 # Transform targets and use same linear model
-regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
-                                        func=np.log1p,
-                                        inverse_func=np.expm1)
+regr_trans = TransformedTargetRegressor(
+    regressor=RidgeCV(), func=np.log1p, inverse_func=np.expm1
+)
 regr_trans.fit(X_train, y_train)
 y_pred = regr_trans.predict(X_test)
 
 ax1.scatter(y_test, y_pred)
-ax1.plot([0, 2000], [0, 2000], '--k')
-ax1.set_ylabel('Target predicted')
-ax1.set_xlabel('True Target')
-ax1.set_title('Ridge regression \n with target transformation')
-ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
-    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
+ax1.plot([0, 2000], [0, 2000], "--k")
+ax1.set_ylabel("Target predicted")
+ax1.set_xlabel("True Target")
+ax1.set_title("Ridge regression \n with target transformation")
+ax1.text(
+    100,
+    1750,
+    r"$R^2$=%.2f, MAE=%.2f"
+    % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)),
+)
 ax1.set_xlim([0, 2000])
 ax1.set_ylim([0, 2000])
 
@@ -133,12 +141,11 @@
 # Keep only numeric columns
 X = ames.data.select_dtypes(np.number)
 # Remove columns with NaN or Inf values
-X = X.drop(columns=['LotFrontage', 'GarageYrBlt', 'MasVnrArea'])
+X = X.drop(columns=["LotFrontage", "GarageYrBlt", "MasVnrArea"])
 y = ames.target
-y_trans = quantile_transform(y.to_frame(),
-                             n_quantiles=900,
-                             output_distribution='normal',
-                             copy=True).squeeze()
+y_trans = quantile_transform(
+    y.to_frame(), n_quantiles=900, output_distribution="normal", copy=True
+).squeeze()
 # %%
 # A :class:`~sklearn.preprocessing.QuantileTransformer` is used to normalize
 # the target distribution before applying a
@@ -147,15 +154,15 @@
 f, (ax0, ax1) = plt.subplots(1, 2)
 
 ax0.hist(y, bins=100, **density_param)
-ax0.set_ylabel('Probability')
-ax0.set_xlabel('Target')
-ax0.text(s='Target distribution', x=1.2e5, y=9.8e-6, fontsize=12)
+ax0.set_ylabel("Probability")
+ax0.set_xlabel("Target")
+ax0.text(s="Target distribution", x=1.2e5, y=9.8e-6, fontsize=12)
 ax0.ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
 
 ax1.hist(y_trans, bins=100, **density_param)
-ax1.set_ylabel('Probability')
-ax1.set_xlabel('Target')
-ax1.text(s='Transformed target distribution', x=-6.8, y=0.479, fontsize=12)
+ax1.set_ylabel("Probability")
+ax1.set_xlabel("Target")
+ax1.text(s="Transformed target distribution", x=-6.8, y=0.479, fontsize=12)
 
 f.suptitle("Ames housing data: selling price", y=0.04)
 f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])
@@ -171,51 +178,69 @@
 # target. With target transformation, the shape is more linear indicating
 # better model fit.
 
-f, (ax0, ax1) = plt.subplots(2, 2, sharey='row', figsize=(6.5, 8))
+f, (ax0, ax1) = plt.subplots(2, 2, sharey="row", figsize=(6.5, 8))
 
 regr = RidgeCV()
 regr.fit(X_train, y_train)
 y_pred = regr.predict(X_test)
 
 ax0[0].scatter(y_pred, y_test, s=8)
-ax0[0].plot([0, 7e5], [0, 7e5], '--k')
-ax0[0].set_ylabel('True target')
-ax0[0].set_xlabel('Predicted target')
-ax0[0].text(s='Ridge regression \n without target transformation', x=-5e4,
-            y=8e5, fontsize=12, multialignment='center')
-ax0[0].text(3e4, 64e4, r'$R^2$=%.2f, MAE=%.2f' % (
-    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
+ax0[0].plot([0, 7e5], [0, 7e5], "--k")
+ax0[0].set_ylabel("True target")
+ax0[0].set_xlabel("Predicted target")
+ax0[0].text(
+    s="Ridge regression \n without target transformation",
+    x=-5e4,
+    y=8e5,
+    fontsize=12,
+    multialignment="center",
+)
+ax0[0].text(
+    3e4,
+    64e4,
+    r"$R^2$=%.2f, MAE=%.2f"
+    % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)),
+)
 ax0[0].set_xlim([0, 7e5])
 ax0[0].set_ylim([0, 7e5])
 ax0[0].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
 
 ax1[0].scatter(y_pred, (y_pred - y_test), s=8)
-ax1[0].set_ylabel('Residual')
-ax1[0].set_xlabel('Predicted target')
+ax1[0].set_ylabel("Residual")
+ax1[0].set_xlabel("Predicted target")
 ax1[0].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
 
 regr_trans = TransformedTargetRegressor(
     regressor=RidgeCV(),
-    transformer=QuantileTransformer(n_quantiles=900,
-                                    output_distribution='normal'))
+    transformer=QuantileTransformer(n_quantiles=900, output_distribution="normal"),
+)
 regr_trans.fit(X_train, y_train)
 y_pred = regr_trans.predict(X_test)
 
 ax0[1].scatter(y_pred, y_test, s=8)
-ax0[1].plot([0, 7e5], [0, 7e5], '--k')
-ax0[1].set_ylabel('True target')
-ax0[1].set_xlabel('Predicted target')
-ax0[1].text(s='Ridge regression \n with target transformation', x=-5e4,
-            y=8e5, fontsize=12, multialignment='center')
-ax0[1].text(3e4, 64e4, r'$R^2$=%.2f, MAE=%.2f' % (
-    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
+ax0[1].plot([0, 7e5], [0, 7e5], "--k")
+ax0[1].set_ylabel("True target")
+ax0[1].set_xlabel("Predicted target")
+ax0[1].text(
+    s="Ridge regression \n with target transformation",
+    x=-5e4,
+    y=8e5,
+    fontsize=12,
+    multialignment="center",
+)
+ax0[1].text(
+    3e4,
+    64e4,
+    r"$R^2$=%.2f, MAE=%.2f"
+    % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)),
+)
 ax0[1].set_xlim([0, 7e5])
 ax0[1].set_ylim([0, 7e5])
 ax0[1].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
 
 ax1[1].scatter(y_pred, (y_pred - y_test), s=8)
-ax1[1].set_ylabel('Residual')
-ax1[1].set_xlabel('Predicted target')
+ax1[1].set_ylabel("Residual")
+ax1[1].set_xlabel("Predicted target")
 ax1[1].ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
 
 f.suptitle("Ames housing data: selling price", y=0.035)
diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py
index c0540a75f9e67..eae43b3c7d4d3 100644
--- a/examples/covariance/plot_covariance_estimation.py
+++ b/examples/covariance/plot_covariance_estimation.py
@@ -47,8 +47,13 @@
 import matplotlib.pyplot as plt
 from scipy import linalg
 
-from sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \
-    log_likelihood, empirical_covariance
+from sklearn.covariance import (
+    LedoitWolf,
+    OAS,
+    ShrunkCovariance,
+    log_likelihood,
+    empirical_covariance,
+)
 from sklearn.model_selection import GridSearchCV
 
 
@@ -69,8 +74,9 @@
 
 # spanning a range of possible shrinkage coefficient values
 shrinkages = np.logspace(-2, 0, 30)
-negative_logliks = [-ShrunkCovariance(shrinkage=s).fit(X_train).score(X_test)
-                    for s in shrinkages]
+negative_logliks = [
+    -ShrunkCovariance(shrinkage=s).fit(X_train).score(X_test) for s in shrinkages
+]
 
 # under the ground-truth model, which we would not have access to in real
 # settings
@@ -82,7 +88,7 @@
 # Compare different approaches to setting the parameter
 
 # GridSearch for an optimal shrinkage coefficient
-tuned_parameters = [{'shrinkage': shrinkages}]
+tuned_parameters = [{"shrinkage": shrinkages}]
 cv = GridSearchCV(ShrunkCovariance(), tuned_parameters)
 cv.fit(X_train)
 
@@ -98,31 +104,42 @@
 # Plot results
 fig = plt.figure()
 plt.title("Regularized covariance: likelihood and shrinkage coefficient")
-plt.xlabel('Regularization parameter: shrinkage coefficient')
-plt.ylabel('Error: negative log-likelihood on test data')
+plt.xlabel("Regularization parameter: shrinkage coefficient")
+plt.ylabel("Error: negative log-likelihood on test data")
 # range shrinkage curve
 plt.loglog(shrinkages, negative_logliks, label="Negative log-likelihood")
 
-plt.plot(plt.xlim(), 2 * [loglik_real], '--r',
-         label="Real covariance likelihood")
+plt.plot(plt.xlim(), 2 * [loglik_real], "--r", label="Real covariance likelihood")
 
 # adjust view
 lik_max = np.amax(negative_logliks)
 lik_min = np.amin(negative_logliks)
-ymin = lik_min - 6. * np.log((plt.ylim()[1] - plt.ylim()[0]))
-ymax = lik_max + 10. * np.log(lik_max - lik_min)
+ymin = lik_min - 6.0 * np.log((plt.ylim()[1] - plt.ylim()[0]))
+ymax = lik_max + 10.0 * np.log(lik_max - lik_min)
 xmin = shrinkages[0]
 xmax = shrinkages[-1]
 # LW likelihood
-plt.vlines(lw.shrinkage_, ymin, -loglik_lw, color='magenta',
-           linewidth=3, label='Ledoit-Wolf estimate')
+plt.vlines(
+    lw.shrinkage_,
+    ymin,
+    -loglik_lw,
+    color="magenta",
+    linewidth=3,
+    label="Ledoit-Wolf estimate",
+)
 # OAS likelihood
-plt.vlines(oa.shrinkage_, ymin, -loglik_oa, color='purple',
-           linewidth=3, label='OAS estimate')
+plt.vlines(
+    oa.shrinkage_, ymin, -loglik_oa, color="purple", linewidth=3, label="OAS estimate"
+)
 # best CV estimator likelihood
-plt.vlines(cv.best_estimator_.shrinkage, ymin,
-           -cv.best_estimator_.score(X_test), color='cyan',
-           linewidth=3, label='Cross-validation best estimate')
+plt.vlines(
+    cv.best_estimator_.shrinkage,
+    ymin,
+    -cv.best_estimator_.score(X_test),
+    color="cyan",
+    linewidth=3,
+    label="Cross-validation best estimate",
+)
 
 plt.ylim(ymin, ymax)
 plt.xlim(xmin, xmax)
diff --git a/examples/covariance/plot_lw_vs_oas.py b/examples/covariance/plot_lw_vs_oas.py
index f9cd11bbb81b4..abd451da3875a 100644
--- a/examples/covariance/plot_lw_vs_oas.py
+++ b/examples/covariance/plot_lw_vs_oas.py
@@ -44,8 +44,7 @@
 oa_shrinkage = np.zeros((n_samples_range.size, repeat))
 for i, n_samples in enumerate(n_samples_range):
     for j in range(repeat):
-        X = np.dot(
-            np.random.normal(size=(n_samples, n_features)), coloring_matrix.T)
+        X = np.dot(np.random.normal(size=(n_samples, n_features)), coloring_matrix.T)
 
         lw = LedoitWolf(store_precision=False, assume_centered=True)
         lw.fit(X)
@@ -59,10 +58,22 @@
 
 # plot MSE
 plt.subplot(2, 1, 1)
-plt.errorbar(n_samples_range, lw_mse.mean(1), yerr=lw_mse.std(1),
-             label='Ledoit-Wolf', color='navy', lw=2)
-plt.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1),
-             label='OAS', color='darkorange', lw=2)
+plt.errorbar(
+    n_samples_range,
+    lw_mse.mean(1),
+    yerr=lw_mse.std(1),
+    label="Ledoit-Wolf",
+    color="navy",
+    lw=2,
+)
+plt.errorbar(
+    n_samples_range,
+    oa_mse.mean(1),
+    yerr=oa_mse.std(1),
+    label="OAS",
+    color="darkorange",
+    lw=2,
+)
 plt.ylabel("Squared error")
 plt.legend(loc="upper right")
 plt.title("Comparison of covariance estimators")
@@ -70,14 +81,26 @@
 
 # plot shrinkage coefficient
 plt.subplot(2, 1, 2)
-plt.errorbar(n_samples_range, lw_shrinkage.mean(1), yerr=lw_shrinkage.std(1),
-             label='Ledoit-Wolf', color='navy', lw=2)
-plt.errorbar(n_samples_range, oa_shrinkage.mean(1), yerr=oa_shrinkage.std(1),
-             label='OAS', color='darkorange', lw=2)
+plt.errorbar(
+    n_samples_range,
+    lw_shrinkage.mean(1),
+    yerr=lw_shrinkage.std(1),
+    label="Ledoit-Wolf",
+    color="navy",
+    lw=2,
+)
+plt.errorbar(
+    n_samples_range,
+    oa_shrinkage.mean(1),
+    yerr=oa_shrinkage.std(1),
+    label="OAS",
+    color="darkorange",
+    lw=2,
+)
 plt.xlabel("n_samples")
 plt.ylabel("Shrinkage")
 plt.legend(loc="lower right")
-plt.ylim(plt.ylim()[0], 1. + (plt.ylim()[1] - plt.ylim()[0]) / 10.)
+plt.ylim(plt.ylim()[0], 1.0 + (plt.ylim()[1] - plt.ylim()[0]) / 10.0)
 plt.xlim(5, 31)
 
 plt.show()
diff --git a/examples/covariance/plot_mahalanobis_distances.py b/examples/covariance/plot_mahalanobis_distances.py
index ab2e9fe8471d5..b93d68a269706 100644
--- a/examples/covariance/plot_mahalanobis_distances.py
+++ b/examples/covariance/plot_mahalanobis_distances.py
@@ -84,11 +84,11 @@
 
 # generate Gaussian data of shape (125, 2)
 gen_cov = np.eye(n_features)
-gen_cov[0, 0] = 2.
+gen_cov[0, 0] = 2.0
 X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
 # add some outliers
 outliers_cov = np.eye(n_features)
-outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
+outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.0
 X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)
 
 # %%
@@ -109,9 +109,11 @@
 robust_cov = MinCovDet().fit(X)
 # fit a MLE estimator to data
 emp_cov = EmpiricalCovariance().fit(X)
-print('Estimated covariance matrix:\n'
-      'MCD (Robust):\n{}\n'
-      'MLE:\n{}'.format(robust_cov.covariance_, emp_cov.covariance_))
+print(
+    "Estimated covariance matrix:\nMCD (Robust):\n{}\nMLE:\n{}".format(
+        robust_cov.covariance_, emp_cov.covariance_
+    )
+)
 
 # %%
 # To better visualize the difference, we plot contours of the
@@ -122,33 +124,44 @@
 
 fig, ax = plt.subplots(figsize=(10, 5))
 # Plot data set
-inlier_plot = ax.scatter(X[:, 0], X[:, 1],
-                         color='black', label='inliers')
-outlier_plot = ax.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
-                          color='red', label='outliers')
-ax.set_xlim(ax.get_xlim()[0], 10.)
+inlier_plot = ax.scatter(X[:, 0], X[:, 1], color="black", label="inliers")
+outlier_plot = ax.scatter(
+    X[:, 0][-n_outliers:], X[:, 1][-n_outliers:], color="red", label="outliers"
+)
+ax.set_xlim(ax.get_xlim()[0], 10.0)
 ax.set_title("Mahalanobis distances of a contaminated data set")
 
 # Create meshgrid of feature 1 and feature 2 values
-xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
-                     np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
+xx, yy = np.meshgrid(
+    np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
+    np.linspace(plt.ylim()[0], plt.ylim()[1], 100),
+)
 zz = np.c_[xx.ravel(), yy.ravel()]
 # Calculate the MLE based Mahalanobis distances of the meshgrid
 mahal_emp_cov = emp_cov.mahalanobis(zz)
 mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
-emp_cov_contour = plt.contour(xx, yy, np.sqrt(mahal_emp_cov),
-                              cmap=plt.cm.PuBu_r, linestyles='dashed')
+emp_cov_contour = plt.contour(
+    xx, yy, np.sqrt(mahal_emp_cov), cmap=plt.cm.PuBu_r, linestyles="dashed"
+)
 # Calculate the MCD based Mahalanobis distances
 mahal_robust_cov = robust_cov.mahalanobis(zz)
 mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
-robust_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov),
-                            cmap=plt.cm.YlOrBr_r, linestyles='dotted')
+robust_contour = ax.contour(
+    xx, yy, np.sqrt(mahal_robust_cov), cmap=plt.cm.YlOrBr_r, linestyles="dotted"
+)
 
 # Add legend
-ax.legend([emp_cov_contour.collections[1], robust_contour.collections[1],
-          inlier_plot, outlier_plot],
-          ['MLE dist', 'MCD dist', 'inliers', 'outliers'],
-          loc="upper right", borderaxespad=0)
+ax.legend(
+    [
+        emp_cov_contour.collections[1],
+        robust_contour.collections[1],
+        inlier_plot,
+        outlier_plot,
+    ],
+    ["MLE dist", "MCD dist", "inliers", "outliers"],
+    loc="upper right",
+    borderaxespad=0,
+)
 
 plt.show()
 
@@ -161,32 +174,37 @@
 # distribution of inlier samples for robust MCD based Mahalanobis distances.
 
 fig, (ax1, ax2) = plt.subplots(1, 2)
-plt.subplots_adjust(wspace=.6)
+plt.subplots_adjust(wspace=0.6)
 
 # Calculate cubic root of MLE Mahalanobis distances for samples
 emp_mahal = emp_cov.mahalanobis(X - np.mean(X, 0)) ** (0.33)
 # Plot boxplots
-ax1.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25)
+ax1.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=0.25)
 # Plot individual samples
-ax1.plot(np.full(n_samples - n_outliers, 1.26), emp_mahal[:-n_outliers],
-         '+k', markeredgewidth=1)
-ax1.plot(np.full(n_outliers, 2.26), emp_mahal[-n_outliers:],
-         '+k', markeredgewidth=1)
-ax1.axes.set_xticklabels(('inliers', 'outliers'), size=15)
+ax1.plot(
+    np.full(n_samples - n_outliers, 1.26),
+    emp_mahal[:-n_outliers],
+    "+k",
+    markeredgewidth=1,
+)
+ax1.plot(np.full(n_outliers, 2.26), emp_mahal[-n_outliers:], "+k", markeredgewidth=1)
+ax1.axes.set_xticklabels(("inliers", "outliers"), size=15)
 ax1.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$", size=16)
 ax1.set_title("Using non-robust estimates\n(Maximum Likelihood)")
 
 # Calculate cubic root of MCD Mahalanobis distances for samples
 robust_mahal = robust_cov.mahalanobis(X - robust_cov.location_) ** (0.33)
 # Plot boxplots
-ax2.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]],
-            widths=.25)
+ax2.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]], widths=0.25)
 # Plot individual samples
-ax2.plot(np.full(n_samples - n_outliers, 1.26), robust_mahal[:-n_outliers],
-         '+k', markeredgewidth=1)
-ax2.plot(np.full(n_outliers, 2.26), robust_mahal[-n_outliers:],
-         '+k', markeredgewidth=1)
-ax2.axes.set_xticklabels(('inliers', 'outliers'), size=15)
+ax2.plot(
+    np.full(n_samples - n_outliers, 1.26),
+    robust_mahal[:-n_outliers],
+    "+k",
+    markeredgewidth=1,
+)
+ax2.plot(np.full(n_outliers, 2.26), robust_mahal[-n_outliers:], "+k", markeredgewidth=1)
+ax2.axes.set_xticklabels(("inliers", "outliers"), size=15)
 ax2.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$", size=16)
 ax2.set_title("Using robust estimates\n(Minimum Covariance Determinant)")
 
diff --git a/examples/covariance/plot_robust_vs_empirical_covariance.py b/examples/covariance/plot_robust_vs_empirical_covariance.py
index 8ca0a49d66642..a4fb6ef8941ed 100644
--- a/examples/covariance/plot_robust_vs_empirical_covariance.py
+++ b/examples/covariance/plot_robust_vs_empirical_covariance.py
@@ -66,8 +66,11 @@
 repeat = 10
 
 range_n_outliers = np.concatenate(
-    (np.linspace(0, n_samples / 8, 5),
-     np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1])).astype(int)
+    (
+        np.linspace(0, n_samples / 8, 5),
+        np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1],
+    )
+).astype(int)
 
 # definition of arrays to store results
 err_loc_mcd = np.zeros((range_n_outliers.size, repeat))
@@ -87,8 +90,9 @@
         X = rng.randn(n_samples, n_features)
         # add some outliers
         outliers_index = rng.permutation(n_samples)[:n_outliers]
-        outliers_offset = 10. * \
-            (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
+        outliers_offset = 10.0 * (
+            np.random.randint(2, size=(n_outliers, n_features)) - 0.5
+        )
         X[outliers_index] += outliers_offset
         inliers_mask = np.ones(n_samples).astype(bool)
         inliers_mask[outliers_index] = False
@@ -102,8 +106,9 @@
         # compare estimators learned from the full data set with true
         # parameters
         err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2)
-        err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm(
-            np.eye(n_features))
+        err_cov_emp_full[i, j] = (
+            EmpiricalCovariance().fit(X).error_norm(np.eye(n_features))
+        )
 
         # compare with an empirical covariance learned from a pure data set
         # (i.e. "perfect" mcd)
@@ -117,34 +122,63 @@
 font_prop = matplotlib.font_manager.FontProperties(size=11)
 plt.subplot(2, 1, 1)
 lw = 2
-plt.errorbar(range_n_outliers, err_loc_mcd.mean(1),
-             yerr=err_loc_mcd.std(1) / np.sqrt(repeat),
-             label="Robust location", lw=lw, color='m')
-plt.errorbar(range_n_outliers, err_loc_emp_full.mean(1),
-             yerr=err_loc_emp_full.std(1) / np.sqrt(repeat),
-             label="Full data set mean", lw=lw, color='green')
-plt.errorbar(range_n_outliers, err_loc_emp_pure.mean(1),
-             yerr=err_loc_emp_pure.std(1) / np.sqrt(repeat),
-             label="Pure data set mean", lw=lw, color='black')
+plt.errorbar(
+    range_n_outliers,
+    err_loc_mcd.mean(1),
+    yerr=err_loc_mcd.std(1) / np.sqrt(repeat),
+    label="Robust location",
+    lw=lw,
+    color="m",
+)
+plt.errorbar(
+    range_n_outliers,
+    err_loc_emp_full.mean(1),
+    yerr=err_loc_emp_full.std(1) / np.sqrt(repeat),
+    label="Full data set mean",
+    lw=lw,
+    color="green",
+)
+plt.errorbar(
+    range_n_outliers,
+    err_loc_emp_pure.mean(1),
+    yerr=err_loc_emp_pure.std(1) / np.sqrt(repeat),
+    label="Pure data set mean",
+    lw=lw,
+    color="black",
+)
 plt.title("Influence of outliers on the location estimation")
 plt.ylabel(r"Error ($||\mu - \hat{\mu}||_2^2$)")
 plt.legend(loc="upper left", prop=font_prop)
 
 plt.subplot(2, 1, 2)
 x_size = range_n_outliers.size
-plt.errorbar(range_n_outliers, err_cov_mcd.mean(1),
-             yerr=err_cov_mcd.std(1),
-             label="Robust covariance (mcd)", color='m')
-plt.errorbar(range_n_outliers[:(x_size // 5 + 1)],
-             err_cov_emp_full.mean(1)[:(x_size // 5 + 1)],
-             yerr=err_cov_emp_full.std(1)[:(x_size // 5 + 1)],
-             label="Full data set empirical covariance", color='green')
-plt.plot(range_n_outliers[(x_size // 5):(x_size // 2 - 1)],
-         err_cov_emp_full.mean(1)[(x_size // 5):(x_size // 2 - 1)],
-         color='green', ls='--')
-plt.errorbar(range_n_outliers, err_cov_emp_pure.mean(1),
-             yerr=err_cov_emp_pure.std(1),
-             label="Pure data set empirical covariance", color='black')
+plt.errorbar(
+    range_n_outliers,
+    err_cov_mcd.mean(1),
+    yerr=err_cov_mcd.std(1),
+    label="Robust covariance (mcd)",
+    color="m",
+)
+plt.errorbar(
+    range_n_outliers[: (x_size // 5 + 1)],
+    err_cov_emp_full.mean(1)[: (x_size // 5 + 1)],
+    yerr=err_cov_emp_full.std(1)[: (x_size // 5 + 1)],
+    label="Full data set empirical covariance",
+    color="green",
+)
+plt.plot(
+    range_n_outliers[(x_size // 5) : (x_size // 2 - 1)],
+    err_cov_emp_full.mean(1)[(x_size // 5) : (x_size // 2 - 1)],
+    color="green",
+    ls="--",
+)
+plt.errorbar(
+    range_n_outliers,
+    err_cov_emp_pure.mean(1),
+    yerr=err_cov_emp_pure.std(1),
+    label="Pure data set empirical covariance",
+    color="black",
+)
 plt.title("Influence of outliers on the covariance estimation")
 plt.xlabel("Amount of contamination (%)")
 plt.ylabel("RMSE")
diff --git a/examples/covariance/plot_sparse_cov.py b/examples/covariance/plot_sparse_cov.py
index c595e7d3d9661..ddb8de244a3b9 100644
--- a/examples/covariance/plot_sparse_cov.py
+++ b/examples/covariance/plot_sparse_cov.py
@@ -65,10 +65,9 @@
 n_features = 20
 
 prng = np.random.RandomState(1)
-prec = make_sparse_spd_matrix(n_features, alpha=.98,
-                              smallest_coef=.4,
-                              largest_coef=.7,
-                              random_state=prng)
+prec = make_sparse_spd_matrix(
+    n_features, alpha=0.98, smallest_coef=0.4, largest_coef=0.7, random_state=prng
+)
 cov = linalg.inv(prec)
 d = np.sqrt(np.diag(cov))
 cov /= d
@@ -97,42 +96,55 @@
 plt.subplots_adjust(left=0.02, right=0.98)
 
 # plot the covariances
-covs = [('Empirical', emp_cov), ('Ledoit-Wolf', lw_cov_),
-        ('GraphicalLassoCV', cov_), ('True', cov)]
+covs = [
+    ("Empirical", emp_cov),
+    ("Ledoit-Wolf", lw_cov_),
+    ("GraphicalLassoCV", cov_),
+    ("True", cov),
+]
 vmax = cov_.max()
 for i, (name, this_cov) in enumerate(covs):
     plt.subplot(2, 4, i + 1)
-    plt.imshow(this_cov, interpolation='nearest', vmin=-vmax, vmax=vmax,
-               cmap=plt.cm.RdBu_r)
+    plt.imshow(
+        this_cov, interpolation="nearest", vmin=-vmax, vmax=vmax, cmap=plt.cm.RdBu_r
+    )
     plt.xticks(())
     plt.yticks(())
-    plt.title('%s covariance' % name)
+    plt.title("%s covariance" % name)
 
 
 # plot the precisions
-precs = [('Empirical', linalg.inv(emp_cov)), ('Ledoit-Wolf', lw_prec_),
-         ('GraphicalLasso', prec_), ('True', prec)]
-vmax = .9 * prec_.max()
+precs = [
+    ("Empirical", linalg.inv(emp_cov)),
+    ("Ledoit-Wolf", lw_prec_),
+    ("GraphicalLasso", prec_),
+    ("True", prec),
+]
+vmax = 0.9 * prec_.max()
 for i, (name, this_prec) in enumerate(precs):
     ax = plt.subplot(2, 4, i + 5)
-    plt.imshow(np.ma.masked_equal(this_prec, 0),
-               interpolation='nearest', vmin=-vmax, vmax=vmax,
-               cmap=plt.cm.RdBu_r)
+    plt.imshow(
+        np.ma.masked_equal(this_prec, 0),
+        interpolation="nearest",
+        vmin=-vmax,
+        vmax=vmax,
+        cmap=plt.cm.RdBu_r,
+    )
     plt.xticks(())
     plt.yticks(())
-    plt.title('%s precision' % name)
-    if hasattr(ax, 'set_facecolor'):
-        ax.set_facecolor('.7')
+    plt.title("%s precision" % name)
+    if hasattr(ax, "set_facecolor"):
+        ax.set_facecolor(".7")
     else:
-        ax.set_axis_bgcolor('.7')
+        ax.set_axis_bgcolor(".7")
 
 # plot the model selection metric
 plt.figure(figsize=(4, 3))
-plt.axes([.2, .15, .75, .7])
-plt.plot(model.cv_results_["alphas"], model.cv_results_["mean_score"], 'o-')
-plt.axvline(model.alpha_, color='.5')
-plt.title('Model selection')
-plt.ylabel('Cross-validation score')
-plt.xlabel('alpha')
+plt.axes([0.2, 0.15, 0.75, 0.7])
+plt.plot(model.cv_results_["alphas"], model.cv_results_["mean_score"], "o-")
+plt.axvline(model.alpha_, color=".5")
+plt.title("Model selection")
+plt.ylabel("Cross-validation score")
+plt.xlabel("alpha")
 
 plt.show()
diff --git a/examples/cross_decomposition/plot_compare_cross_decomposition.py b/examples/cross_decomposition/plot_compare_cross_decomposition.py
index 2f1f1d21f30d5..21b735e401711 100644
--- a/examples/cross_decomposition/plot_compare_cross_decomposition.py
+++ b/examples/cross_decomposition/plot_compare_cross_decomposition.py
@@ -36,10 +36,10 @@
 X = latents + np.random.normal(size=4 * n).reshape((n, 4))
 Y = latents + np.random.normal(size=4 * n).reshape((n, 4))
 
-X_train = X[:n // 2]
-Y_train = Y[:n // 2]
-X_test = X[n // 2:]
-Y_test = Y[n // 2:]
+X_train = X[: n // 2]
+Y_train = Y[: n // 2]
+X_test = X[n // 2 :]
+Y_test = Y[n // 2 :]
 
 print("Corr(X)")
 print(np.round(np.corrcoef(X.T), 2))
@@ -61,54 +61,54 @@
 # 1) On diagonal plot X vs Y scores on each components
 plt.figure(figsize=(12, 8))
 plt.subplot(221)
-plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train",
-            marker="o", s=25)
-plt.scatter(X_test_r[:, 0], Y_test_r[:, 0], label="test",
-            marker="o", s=25)
+plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train", marker="o", s=25)
+plt.scatter(X_test_r[:, 0], Y_test_r[:, 0], label="test", marker="o", s=25)
 plt.xlabel("x scores")
 plt.ylabel("y scores")
-plt.title('Comp. 1: X vs Y (test corr = %.2f)' %
-          np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
+plt.title(
+    "Comp. 1: X vs Y (test corr = %.2f)"
+    % np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1]
+)
 plt.xticks(())
 plt.yticks(())
 plt.legend(loc="best")
 
 plt.subplot(224)
-plt.scatter(X_train_r[:, 1], Y_train_r[:, 1], label="train",
-            marker="o", s=25)
-plt.scatter(X_test_r[:, 1], Y_test_r[:, 1], label="test",
-            marker="o", s=25)
+plt.scatter(X_train_r[:, 1], Y_train_r[:, 1], label="train", marker="o", s=25)
+plt.scatter(X_test_r[:, 1], Y_test_r[:, 1], label="test", marker="o", s=25)
 plt.xlabel("x scores")
 plt.ylabel("y scores")
-plt.title('Comp. 2: X vs Y (test corr = %.2f)' %
-          np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
+plt.title(
+    "Comp. 2: X vs Y (test corr = %.2f)"
+    % np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1]
+)
 plt.xticks(())
 plt.yticks(())
 plt.legend(loc="best")
 
 # 2) Off diagonal plot components 1 vs 2 for X and Y
 plt.subplot(222)
-plt.scatter(X_train_r[:, 0], X_train_r[:, 1], label="train",
-            marker="*", s=50)
-plt.scatter(X_test_r[:, 0], X_test_r[:, 1], label="test",
-            marker="*", s=50)
+plt.scatter(X_train_r[:, 0], X_train_r[:, 1], label="train", marker="*", s=50)
+plt.scatter(X_test_r[:, 0], X_test_r[:, 1], label="test", marker="*", s=50)
 plt.xlabel("X comp. 1")
 plt.ylabel("X comp. 2")
-plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)'
-          % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1])
+plt.title(
+    "X comp. 1 vs X comp. 2 (test corr = %.2f)"
+    % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1]
+)
 plt.legend(loc="best")
 plt.xticks(())
 plt.yticks(())
 
 plt.subplot(223)
-plt.scatter(Y_train_r[:, 0], Y_train_r[:, 1], label="train",
-            marker="*", s=50)
-plt.scatter(Y_test_r[:, 0], Y_test_r[:, 1], label="test",
-            marker="*", s=50)
+plt.scatter(Y_train_r[:, 0], Y_train_r[:, 1], label="train", marker="*", s=50)
+plt.scatter(Y_test_r[:, 0], Y_test_r[:, 1], label="test", marker="*", s=50)
 plt.xlabel("Y comp. 1")
 plt.ylabel("Y comp. 2")
-plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)'
-          % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
+plt.title(
+    "Y comp. 1 vs Y comp. 2 , (test corr = %.2f)"
+    % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1]
+)
 plt.legend(loc="best")
 plt.xticks(())
 plt.yticks(())
diff --git a/examples/cross_decomposition/plot_pcr_vs_pls.py b/examples/cross_decomposition/plot_pcr_vs_pls.py
index cc22f3bd0ebc6..09633e988c1f1 100644
--- a/examples/cross_decomposition/plot_pcr_vs_pls.py
+++ b/examples/cross_decomposition/plot_pcr_vs_pls.py
@@ -48,20 +48,27 @@
 
 rng = np.random.RandomState(0)
 n_samples = 500
-cov = [[3, 3],
-       [3, 4]]
+cov = [[3, 3], [3, 4]]
 X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
 pca = PCA(n_components=2).fit(X)
 
 
-plt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples')
+plt.scatter(X[:, 0], X[:, 1], alpha=0.3, label="samples")
 for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
     comp = comp * var  # scale component by its variance explanation power
-    plt.plot([0, comp[0]], [0, comp[1]], label=f"Component {i}", linewidth=5,
-             color=f"C{i + 2}")
-plt.gca().set(aspect='equal',
-              title="2-dimensional dataset with principal components",
-              xlabel='first feature', ylabel='second feature')
+    plt.plot(
+        [0, comp[0]],
+        [0, comp[1]],
+        label=f"Component {i}",
+        linewidth=5,
+        color=f"C{i + 2}",
+    )
+plt.gca().set(
+    aspect="equal",
+    title="2-dimensional dataset with principal components",
+    xlabel="first feature",
+    ylabel="second feature",
+)
 plt.legend()
 plt.show()
 
@@ -74,10 +81,10 @@
 
 fig, axes = plt.subplots(1, 2, figsize=(10, 3))
 
-axes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3)
-axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y')
-axes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3)
-axes[1].set(xlabel='Projected data onto second PCA component', ylabel='y')
+axes[0].scatter(X.dot(pca.components_[0]), y, alpha=0.3)
+axes[0].set(xlabel="Projected data onto first PCA component", ylabel="y")
+axes[1].scatter(X.dot(pca.components_[1]), y, alpha=0.3)
+axes[1].set(xlabel="Projected data onto second PCA component", ylabel="y")
 plt.tight_layout()
 plt.show()
 
@@ -104,23 +111,25 @@
 
 pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
 pcr.fit(X_train, y_train)
-pca = pcr.named_steps['pca']  # retrieve the PCA step of the pipeline
+pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline
 
 pls = PLSRegression(n_components=1)
 pls.fit(X_train, y_train)
 
 fig, axes = plt.subplots(1, 2, figsize=(10, 3))
-axes[0].scatter(pca.transform(X_test), y_test, alpha=.3, label='ground truth')
-axes[0].scatter(pca.transform(X_test), pcr.predict(X_test), alpha=.3,
-                label='predictions')
-axes[0].set(xlabel='Projected data onto first PCA component',
-            ylabel='y', title='PCR / PCA')
+axes[0].scatter(pca.transform(X_test), y_test, alpha=0.3, label="ground truth")
+axes[0].scatter(
+    pca.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions"
+)
+axes[0].set(
+    xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA"
+)
 axes[0].legend()
-axes[1].scatter(pls.transform(X_test), y_test, alpha=.3, label='ground truth')
-axes[1].scatter(pls.transform(X_test), pls.predict(X_test), alpha=.3,
-                label='predictions')
-axes[1].set(xlabel='Projected data onto first PLS component',
-            ylabel='y', title='PLS')
+axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth")
+axes[1].scatter(
+    pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions"
+)
+axes[1].set(xlabel="Projected data onto first PLS component", ylabel="y", title="PLS")
 axes[1].legend()
 plt.tight_layout()
 plt.show()
diff --git a/examples/datasets/plot_digits_last_image.py b/examples/datasets/plot_digits_last_image.py
index 27314b6c9dcdb..98620d98702c7 100644
--- a/examples/datasets/plot_digits_last_image.py
+++ b/examples/datasets/plot_digits_last_image.py
@@ -31,5 +31,5 @@
 
 # Display the first digit
 plt.figure(1, figsize=(3, 3))
-plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation='nearest')
+plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation="nearest")
 plt.show()
diff --git a/examples/datasets/plot_iris_dataset.py b/examples/datasets/plot_iris_dataset.py
index d6e23253aa53e..c37057d230c1d 100644
--- a/examples/datasets/plot_iris_dataset.py
+++ b/examples/datasets/plot_iris_dataset.py
@@ -33,17 +33,16 @@
 X = iris.data[:, :2]  # we only take the first two features.
 y = iris.target
 
-x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
-y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
+y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
 
 plt.figure(2, figsize=(8, 6))
 plt.clf()
 
 # Plot the training points
-plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
-            edgecolor='k')
-plt.xlabel('Sepal length')
-plt.ylabel('Sepal width')
+plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor="k")
+plt.xlabel("Sepal length")
+plt.ylabel("Sepal width")
 
 plt.xlim(x_min, x_max)
 plt.ylim(y_min, y_max)
@@ -55,8 +54,15 @@
 fig = plt.figure(1, figsize=(8, 6))
 ax = Axes3D(fig, elev=-150, azim=110)
 X_reduced = PCA(n_components=3).fit_transform(iris.data)
-ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
-           cmap=plt.cm.Set1, edgecolor='k', s=40)
+ax.scatter(
+    X_reduced[:, 0],
+    X_reduced[:, 1],
+    X_reduced[:, 2],
+    c=y,
+    cmap=plt.cm.Set1,
+    edgecolor="k",
+    s=40,
+)
 ax.set_title("First three PCA directions")
 ax.set_xlabel("1st eigenvector")
 ax.w_xaxis.set_ticklabels([])
diff --git a/examples/datasets/plot_random_dataset.py b/examples/datasets/plot_random_dataset.py
index 2f8d4be8ac383..7a94eaa5550f7 100644
--- a/examples/datasets/plot_random_dataset.py
+++ b/examples/datasets/plot_random_dataset.py
@@ -22,47 +22,42 @@
 from sklearn.datasets import make_gaussian_quantiles
 
 plt.figure(figsize=(8, 8))
-plt.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95)
+plt.subplots_adjust(bottom=0.05, top=0.9, left=0.05, right=0.95)
 
 plt.subplot(321)
-plt.title("One informative feature, one cluster per class", fontsize='small')
-X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=1,
-                             n_clusters_per_class=1)
-plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
-            s=25, edgecolor='k')
+plt.title("One informative feature, one cluster per class", fontsize="small")
+X1, Y1 = make_classification(
+    n_features=2, n_redundant=0, n_informative=1, n_clusters_per_class=1
+)
+plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k")
 
 plt.subplot(322)
-plt.title("Two informative features, one cluster per class", fontsize='small')
-X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2,
-                             n_clusters_per_class=1)
-plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
-            s=25, edgecolor='k')
+plt.title("Two informative features, one cluster per class", fontsize="small")
+X1, Y1 = make_classification(
+    n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1
+)
+plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k")
 
 plt.subplot(323)
-plt.title("Two informative features, two clusters per class",
-          fontsize='small')
+plt.title("Two informative features, two clusters per class", fontsize="small")
 X2, Y2 = make_classification(n_features=2, n_redundant=0, n_informative=2)
-plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2,
-            s=25, edgecolor='k')
+plt.scatter(X2[:, 0], X2[:, 1], marker="o", c=Y2, s=25, edgecolor="k")
 
 plt.subplot(324)
-plt.title("Multi-class, two informative features, one cluster",
-          fontsize='small')
-X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2,
-                             n_clusters_per_class=1, n_classes=3)
-plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
-            s=25, edgecolor='k')
+plt.title("Multi-class, two informative features, one cluster", fontsize="small")
+X1, Y1 = make_classification(
+    n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, n_classes=3
+)
+plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k")
 
 plt.subplot(325)
-plt.title("Three blobs", fontsize='small')
+plt.title("Three blobs", fontsize="small")
 X1, Y1 = make_blobs(n_features=2, centers=3)
-plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
-            s=25, edgecolor='k')
+plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k")
 
 plt.subplot(326)
-plt.title("Gaussian divided into three quantiles", fontsize='small')
+plt.title("Gaussian divided into three quantiles", fontsize="small")
 X1, Y1 = make_gaussian_quantiles(n_features=2, n_classes=3)
-plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
-            s=25, edgecolor='k')
+plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k")
 
 plt.show()
diff --git a/examples/datasets/plot_random_multilabel_dataset.py b/examples/datasets/plot_random_multilabel_dataset.py
index 5cb54689d64be..a7ceba36e30db 100644
--- a/examples/datasets/plot_random_multilabel_dataset.py
+++ b/examples/datasets/plot_random_multilabel_dataset.py
@@ -41,15 +41,18 @@
 
 print(__doc__)
 
-COLORS = np.array(['!',
-                   '#FF3333',  # red
-                   '#0198E1',  # blue
-                   '#BF5FFF',  # purple
-                   '#FCD116',  # yellow
-                   '#FF7216',  # orange
-                   '#4DBD33',  # green
-                   '#87421F'   # brown
-                   ])
+COLORS = np.array(
+    [
+        "!",
+        "#FF3333",  # red
+        "#0198E1",  # blue
+        "#BF5FFF",  # purple
+        "#FCD116",  # yellow
+        "#FF7216",  # orange
+        "#4DBD33",  # green
+        "#87421F",  # brown
+    ]
+)
 
 # Use same random seed for multiple calls to make_multilabel_classification to
 # ensure same distributions
@@ -57,38 +60,48 @@
 
 
 def plot_2d(ax, n_labels=1, n_classes=3, length=50):
-    X, Y, p_c, p_w_c = make_ml_clf(n_samples=150, n_features=2,
-                                   n_classes=n_classes, n_labels=n_labels,
-                                   length=length, allow_unlabeled=False,
-                                   return_distributions=True,
-                                   random_state=RANDOM_SEED)
-
-    ax.scatter(X[:, 0], X[:, 1], color=COLORS.take((Y * [1, 2, 4]
-                                                    ).sum(axis=1)),
-               marker='.')
-    ax.scatter(p_w_c[0] * length, p_w_c[1] * length,
-               marker='*', linewidth=.5, edgecolor='black',
-               s=20 + 1500 * p_c ** 2,
-               color=COLORS.take([1, 2, 4]))
-    ax.set_xlabel('Feature 0 count')
+    X, Y, p_c, p_w_c = make_ml_clf(
+        n_samples=150,
+        n_features=2,
+        n_classes=n_classes,
+        n_labels=n_labels,
+        length=length,
+        allow_unlabeled=False,
+        return_distributions=True,
+        random_state=RANDOM_SEED,
+    )
+
+    ax.scatter(
+        X[:, 0], X[:, 1], color=COLORS.take((Y * [1, 2, 4]).sum(axis=1)), marker="."
+    )
+    ax.scatter(
+        p_w_c[0] * length,
+        p_w_c[1] * length,
+        marker="*",
+        linewidth=0.5,
+        edgecolor="black",
+        s=20 + 1500 * p_c ** 2,
+        color=COLORS.take([1, 2, 4]),
+    )
+    ax.set_xlabel("Feature 0 count")
     return p_c, p_w_c
 
 
-_, (ax1, ax2) = plt.subplots(1, 2, sharex='row', sharey='row', figsize=(8, 4))
-plt.subplots_adjust(bottom=.15)
+_, (ax1, ax2) = plt.subplots(1, 2, sharex="row", sharey="row", figsize=(8, 4))
+plt.subplots_adjust(bottom=0.15)
 
 p_c, p_w_c = plot_2d(ax1, n_labels=1)
-ax1.set_title('n_labels=1, length=50')
-ax1.set_ylabel('Feature 1 count')
+ax1.set_title("n_labels=1, length=50")
+ax1.set_ylabel("Feature 1 count")
 
 plot_2d(ax2, n_labels=3)
-ax2.set_title('n_labels=3, length=50')
+ax2.set_title("n_labels=3, length=50")
 ax2.set_xlim(left=0, auto=True)
 ax2.set_ylim(bottom=0, auto=True)
 
 plt.show()
 
-print('The data was generated from (random_state=%d):' % RANDOM_SEED)
-print('Class', 'P(C)', 'P(w0|C)', 'P(w1|C)', sep='\t')
-for k, p, p_w in zip(['red', 'blue', 'yellow'], p_c, p_w_c.T):
-    print('%s\t%0.2f\t%0.2f\t%0.2f' % (k, p, p_w[0], p_w[1]))
+print("The data was generated from (random_state=%d):" % RANDOM_SEED)
+print("Class", "P(C)", "P(w0|C)", "P(w1|C)", sep="\t")
+for k, p, p_w in zip(["red", "blue", "yellow"], p_c, p_w_c.T):
+    print("%s\t%0.2f\t%0.2f\t%0.2f" % (k, p, p_w[0], p_w[1]))
diff --git a/examples/decomposition/plot_beta_divergence.py b/examples/decomposition/plot_beta_divergence.py
index 8f39039446e88..41c908e273c72 100644
--- a/examples/decomposition/plot_beta_divergence.py
+++ b/examples/decomposition/plot_beta_divergence.py
@@ -15,8 +15,8 @@
 x = np.linspace(0.001, 4, 1000)
 y = np.zeros(x.shape)
 
-colors = 'mbgyr'
-for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):
+colors = "mbgyr"
+for j, beta in enumerate((0.0, 0.5, 1.0, 1.5, 2.0)):
     for i, xi in enumerate(x):
         y[i] = _beta_divergence(1, xi, 1, beta)
     name = "beta = %1.1f" % beta
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py
index 84e6f923f0d3b..7c873e867aa8b 100644
--- a/examples/decomposition/plot_faces_decomposition.py
+++ b/examples/decomposition/plot_faces_decomposition.py
@@ -25,8 +25,7 @@
 from sklearn import decomposition
 
 # Display progress logs on stdout
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s %(levelname)s %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 n_row, n_col = 2, 3
 n_components = n_row * n_col
 image_shape = (64, 64)
@@ -34,8 +33,7 @@
 
 # #############################################################################
 # Load faces data
-faces, _ = fetch_olivetti_faces(return_X_y=True, shuffle=True,
-                                random_state=rng)
+faces, _ = fetch_olivetti_faces(return_X_y=True, shuffle=True, random_state=rng)
 n_samples, n_features = faces.shape
 
 # global centering
@@ -48,56 +46,78 @@
 
 
 def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
-    plt.figure(figsize=(2. * n_col, 2.26 * n_row))
+    plt.figure(figsize=(2.0 * n_col, 2.26 * n_row))
     plt.suptitle(title, size=16)
     for i, comp in enumerate(images):
         plt.subplot(n_row, n_col, i + 1)
         vmax = max(comp.max(), -comp.min())
-        plt.imshow(comp.reshape(image_shape), cmap=cmap,
-                   interpolation='nearest',
-                   vmin=-vmax, vmax=vmax)
+        plt.imshow(
+            comp.reshape(image_shape),
+            cmap=cmap,
+            interpolation="nearest",
+            vmin=-vmax,
+            vmax=vmax,
+        )
         plt.xticks(())
         plt.yticks(())
-    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
+    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.0)
 
 
 # #############################################################################
 # List of the different estimators, whether to center and transpose the
 # problem, and whether the transformer uses the clustering API.
 estimators = [
-    ('Eigenfaces - PCA using randomized SVD',
-     decomposition.PCA(n_components=n_components, svd_solver='randomized',
-                       whiten=True),
-     True),
-
-    ('Non-negative components - NMF',
-     decomposition.NMF(n_components=n_components, init='nndsvda', tol=5e-3),
-     False),
-
-    ('Independent components - FastICA',
-     decomposition.FastICA(n_components=n_components, whiten=True),
-     True),
-
-    ('Sparse comp. - MiniBatchSparsePCA',
-     decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8,
-                                      n_iter=100, batch_size=3,
-                                      random_state=rng),
-     True),
-
-    ('MiniBatchDictionaryLearning',
-        decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
-                                                  n_iter=50, batch_size=3,
-                                                  random_state=rng),
-     True),
-
-    ('Cluster centers - MiniBatchKMeans',
-        MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
-                        max_iter=50, random_state=rng),
-     True),
-
-    ('Factor Analysis components - FA',
-     decomposition.FactorAnalysis(n_components=n_components, max_iter=20),
-     True),
+    (
+        "Eigenfaces - PCA using randomized SVD",
+        decomposition.PCA(
+            n_components=n_components, svd_solver="randomized", whiten=True
+        ),
+        True,
+    ),
+    (
+        "Non-negative components - NMF",
+        decomposition.NMF(n_components=n_components, init="nndsvda", tol=5e-3),
+        False,
+    ),
+    (
+        "Independent components - FastICA",
+        decomposition.FastICA(n_components=n_components, whiten=True),
+        True,
+    ),
+    (
+        "Sparse comp. - MiniBatchSparsePCA",
+        decomposition.MiniBatchSparsePCA(
+            n_components=n_components,
+            alpha=0.8,
+            n_iter=100,
+            batch_size=3,
+            random_state=rng,
+        ),
+        True,
+    ),
+    (
+        "MiniBatchDictionaryLearning",
+        decomposition.MiniBatchDictionaryLearning(
+            n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng
+        ),
+        True,
+    ),
+    (
+        "Cluster centers - MiniBatchKMeans",
+        MiniBatchKMeans(
+            n_clusters=n_components,
+            tol=1e-3,
+            batch_size=20,
+            max_iter=50,
+            random_state=rng,
+        ),
+        True,
+    ),
+    (
+        "Factor Analysis components - FA",
+        decomposition.FactorAnalysis(n_components=n_components, max_iter=20),
+        True,
+    ),
 ]
 
 
@@ -116,9 +136,9 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
     if center:
         data = faces_centered
     estimator.fit(data)
-    train_time = (time() - t0)
+    train_time = time() - t0
     print("done in %0.3fs" % train_time)
-    if hasattr(estimator, 'cluster_centers_'):
+    if hasattr(estimator, "cluster_centers_"):
         components_ = estimator.cluster_centers_
     else:
         components_ = estimator.components_
@@ -128,53 +148,79 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
     # via the PCA decomposition, also provides a scalar noise_variance_
     # (the mean of pixelwise variance) that cannot be displayed as an image
     # so we skip it.
-    if (hasattr(estimator, 'noise_variance_') and
-            estimator.noise_variance_.ndim > 0):  # Skip the Eigenfaces case
-        plot_gallery("Pixelwise variance",
-                     estimator.noise_variance_.reshape(1, -1), n_col=1,
-                     n_row=1)
-    plot_gallery('%s - Train time %.1fs' % (name, train_time),
-                 components_[:n_components])
+    if (
+        hasattr(estimator, "noise_variance_") and estimator.noise_variance_.ndim > 0
+    ):  # Skip the Eigenfaces case
+        plot_gallery(
+            "Pixelwise variance",
+            estimator.noise_variance_.reshape(1, -1),
+            n_col=1,
+            n_row=1,
+        )
+    plot_gallery(
+        "%s - Train time %.1fs" % (name, train_time), components_[:n_components]
+    )
 
 plt.show()
 
 # #############################################################################
 # Various positivity constraints applied to dictionary learning.
 estimators = [
-    ('Dictionary learning',
-        decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
-                                                  n_iter=50, batch_size=3,
-                                                  random_state=rng),
-     True),
-    ('Dictionary learning - positive dictionary',
-        decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
-                                                  n_iter=50, batch_size=3,
-                                                  random_state=rng,
-                                                  positive_dict=True),
-     True),
-    ('Dictionary learning - positive code',
-        decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
-                                                  n_iter=50, batch_size=3,
-                                                  fit_algorithm='cd',
-                                                  random_state=rng,
-                                                  positive_code=True),
-     True),
-    ('Dictionary learning - positive dictionary & code',
-        decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
-                                                  n_iter=50, batch_size=3,
-                                                  fit_algorithm='cd',
-                                                  random_state=rng,
-                                                  positive_dict=True,
-                                                  positive_code=True),
-     True),
+    (
+        "Dictionary learning",
+        decomposition.MiniBatchDictionaryLearning(
+            n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng
+        ),
+        True,
+    ),
+    (
+        "Dictionary learning - positive dictionary",
+        decomposition.MiniBatchDictionaryLearning(
+            n_components=15,
+            alpha=0.1,
+            n_iter=50,
+            batch_size=3,
+            random_state=rng,
+            positive_dict=True,
+        ),
+        True,
+    ),
+    (
+        "Dictionary learning - positive code",
+        decomposition.MiniBatchDictionaryLearning(
+            n_components=15,
+            alpha=0.1,
+            n_iter=50,
+            batch_size=3,
+            fit_algorithm="cd",
+            random_state=rng,
+            positive_code=True,
+        ),
+        True,
+    ),
+    (
+        "Dictionary learning - positive dictionary & code",
+        decomposition.MiniBatchDictionaryLearning(
+            n_components=15,
+            alpha=0.1,
+            n_iter=50,
+            batch_size=3,
+            fit_algorithm="cd",
+            random_state=rng,
+            positive_dict=True,
+            positive_code=True,
+        ),
+        True,
+    ),
 ]
 
 
 # #############################################################################
 # Plot a sample of the input data
 
-plot_gallery("First centered Olivetti faces", faces_centered[:n_components],
-             cmap=plt.cm.RdBu)
+plot_gallery(
+    "First centered Olivetti faces", faces_centered[:n_components], cmap=plt.cm.RdBu
+)
 
 # #############################################################################
 # Do the estimation and plot it
@@ -186,7 +232,7 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
     if center:
         data = faces_centered
     estimator.fit(data)
-    train_time = (time() - t0)
+    train_time = time() - t0
     print("done in %0.3fs" % train_time)
     components_ = estimator.components_
     plot_gallery(name, components_[:n_components], cmap=plt.cm.RdBu)
diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py
index 92fda7c20adf2..bbb9ac1c897e6 100644
--- a/examples/decomposition/plot_ica_blind_source_separation.py
+++ b/examples/decomposition/plot_ica_blind_source_separation.py
@@ -57,11 +57,13 @@
 plt.figure()
 
 models = [X, S, S_, H]
-names = ['Observations (mixed signal)',
-         'True Sources',
-         'ICA recovered signals',
-         'PCA recovered signals']
-colors = ['red', 'steelblue', 'orange']
+names = [
+    "Observations (mixed signal)",
+    "True Sources",
+    "ICA recovered signals",
+    "PCA recovered signals",
+]
+colors = ["red", "steelblue", "orange"]
 
 for ii, (model, name) in enumerate(zip(models, names), 1):
     plt.subplot(4, 1, ii)
diff --git a/examples/decomposition/plot_ica_vs_pca.py b/examples/decomposition/plot_ica_vs_pca.py
index b4fa513dbc36e..769fe47a028f4 100644
--- a/examples/decomposition/plot_ica_vs_pca.py
+++ b/examples/decomposition/plot_ica_vs_pca.py
@@ -41,7 +41,7 @@
 # Generate sample data
 rng = np.random.RandomState(42)
 S = rng.standard_t(1.5, size=(20000, 2))
-S[:, 0] *= 2.
+S[:, 0] *= 2.0
 
 # Mix data
 A = np.array([[1, 1], [0, 2]])  # Mixing matrix
@@ -60,47 +60,57 @@
 # #############################################################################
 # Plot results
 
+
 def plot_samples(S, axis_list=None):
-    plt.scatter(S[:, 0], S[:, 1], s=2, marker='o', zorder=10,
-                color='steelblue', alpha=0.5)
+    plt.scatter(
+        S[:, 0], S[:, 1], s=2, marker="o", zorder=10, color="steelblue", alpha=0.5
+    )
     if axis_list is not None:
-        colors = ['orange', 'red']
+        colors = ["orange", "red"]
         for color, axis in zip(colors, axis_list):
             axis /= axis.std()
             x_axis, y_axis = axis
             # Trick to get legend to work
             plt.plot(0.1 * x_axis, 0.1 * y_axis, linewidth=2, color=color)
-            plt.quiver((0, 0), (0, 0), x_axis, y_axis, zorder=11, width=0.01,
-                       scale=6, color=color)
+            plt.quiver(
+                (0, 0),
+                (0, 0),
+                x_axis,
+                y_axis,
+                zorder=11,
+                width=0.01,
+                scale=6,
+                color=color,
+            )
 
     plt.hlines(0, -3, 3)
     plt.vlines(0, -3, 3)
     plt.xlim(-3, 3)
     plt.ylim(-3, 3)
-    plt.xlabel('x')
-    plt.ylabel('y')
+    plt.xlabel("x")
+    plt.ylabel("y")
 
 
 plt.figure()
 plt.subplot(2, 2, 1)
 plot_samples(S / S.std())
-plt.title('True Independent Sources')
+plt.title("True Independent Sources")
 
 axis_list = [pca.components_.T, ica.mixing_]
 plt.subplot(2, 2, 2)
 plot_samples(X / np.std(X), axis_list=axis_list)
-legend = plt.legend(['PCA', 'ICA'], loc='upper right')
+legend = plt.legend(["PCA", "ICA"], loc="upper right")
 legend.set_zorder(100)
 
-plt.title('Observations')
+plt.title("Observations")
 
 plt.subplot(2, 2, 3)
 plot_samples(S_pca_ / np.std(S_pca_, axis=0))
-plt.title('PCA recovered signals')
+plt.title("PCA recovered signals")
 
 plt.subplot(2, 2, 4)
 plot_samples(S_ica_ / np.std(S_ica_))
-plt.title('ICA recovered signals')
+plt.title("ICA recovered signals")
 
 plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.36)
 plt.show()
diff --git a/examples/decomposition/plot_image_denoising.py b/examples/decomposition/plot_image_denoising.py
index 0cf505b7548c5..df08e052d3ec1 100644
--- a/examples/decomposition/plot_image_denoising.py
+++ b/examples/decomposition/plot_image_denoising.py
@@ -46,13 +46,14 @@
 
 try:  # SciPy >= 0.16 have face in misc
     from scipy.misc import face
+
     face = face(gray=True)
 except ImportError:
     face = sp.face(gray=True)
 
 # Convert from uint8 representation with values between 0 and 255 to
 # a floating point representation with values between 0 and 1.
-face = face / 255.
+face = face / 255.0
 
 # downsample for higher speed
 face = face[::4, ::4] + face[1::4, ::4] + face[::4, 1::4] + face[1::4, 1::4]
@@ -60,92 +61,92 @@
 height, width = face.shape
 
 # Distort the right half of the image
-print('Distorting image...')
+print("Distorting image...")
 distorted = face.copy()
-distorted[:, width // 2:] += 0.075 * np.random.randn(height, width // 2)
+distorted[:, width // 2 :] += 0.075 * np.random.randn(height, width // 2)
 
 # Extract all reference patches from the left half of the image
-print('Extracting reference patches...')
+print("Extracting reference patches...")
 t0 = time()
 patch_size = (7, 7)
-data = extract_patches_2d(distorted[:, :width // 2], patch_size)
+data = extract_patches_2d(distorted[:, : width // 2], patch_size)
 data = data.reshape(data.shape[0], -1)
 data -= np.mean(data, axis=0)
 data /= np.std(data, axis=0)
-print('done in %.2fs.' % (time() - t0))
+print("done in %.2fs." % (time() - t0))
 
 # #############################################################################
 # Learn the dictionary from reference patches
 
-print('Learning the dictionary...')
+print("Learning the dictionary...")
 t0 = time()
 dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500)
 V = dico.fit(data).components_
 dt = time() - t0
-print('done in %.2fs.' % dt)
+print("done in %.2fs." % dt)
 
 plt.figure(figsize=(4.2, 4))
 for i, comp in enumerate(V[:100]):
     plt.subplot(10, 10, i + 1)
-    plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r,
-               interpolation='nearest')
+    plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r, interpolation="nearest")
     plt.xticks(())
     plt.yticks(())
-plt.suptitle('Dictionary learned from face patches\n' +
-             'Train time %.1fs on %d patches' % (dt, len(data)),
-             fontsize=16)
+plt.suptitle(
+    "Dictionary learned from face patches\n"
+    + "Train time %.1fs on %d patches" % (dt, len(data)),
+    fontsize=16,
+)
 plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
 
 
 # #############################################################################
 # Display the distorted image
 
+
 def show_with_diff(image, reference, title):
     """Helper function to display denoising"""
     plt.figure(figsize=(5, 3.3))
     plt.subplot(1, 2, 1)
-    plt.title('Image')
-    plt.imshow(image, vmin=0, vmax=1, cmap=plt.cm.gray,
-               interpolation='nearest')
+    plt.title("Image")
+    plt.imshow(image, vmin=0, vmax=1, cmap=plt.cm.gray, interpolation="nearest")
     plt.xticks(())
     plt.yticks(())
     plt.subplot(1, 2, 2)
     difference = image - reference
 
-    plt.title('Difference (norm: %.2f)' % np.sqrt(np.sum(difference ** 2)))
-    plt.imshow(difference, vmin=-0.5, vmax=0.5, cmap=plt.cm.PuOr,
-               interpolation='nearest')
+    plt.title("Difference (norm: %.2f)" % np.sqrt(np.sum(difference ** 2)))
+    plt.imshow(
+        difference, vmin=-0.5, vmax=0.5, cmap=plt.cm.PuOr, interpolation="nearest"
+    )
     plt.xticks(())
     plt.yticks(())
     plt.suptitle(title, size=16)
     plt.subplots_adjust(0.02, 0.02, 0.98, 0.79, 0.02, 0.2)
 
 
-show_with_diff(distorted, face, 'Distorted image')
+show_with_diff(distorted, face, "Distorted image")
 
 # #############################################################################
 # Extract noisy patches and reconstruct them using the dictionary
 
-print('Extracting noisy patches... ')
+print("Extracting noisy patches... ")
 t0 = time()
-data = extract_patches_2d(distorted[:, width // 2:], patch_size)
+data = extract_patches_2d(distorted[:, width // 2 :], patch_size)
 data = data.reshape(data.shape[0], -1)
 intercept = np.mean(data, axis=0)
 data -= intercept
-print('done in %.2fs.' % (time() - t0))
+print("done in %.2fs." % (time() - t0))
 
 transform_algorithms = [
-    ('Orthogonal Matching Pursuit\n1 atom', 'omp',
-     {'transform_n_nonzero_coefs': 1}),
-    ('Orthogonal Matching Pursuit\n2 atoms', 'omp',
-     {'transform_n_nonzero_coefs': 2}),
-    ('Least-angle regression\n5 atoms', 'lars',
-     {'transform_n_nonzero_coefs': 5}),
-    ('Thresholding\n alpha=0.1', 'threshold', {'transform_alpha': .1})]
+    ("Orthogonal Matching Pursuit\n1 atom", "omp", {"transform_n_nonzero_coefs": 1}),
+    ("Orthogonal Matching Pursuit\n2 atoms", "omp", {"transform_n_nonzero_coefs": 2}),
+    ("Least-angle regression\n5 atoms", "lars", {"transform_n_nonzero_coefs": 5}),
+    ("Thresholding\n alpha=0.1", "threshold", {"transform_alpha": 0.1}),
+]
 
 reconstructions = {}
 for title, transform_algorithm, kwargs in transform_algorithms:
-    print(title + '...')
+    print(title + "...")
     reconstructions[title] = face.copy()
     t0 = time()
     dico.set_params(transform_algorithm=transform_algorithm, **kwargs)
@@ -154,14 +155,14 @@ def show_with_diff(image, reference, title):
 
     patches += intercept
     patches = patches.reshape(len(data), *patch_size)
-    if transform_algorithm == 'threshold':
+    if transform_algorithm == "threshold":
         patches -= patches.min()
         patches /= patches.max()
-    reconstructions[title][:, width // 2:] = reconstruct_from_patches_2d(
-        patches, (height, width // 2))
+    reconstructions[title][:, width // 2 :] = reconstruct_from_patches_2d(
+        patches, (height, width // 2)
+    )
     dt = time() - t0
-    print('done in %.2fs.' % dt)
-    show_with_diff(reconstructions[title], face,
-                   title + ' (time: %.1fs)' % dt)
+    print("done in %.2fs." % dt)
+    show_with_diff(reconstructions[title], face, title + " (time: %.1fs)" % dt)
 
 plt.show()
diff --git a/examples/decomposition/plot_incremental_pca.py b/examples/decomposition/plot_incremental_pca.py
index 980f9d019ea1c..88cd6a679c479 100644
--- a/examples/decomposition/plot_incremental_pca.py
+++ b/examples/decomposition/plot_incremental_pca.py
@@ -40,18 +40,22 @@
 pca = PCA(n_components=n_components)
 X_pca = pca.fit_transform(X)
 
-colors = ['navy', 'turquoise', 'darkorange']
+colors = ["navy", "turquoise", "darkorange"]
 
 for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
     plt.figure(figsize=(8, 8))
     for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
-        plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1],
-                    color=color, lw=2, label=target_name)
+        plt.scatter(
+            X_transformed[y == i, 0],
+            X_transformed[y == i, 1],
+            color=color,
+            lw=2,
+            label=target_name,
+        )
 
     if "Incremental" in title:
         err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
-        plt.title(title + " of iris dataset\nMean absolute unsigned error "
-                  "%.6f" % err)
+        plt.title(title + " of iris dataset\nMean absolute unsigned error %.6f" % err)
     else:
         plt.title(title + " of iris dataset")
     plt.legend(loc="best", shadow=False, scatterpoints=1)
diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py
index cfec4f4ec8b1d..8a9ad066cb181 100644
--- a/examples/decomposition/plot_kernel_pca.py
+++ b/examples/decomposition/plot_kernel_pca.py
@@ -20,7 +20,7 @@
 
 np.random.seed(0)
 
-X, y = make_circles(n_samples=400, factor=.3, noise=.05)
+X, y = make_circles(n_samples=400, factor=0.3, noise=0.05)
 
 kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
 X_kpca = kpca.fit_transform(X)
@@ -31,15 +31,13 @@
 # Plot results
 
 plt.figure()
-plt.subplot(2, 2, 1, aspect='equal')
+plt.subplot(2, 2, 1, aspect="equal")
 plt.title("Original space")
 reds = y == 0
 blues = y == 1
 
-plt.scatter(X[reds, 0], X[reds, 1], c="red",
-            s=20, edgecolor='k')
-plt.scatter(X[blues, 0], X[blues, 1], c="blue",
-            s=20, edgecolor='k')
+plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k")
+plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k")
 plt.xlabel("$x_1$")
 plt.ylabel("$x_2$")
 
@@ -47,31 +45,25 @@
 X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
 # projection on the first principal component (in the phi space)
 Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
-plt.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower')
+plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower")
 
-plt.subplot(2, 2, 2, aspect='equal')
-plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red",
-            s=20, edgecolor='k')
-plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue",
-            s=20, edgecolor='k')
+plt.subplot(2, 2, 2, aspect="equal")
+plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", s=20, edgecolor="k")
+plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor="k")
 plt.title("Projection by PCA")
 plt.xlabel("1st principal component")
 plt.ylabel("2nd component")
 
-plt.subplot(2, 2, 3, aspect='equal')
-plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red",
-            s=20, edgecolor='k')
-plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue",
-            s=20, edgecolor='k')
+plt.subplot(2, 2, 3, aspect="equal")
+plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k")
+plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k")
 plt.title("Projection by KPCA")
 plt.xlabel(r"1st principal component in space induced by $\phi$")
 plt.ylabel("2nd component")
 
-plt.subplot(2, 2, 4, aspect='equal')
-plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red",
-            s=20, edgecolor='k')
-plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue",
-            s=20, edgecolor='k')
+plt.subplot(2, 2, 4, aspect="equal")
+plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", s=20, edgecolor="k")
+plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor="k")
 plt.title("Original space after inverse transform")
 plt.xlabel("$x_1$")
 plt.ylabel("$x_2$")
diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py
index 4d7a851a938da..27d3a34e2dd75 100644
--- a/examples/decomposition/plot_pca_3d.py
+++ b/examples/decomposition/plot_pca_3d.py
@@ -34,8 +34,7 @@
 
 
 def pdf(x):
-    return 0.5 * (stats.norm(scale=0.25 / e).pdf(x)
-                  + stats.norm(scale=4 / e).pdf(x))
+    return 0.5 * (stats.norm(scale=0.25 / e).pdf(x) + stats.norm(scale=4 / e).pdf(x))
 
 
 y = np.random.normal(scale=0.5, size=(30000))
@@ -61,9 +60,9 @@ def pdf(x):
 def plot_figs(fig_num, elev, azim):
     fig = plt.figure(fig_num, figsize=(4, 3))
     plt.clf()
-    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=elev, azim=azim)
+    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=elev, azim=azim)
 
-    ax.scatter(a[::10], b[::10], c[::10], c=density[::10], marker='+', alpha=.4)
+    ax.scatter(a[::10], b[::10], c[::10], c=density[::10], marker="+", alpha=0.4)
     Y = np.c_[a, b, c]
 
     # Using SciPy's SVD, this would be:
@@ -74,9 +73,9 @@ def plot_figs(fig_num, elev, azim):
     V = pca.components_.T
 
     x_pca_axis, y_pca_axis, z_pca_axis = 3 * V
-    x_pca_plane = np.r_[x_pca_axis[:2], - x_pca_axis[1::-1]]
-    y_pca_plane = np.r_[y_pca_axis[:2], - y_pca_axis[1::-1]]
-    z_pca_plane = np.r_[z_pca_axis[:2], - z_pca_axis[1::-1]]
+    x_pca_plane = np.r_[x_pca_axis[:2], -x_pca_axis[1::-1]]
+    y_pca_plane = np.r_[y_pca_axis[:2], -y_pca_axis[1::-1]]
+    z_pca_plane = np.r_[z_pca_axis[:2], -z_pca_axis[1::-1]]
     x_pca_plane.shape = (2, 2)
     y_pca_plane.shape = (2, 2)
     z_pca_plane.shape = (2, 2)
diff --git a/examples/decomposition/plot_pca_iris.py b/examples/decomposition/plot_pca_iris.py
index 67baf0deefdb3..8ed89104cddc8 100644
--- a/examples/decomposition/plot_pca_iris.py
+++ b/examples/decomposition/plot_pca_iris.py
@@ -34,23 +34,25 @@
 
 fig = plt.figure(1, figsize=(4, 3))
 plt.clf()
-ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
+ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
 
 plt.cla()
 pca = decomposition.PCA(n_components=3)
 pca.fit(X)
 X = pca.transform(X)
 
-for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
-    ax.text3D(X[y == label, 0].mean(),
-              X[y == label, 1].mean() + 1.5,
-              X[y == label, 2].mean(), name,
-              horizontalalignment='center',
-              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
+for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]:
+    ax.text3D(
+        X[y == label, 0].mean(),
+        X[y == label, 1].mean() + 1.5,
+        X[y == label, 2].mean(),
+        name,
+        horizontalalignment="center",
+        bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
+    )
 # Reorder the labels to have colors matching the cluster results
 y = np.choose(y, [1, 2, 0]).astype(float)
-ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral,
-           edgecolor='k')
+ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral, edgecolor="k")
 
 ax.w_xaxis.set_ticklabels([])
 ax.w_yaxis.set_ticklabels([])
diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py
index b858434d910e3..65c04838f8796 100644
--- a/examples/decomposition/plot_pca_vs_fa_model_selection.py
+++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py
@@ -43,7 +43,7 @@
 # Create the data
 
 n_samples, n_features, rank = 1000, 50, 10
-sigma = 1.
+sigma = 1.0
 rng = np.random.RandomState(42)
 U, _, _ = linalg.svd(rng.randn(n_features, n_features))
 X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T)
@@ -52,7 +52,7 @@
 X_homo = X + sigma * rng.randn(n_samples, n_features)
 
 # Adding heteroscedastic noise
-sigmas = sigma * rng.rand(n_features) + sigma / 2.
+sigmas = sigma * rng.rand(n_features) + sigma / 2.0
 X_hetero = X + rng.randn(n_samples, n_features) * sigmas
 
 # #############################################################################
@@ -62,7 +62,7 @@
 
 
 def compute_scores(X):
-    pca = PCA(svd_solver='full')
+    pca = PCA(svd_solver="full")
     fa = FactorAnalysis()
 
     pca_scores, fa_scores = [], []
@@ -77,7 +77,7 @@ def compute_scores(X):
 
 def shrunk_cov_score(X):
     shrinkages = np.logspace(-2, 0, 30)
-    cv = GridSearchCV(ShrunkCovariance(), {'shrinkage': shrinkages})
+    cv = GridSearchCV(ShrunkCovariance(), {"shrinkage": shrinkages})
     return np.mean(cross_val_score(cv.fit(X).best_estimator_, X))
 
 
@@ -85,13 +85,12 @@ def lw_score(X):
     return np.mean(cross_val_score(LedoitWolf(), X))
 
 
-for X, title in [(X_homo, 'Homoscedastic Noise'),
-                 (X_hetero, 'Heteroscedastic Noise')]:
+for X, title in [(X_homo, "Homoscedastic Noise"), (X_hetero, "Heteroscedastic Noise")]:
     pca_scores, fa_scores = compute_scores(X)
     n_components_pca = n_components[np.argmax(pca_scores)]
     n_components_fa = n_components[np.argmax(fa_scores)]
 
-    pca = PCA(svd_solver='full', n_components='mle')
+    pca = PCA(svd_solver="full", n_components="mle")
     pca.fit(X)
     n_components_pca_mle = pca.n_components_
 
@@ -100,26 +99,45 @@ def lw_score(X):
     print("best n_components by PCA MLE = %d" % n_components_pca_mle)
 
     plt.figure()
-    plt.plot(n_components, pca_scores, 'b', label='PCA scores')
-    plt.plot(n_components, fa_scores, 'r', label='FA scores')
-    plt.axvline(rank, color='g', label='TRUTH: %d' % rank, linestyle='-')
-    plt.axvline(n_components_pca, color='b',
-                label='PCA CV: %d' % n_components_pca, linestyle='--')
-    plt.axvline(n_components_fa, color='r',
-                label='FactorAnalysis CV: %d' % n_components_fa,
-                linestyle='--')
-    plt.axvline(n_components_pca_mle, color='k',
-                label='PCA MLE: %d' % n_components_pca_mle, linestyle='--')
+    plt.plot(n_components, pca_scores, "b", label="PCA scores")
+    plt.plot(n_components, fa_scores, "r", label="FA scores")
+    plt.axvline(rank, color="g", label="TRUTH: %d" % rank, linestyle="-")
+    plt.axvline(
+        n_components_pca,
+        color="b",
+        label="PCA CV: %d" % n_components_pca,
+        linestyle="--",
+    )
+    plt.axvline(
+        n_components_fa,
+        color="r",
+        label="FactorAnalysis CV: %d" % n_components_fa,
+        linestyle="--",
+    )
+    plt.axvline(
+        n_components_pca_mle,
+        color="k",
+        label="PCA MLE: %d" % n_components_pca_mle,
+        linestyle="--",
+    )
 
     # compare with other covariance estimators
-    plt.axhline(shrunk_cov_score(X), color='violet',
-                label='Shrunk Covariance MLE', linestyle='-.')
-    plt.axhline(lw_score(X), color='orange',
-                label='LedoitWolf MLE' % n_components_pca_mle, linestyle='-.')
-
-    plt.xlabel('nb of components')
-    plt.ylabel('CV scores')
-    plt.legend(loc='lower right')
+    plt.axhline(
+        shrunk_cov_score(X),
+        color="violet",
+        label="Shrunk Covariance MLE",
+        linestyle="-.",
+    )
+    plt.axhline(
+        lw_score(X),
+        color="orange",
+        label="LedoitWolf MLE",
+        linestyle="-.",
+    )
+
+    plt.xlabel("nb of components")
+    plt.ylabel("CV scores")
+    plt.legend(loc="lower right")
     plt.title(title)
 
 plt.show()
diff --git a/examples/decomposition/plot_pca_vs_lda.py b/examples/decomposition/plot_pca_vs_lda.py
index 051b96ffedf2c..f9abf21b3ad0e 100644
--- a/examples/decomposition/plot_pca_vs_lda.py
+++ b/examples/decomposition/plot_pca_vs_lda.py
@@ -37,24 +37,28 @@
 X_r2 = lda.fit(X, y).transform(X)
 
 # Percentage of variance explained for each components
-print('explained variance ratio (first two components): %s'
-      % str(pca.explained_variance_ratio_))
+print(
+    "explained variance ratio (first two components): %s"
+    % str(pca.explained_variance_ratio_)
+)
 
 plt.figure()
-colors = ['navy', 'turquoise', 'darkorange']
+colors = ["navy", "turquoise", "darkorange"]
 lw = 2
 
 for color, i, target_name in zip(colors, [0, 1, 2], target_names):
-    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw,
-                label=target_name)
-plt.legend(loc='best', shadow=False, scatterpoints=1)
-plt.title('PCA of IRIS dataset')
+    plt.scatter(
+        X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
+    )
+plt.legend(loc="best", shadow=False, scatterpoints=1)
+plt.title("PCA of IRIS dataset")
 
 plt.figure()
 for color, i, target_name in zip(colors, [0, 1, 2], target_names):
-    plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,
-                label=target_name)
-plt.legend(loc='best', shadow=False, scatterpoints=1)
-plt.title('LDA of IRIS dataset')
+    plt.scatter(
+        X_r2[y == i, 0], X_r2[y == i, 1], alpha=0.8, color=color, label=target_name
+    )
+plt.legend(loc="best", shadow=False, scatterpoints=1)
+plt.title("LDA of IRIS dataset")
 
 plt.show()
diff --git a/examples/decomposition/plot_sparse_coding.py b/examples/decomposition/plot_sparse_coding.py
index 144401073a7fe..681b1ca1942c0 100644
--- a/examples/decomposition/plot_sparse_coding.py
+++ b/examples/decomposition/plot_sparse_coding.py
@@ -26,9 +26,11 @@
 def ricker_function(resolution, center, width):
     """Discrete sub-sampled Ricker (Mexican hat) wavelet"""
     x = np.linspace(0, resolution - 1, resolution)
-    x = ((2 / (np.sqrt(3 * width) * np.pi ** .25))
-         * (1 - (x - center) ** 2 / width ** 2)
-         * np.exp(-(x - center) ** 2 / (2 * width ** 2)))
+    x = (
+        (2 / (np.sqrt(3 * width) * np.pi ** 0.25))
+        * (1 - (x - center) ** 2 / width ** 2)
+        * np.exp(-((x - center) ** 2) / (2 * width ** 2))
+    )
     return x
 
 
@@ -48,57 +50,74 @@ def ricker_matrix(width, resolution, n_components):
 n_components = resolution // subsampling
 
 # Compute a wavelet dictionary
-D_fixed = ricker_matrix(width=width, resolution=resolution,
-                        n_components=n_components)
-D_multi = np.r_[tuple(ricker_matrix(width=w, resolution=resolution,
-                      n_components=n_components // 5)
-                for w in (10, 50, 100, 500, 1000))]
+D_fixed = ricker_matrix(width=width, resolution=resolution, n_components=n_components)
+D_multi = np.r_[
+    tuple(
+        ricker_matrix(width=w, resolution=resolution, n_components=n_components // 5)
+        for w in (10, 50, 100, 500, 1000)
+    )
+]
 
 # Generate a signal
 y = np.linspace(0, resolution - 1, resolution)
 first_quarter = y < resolution / 4
-y[first_quarter] = 3.
-y[np.logical_not(first_quarter)] = -1.
+y[first_quarter] = 3.0
+y[np.logical_not(first_quarter)] = -1.0
 
 # List the different sparse coding methods in the following format:
 # (title, transform_algorithm, transform_alpha,
 #  transform_n_nozero_coefs, color)
-estimators = [('OMP', 'omp', None, 15, 'navy'),
-              ('Lasso', 'lasso_lars', 2, None, 'turquoise'), ]
+estimators = [
+    ("OMP", "omp", None, 15, "navy"),
+    ("Lasso", "lasso_lars", 2, None, "turquoise"),
+]
 lw = 2
 # Avoid FutureWarning about default value change when numpy >= 1.14
-lstsq_rcond = None if np_version >= parse_version('1.14') else -1
+lstsq_rcond = None if np_version >= parse_version("1.14") else -1
 
 plt.figure(figsize=(13, 6))
-for subplot, (D, title) in enumerate(zip((D_fixed, D_multi),
-                                         ('fixed width', 'multiple widths'))):
+for subplot, (D, title) in enumerate(
+    zip((D_fixed, D_multi), ("fixed width", "multiple widths"))
+):
     plt.subplot(1, 2, subplot + 1)
-    plt.title('Sparse coding against %s dictionary' % title)
-    plt.plot(y, lw=lw, linestyle='--', label='Original signal')
+    plt.title("Sparse coding against %s dictionary" % title)
+    plt.plot(y, lw=lw, linestyle="--", label="Original signal")
     # Do a wavelet approximation
     for title, algo, alpha, n_nonzero, color in estimators:
-        coder = SparseCoder(dictionary=D, transform_n_nonzero_coefs=n_nonzero,
-                            transform_alpha=alpha, transform_algorithm=algo)
+        coder = SparseCoder(
+            dictionary=D,
+            transform_n_nonzero_coefs=n_nonzero,
+            transform_alpha=alpha,
+            transform_algorithm=algo,
+        )
         x = coder.transform(y.reshape(1, -1))
         density = len(np.flatnonzero(x))
         x = np.ravel(np.dot(x, D))
         squared_error = np.sum((y - x) ** 2)
-        plt.plot(x, color=color, lw=lw,
-                 label='%s: %s nonzero coefs,\n%.2f error'
-                 % (title, density, squared_error))
+        plt.plot(
+            x,
+            color=color,
+            lw=lw,
+            label="%s: %s nonzero coefs,\n%.2f error" % (title, density, squared_error),
+        )
 
     # Soft thresholding debiasing
-    coder = SparseCoder(dictionary=D, transform_algorithm='threshold',
-                        transform_alpha=20)
+    coder = SparseCoder(
+        dictionary=D, transform_algorithm="threshold", transform_alpha=20
+    )
     x = coder.transform(y.reshape(1, -1))
     _, idx = np.where(x != 0)
     x[0, idx], _, _, _ = np.linalg.lstsq(D[idx, :].T, y, rcond=lstsq_rcond)
     x = np.ravel(np.dot(x, D))
     squared_error = np.sum((y - x) ** 2)
-    plt.plot(x, color='darkorange', lw=lw,
-             label='Thresholding w/ debiasing:\n%d nonzero coefs, %.2f error'
-             % (len(idx), squared_error))
-    plt.axis('tight')
-    plt.legend(shadow=False, loc='best')
-plt.subplots_adjust(.04, .07, .97, .90, .09, .2)
+    plt.plot(
+        x,
+        color="darkorange",
+        lw=lw,
+        label="Thresholding w/ debiasing:\n%d nonzero coefs, %.2f error"
+        % (len(idx), squared_error),
+    )
+    plt.axis("tight")
+    plt.legend(shadow=False, loc="best")
+plt.subplots_adjust(0.04, 0.07, 0.97, 0.90, 0.09, 0.2)
 plt.show()
diff --git a/examples/decomposition/plot_varimax_fa.py b/examples/decomposition/plot_varimax_fa.py
index 4e786406bdbce..82644595daf94 100644
--- a/examples/decomposition/plot_varimax_fa.py
+++ b/examples/decomposition/plot_varimax_fa.py
@@ -52,9 +52,11 @@
 # Run factor analysis with Varimax rotation
 n_comps = 2
 
-methods = [('PCA', PCA()),
-           ('Unrotated FA', FactorAnalysis()),
-           ('Varimax FA', FactorAnalysis(rotation='varimax'))]
+methods = [
+    ("PCA", PCA()),
+    ("Unrotated FA", FactorAnalysis()),
+    ("Varimax FA", FactorAnalysis(rotation="varimax")),
+]
 fig, axes = plt.subplots(ncols=len(methods), figsize=(10, 8))
 
 for ax, (method, fa) in zip(axes, methods):
diff --git a/examples/ensemble/plot_adaboost_hastie_10_2.py b/examples/ensemble/plot_adaboost_hastie_10_2.py
index 4d48d13dd24f2..a4bf4d3875ed2 100644
--- a/examples/ensemble/plot_adaboost_hastie_10_2.py
+++ b/examples/ensemble/plot_adaboost_hastie_10_2.py
@@ -36,7 +36,7 @@
 
 n_estimators = 400
 # A learning rate of 1. may not be optimal for both SAMME and SAMME.R
-learning_rate = 1.
+learning_rate = 1.0
 
 X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
 
@@ -55,23 +55,23 @@
     base_estimator=dt_stump,
     learning_rate=learning_rate,
     n_estimators=n_estimators,
-    algorithm="SAMME")
+    algorithm="SAMME",
+)
 ada_discrete.fit(X_train, y_train)
 
 ada_real = AdaBoostClassifier(
     base_estimator=dt_stump,
     learning_rate=learning_rate,
     n_estimators=n_estimators,
-    algorithm="SAMME.R")
+    algorithm="SAMME.R",
+)
 ada_real.fit(X_train, y_train)
 
 fig = plt.figure()
 ax = fig.add_subplot(111)
 
-ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-',
-        label='Decision Stump Error')
-ax.plot([1, n_estimators], [dt_err] * 2, 'k--',
-        label='Decision Tree Error')
+ax.plot([1, n_estimators], [dt_stump_err] * 2, "k-", label="Decision Stump Error")
+ax.plot([1, n_estimators], [dt_err] * 2, "k--", label="Decision Tree Error")
 
 ada_discrete_err = np.zeros((n_estimators,))
 for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
@@ -89,24 +89,36 @@
 for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
     ada_real_err_train[i] = zero_one_loss(y_pred, y_train)
 
-ax.plot(np.arange(n_estimators) + 1, ada_discrete_err,
-        label='Discrete AdaBoost Test Error',
-        color='red')
-ax.plot(np.arange(n_estimators) + 1, ada_discrete_err_train,
-        label='Discrete AdaBoost Train Error',
-        color='blue')
-ax.plot(np.arange(n_estimators) + 1, ada_real_err,
-        label='Real AdaBoost Test Error',
-        color='orange')
-ax.plot(np.arange(n_estimators) + 1, ada_real_err_train,
-        label='Real AdaBoost Train Error',
-        color='green')
+ax.plot(
+    np.arange(n_estimators) + 1,
+    ada_discrete_err,
+    label="Discrete AdaBoost Test Error",
+    color="red",
+)
+ax.plot(
+    np.arange(n_estimators) + 1,
+    ada_discrete_err_train,
+    label="Discrete AdaBoost Train Error",
+    color="blue",
+)
+ax.plot(
+    np.arange(n_estimators) + 1,
+    ada_real_err,
+    label="Real AdaBoost Test Error",
+    color="orange",
+)
+ax.plot(
+    np.arange(n_estimators) + 1,
+    ada_real_err_train,
+    label="Real AdaBoost Train Error",
+    color="green",
+)
 
 ax.set_ylim((0.0, 0.5))
-ax.set_xlabel('n_estimators')
-ax.set_ylabel('error rate')
+ax.set_xlabel("n_estimators")
+ax.set_ylabel("error rate")
 
-leg = ax.legend(loc='upper right', fancybox=True)
+leg = ax.legend(loc="upper right", fancybox=True)
 leg.get_frame().set_alpha(0.7)
 
 plt.show()
diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py
index 0ee08c5ed322e..af28b3fe940bf 100644
--- a/examples/ensemble/plot_adaboost_multiclass.py
+++ b/examples/ensemble/plot_adaboost_multiclass.py
@@ -37,8 +37,9 @@
 from sklearn.tree import DecisionTreeClassifier
 
 
-X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,
-                               n_classes=3, random_state=1)
+X, y = make_gaussian_quantiles(
+    n_samples=13000, n_features=10, n_classes=3, random_state=1
+)
 
 n_split = 3000
 
@@ -46,15 +47,15 @@
 y_train, y_test = y[:n_split], y[n_split:]
 
 bdt_real = AdaBoostClassifier(
-    DecisionTreeClassifier(max_depth=2),
-    n_estimators=600,
-    learning_rate=1)
+    DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1
+)
 
 bdt_discrete = AdaBoostClassifier(
     DecisionTreeClassifier(max_depth=2),
     n_estimators=600,
     learning_rate=1.5,
-    algorithm="SAMME")
+    algorithm="SAMME",
+)
 
 bdt_real.fit(X_train, y_train)
 bdt_discrete.fit(X_train, y_train)
@@ -63,11 +64,10 @@
 discrete_test_errors = []
 
 for real_test_predict, discrete_train_predict in zip(
-        bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
-    real_test_errors.append(
-        1. - accuracy_score(real_test_predict, y_test))
-    discrete_test_errors.append(
-        1. - accuracy_score(discrete_train_predict, y_test))
+    bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)
+):
+    real_test_errors.append(1.0 - accuracy_score(real_test_predict, y_test))
+    discrete_test_errors.append(1.0 - accuracy_score(discrete_train_predict, y_test))
 
 n_trees_discrete = len(bdt_discrete)
 n_trees_real = len(bdt_real)
@@ -81,35 +81,41 @@
 plt.figure(figsize=(15, 5))
 
 plt.subplot(131)
-plt.plot(range(1, n_trees_discrete + 1),
-         discrete_test_errors, c='black', label='SAMME')
-plt.plot(range(1, n_trees_real + 1),
-         real_test_errors, c='black',
-         linestyle='dashed', label='SAMME.R')
+plt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c="black", label="SAMME")
+plt.plot(
+    range(1, n_trees_real + 1),
+    real_test_errors,
+    c="black",
+    linestyle="dashed",
+    label="SAMME.R",
+)
 plt.legend()
 plt.ylim(0.18, 0.62)
-plt.ylabel('Test Error')
-plt.xlabel('Number of Trees')
+plt.ylabel("Test Error")
+plt.xlabel("Number of Trees")
 
 plt.subplot(132)
-plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_errors,
-         "b", label='SAMME', alpha=.5)
-plt.plot(range(1, n_trees_real + 1), real_estimator_errors,
-         "r", label='SAMME.R', alpha=.5)
+plt.plot(
+    range(1, n_trees_discrete + 1),
+    discrete_estimator_errors,
+    "b",
+    label="SAMME",
+    alpha=0.5,
+)
+plt.plot(
+    range(1, n_trees_real + 1), real_estimator_errors, "r", label="SAMME.R", alpha=0.5
+)
 plt.legend()
-plt.ylabel('Error')
-plt.xlabel('Number of Trees')
-plt.ylim((.2,
-         max(real_estimator_errors.max(),
-             discrete_estimator_errors.max()) * 1.2))
+plt.ylabel("Error")
+plt.xlabel("Number of Trees")
+plt.ylim((0.2, max(real_estimator_errors.max(), discrete_estimator_errors.max()) * 1.2))
 plt.xlim((-20, len(bdt_discrete) + 20))
 
 plt.subplot(133)
-plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights,
-         "b", label='SAMME')
+plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, "b", label="SAMME")
 plt.legend()
-plt.ylabel('Weight')
-plt.xlabel('Number of Trees')
+plt.ylabel("Weight")
+plt.xlabel("Number of Trees")
 plt.ylim((0, discrete_estimator_weights.max() * 1.2))
 plt.xlim((-20, n_trees_discrete + 20))
 
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 659a2a5944ea6..0c3f01299b06e 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -32,8 +32,9 @@
 # Fit regression model
 regr_1 = DecisionTreeRegressor(max_depth=4)
 
-regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
-                           n_estimators=300, random_state=rng)
+regr_2 = AdaBoostRegressor(
+    DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng
+)
 
 regr_1.fit(X, y)
 regr_2.fit(X, y)
diff --git a/examples/ensemble/plot_adaboost_twoclass.py b/examples/ensemble/plot_adaboost_twoclass.py
index edb4cbb1a97b3..d22f14cf7c8c9 100644
--- a/examples/ensemble/plot_adaboost_twoclass.py
+++ b/examples/ensemble/plot_adaboost_twoclass.py
@@ -31,19 +31,19 @@
 
 
 # Construct dataset
-X1, y1 = make_gaussian_quantiles(cov=2.,
-                                 n_samples=200, n_features=2,
-                                 n_classes=2, random_state=1)
-X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
-                                 n_samples=300, n_features=2,
-                                 n_classes=2, random_state=1)
+X1, y1 = make_gaussian_quantiles(
+    cov=2.0, n_samples=200, n_features=2, n_classes=2, random_state=1
+)
+X2, y2 = make_gaussian_quantiles(
+    mean=(3, 3), cov=1.5, n_samples=300, n_features=2, n_classes=2, random_state=1
+)
 X = np.concatenate((X1, X2))
-y = np.concatenate((y1, - y2 + 1))
+y = np.concatenate((y1, -y2 + 1))
 
 # Create and fit an AdaBoosted decision tree
-bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
-                         algorithm="SAMME",
-                         n_estimators=200)
+bdt = AdaBoostClassifier(
+    DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200
+)
 
 bdt.fit(X, y)
 
@@ -57,8 +57,9 @@
 plt.subplot(121)
 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
-                     np.arange(y_min, y_max, plot_step))
+xx, yy = np.meshgrid(
+    np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
+)
 
 Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
 Z = Z.reshape(xx.shape)
@@ -68,35 +69,42 @@
 # Plot the training points
 for i, n, c in zip(range(2), class_names, plot_colors):
     idx = np.where(y == i)
-    plt.scatter(X[idx, 0], X[idx, 1],
-                c=c, cmap=plt.cm.Paired,
-                s=20, edgecolor='k',
-                label="Class %s" % n)
+    plt.scatter(
+        X[idx, 0],
+        X[idx, 1],
+        c=c,
+        cmap=plt.cm.Paired,
+        s=20,
+        edgecolor="k",
+        label="Class %s" % n,
+    )
 plt.xlim(x_min, x_max)
 plt.ylim(y_min, y_max)
-plt.legend(loc='upper right')
-plt.xlabel('x')
-plt.ylabel('y')
-plt.title('Decision Boundary')
+plt.legend(loc="upper right")
+plt.xlabel("x")
+plt.ylabel("y")
+plt.title("Decision Boundary")
 
 # Plot the two-class decision scores
 twoclass_output = bdt.decision_function(X)
 plot_range = (twoclass_output.min(), twoclass_output.max())
 plt.subplot(122)
 for i, n, c in zip(range(2), class_names, plot_colors):
-    plt.hist(twoclass_output[y == i],
-             bins=10,
-             range=plot_range,
-             facecolor=c,
-             label='Class %s' % n,
-             alpha=.5,
-             edgecolor='k')
+    plt.hist(
+        twoclass_output[y == i],
+        bins=10,
+        range=plot_range,
+        facecolor=c,
+        label="Class %s" % n,
+        alpha=0.5,
+        edgecolor="k",
+    )
 x1, x2, y1, y2 = plt.axis()
 plt.axis((x1, x2, y1, y2 * 1.2))
-plt.legend(loc='upper right')
-plt.ylabel('Samples')
-plt.xlabel('Score')
-plt.title('Decision Scores')
+plt.legend(loc="upper right")
+plt.ylabel("Samples")
+plt.xlabel("Score")
+plt.title("Decision Scores")
 
 plt.tight_layout()
 plt.subplots_adjust(wspace=0.35)
diff --git a/examples/ensemble/plot_bias_variance.py b/examples/ensemble/plot_bias_variance.py
index 0af239e197cf0..f78a200a41c83 100644
--- a/examples/ensemble/plot_bias_variance.py
+++ b/examples/ensemble/plot_bias_variance.py
@@ -73,18 +73,20 @@
 from sklearn.tree import DecisionTreeRegressor
 
 # Settings
-n_repeat = 50       # Number of iterations for computing expectations
-n_train = 50        # Size of the training set
-n_test = 1000       # Size of the test set
-noise = 0.1         # Standard deviation of the noise
+n_repeat = 50  # Number of iterations for computing expectations
+n_train = 50  # Size of the training set
+n_test = 1000  # Size of the test set
+noise = 0.1  # Standard deviation of the noise
 np.random.seed(0)
 
 # Change this for exploring the bias-variance decomposition of other
 # estimators. This should work well for estimators with high variance (e.g.,
 # decision trees or KNN), but poorly for estimators with low variance (e.g.,
 # linear models).
-estimators = [("Tree", DecisionTreeRegressor()),
-              ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor()))]
+estimators = [
+    ("Tree", DecisionTreeRegressor()),
+    ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor())),
+]
 
 n_estimators = len(estimators)
 
@@ -93,7 +95,7 @@
 def f(x):
     x = x.ravel()
 
-    return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2)
+    return np.exp(-(x ** 2)) + 1.5 * np.exp(-((x - 2) ** 2))
 
 
 def generate(n_samples, noise, n_repeat=1):
@@ -141,18 +143,18 @@ def generate(n_samples, noise, n_repeat=1):
         for j in range(n_repeat):
             y_error += (y_test[:, j] - y_predict[:, i]) ** 2
 
-    y_error /= (n_repeat * n_repeat)
+    y_error /= n_repeat * n_repeat
 
     y_noise = np.var(y_test, axis=1)
     y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2
     y_var = np.var(y_predict, axis=1)
 
-    print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) "
-          " + {3:.4f} (var) + {4:.4f} (noise)".format(name,
-                                                      np.mean(y_error),
-                                                      np.mean(y_bias),
-                                                      np.mean(y_var),
-                                                      np.mean(y_noise)))
+    print(
+        "{0}: {1:.4f} (error) = {2:.4f} (bias^2) "
+        " + {3:.4f} (var) + {4:.4f} (noise)".format(
+            name, np.mean(y_error), np.mean(y_bias), np.mean(y_var), np.mean(y_noise)
+        )
+    )
 
     # Plot figures
     plt.subplot(2, n_estimators, n + 1)
@@ -165,14 +167,13 @@ def generate(n_samples, noise, n_repeat=1):
         else:
             plt.plot(X_test, y_predict[:, i], "r", alpha=0.05)
 
-    plt.plot(X_test, np.mean(y_predict, axis=1), "c",
-             label=r"$\mathbb{E}_{LS} \^y(x)$")
+    plt.plot(X_test, np.mean(y_predict, axis=1), "c", label=r"$\mathbb{E}_{LS} \^y(x)$")
 
     plt.xlim([-5, 5])
     plt.title(name)
 
     if n == n_estimators - 1:
-        plt.legend(loc=(1.1, .5))
+        plt.legend(loc=(1.1, 0.5))
 
     plt.subplot(2, n_estimators, n_estimators + n + 1)
     plt.plot(X_test, y_error, "r", label="$error(x)$")
@@ -185,7 +186,7 @@ def generate(n_samples, noise, n_repeat=1):
 
     if n == n_estimators - 1:
 
-        plt.legend(loc=(1.1, .5))
+        plt.legend(loc=(1.1, 0.5))
 
-plt.subplots_adjust(right=.75)
+plt.subplots_adjust(right=0.75)
 plt.show()
diff --git a/examples/ensemble/plot_ensemble_oob.py b/examples/ensemble/plot_ensemble_oob.py
index e7b37f212177c..5b798eece8667 100644
--- a/examples/ensemble/plot_ensemble_oob.py
+++ b/examples/ensemble/plot_ensemble_oob.py
@@ -36,26 +36,45 @@
 RANDOM_STATE = 123
 
 # Generate a binary classification dataset.
-X, y = make_classification(n_samples=500, n_features=25,
-                           n_clusters_per_class=1, n_informative=15,
-                           random_state=RANDOM_STATE)
+X, y = make_classification(
+    n_samples=500,
+    n_features=25,
+    n_clusters_per_class=1,
+    n_informative=15,
+    random_state=RANDOM_STATE,
+)
 
 # NOTE: Setting the `warm_start` construction parameter to `True` disables
 # support for parallelized ensembles but is necessary for tracking the OOB
 # error trajectory during training.
 ensemble_clfs = [
-    ("RandomForestClassifier, max_features='sqrt'",
-        RandomForestClassifier(warm_start=True, oob_score=True,
-                               max_features="sqrt",
-                               random_state=RANDOM_STATE)),
-    ("RandomForestClassifier, max_features='log2'",
-        RandomForestClassifier(warm_start=True, max_features='log2',
-                               oob_score=True,
-                               random_state=RANDOM_STATE)),
-    ("RandomForestClassifier, max_features=None",
-        RandomForestClassifier(warm_start=True, max_features=None,
-                               oob_score=True,
-                               random_state=RANDOM_STATE))
+    (
+        "RandomForestClassifier, max_features='sqrt'",
+        RandomForestClassifier(
+            warm_start=True,
+            oob_score=True,
+            max_features="sqrt",
+            random_state=RANDOM_STATE,
+        ),
+    ),
+    (
+        "RandomForestClassifier, max_features='log2'",
+        RandomForestClassifier(
+            warm_start=True,
+            max_features="log2",
+            oob_score=True,
+            random_state=RANDOM_STATE,
+        ),
+    ),
+    (
+        "RandomForestClassifier, max_features=None",
+        RandomForestClassifier(
+            warm_start=True,
+            max_features=None,
+            oob_score=True,
+            random_state=RANDOM_STATE,
+        ),
+    ),
 ]
 
 # Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py
index 7b75b92a1f0e0..9d19bbf907904 100644
--- a/examples/ensemble/plot_forest_importances.py
+++ b/examples/ensemble/plot_forest_importances.py
@@ -25,16 +25,22 @@
 from sklearn.model_selection import train_test_split
 
 X, y = make_classification(
-    n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
-    n_repeated=0, n_classes=2, random_state=0, shuffle=False)
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, stratify=y, random_state=42)
+    n_samples=1000,
+    n_features=10,
+    n_informative=3,
+    n_redundant=0,
+    n_repeated=0,
+    n_classes=2,
+    random_state=0,
+    shuffle=False,
+)
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
 
 # %%
 # A random forest classifier will be fitted to compute the feature importances.
 from sklearn.ensemble import RandomForestClassifier
 
-feature_names = [f'feature {i}' for i in range(X.shape[1])]
+feature_names = [f"feature {i}" for i in range(X.shape[1])]
 forest = RandomForestClassifier(random_state=0)
 forest.fit(X_train, y_train)
 
@@ -54,16 +60,15 @@
 
 start_time = time.time()
 importances = forest.feature_importances_
-std = np.std([
-    tree.feature_importances_ for tree in forest.estimators_], axis=0)
+std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
 elapsed_time = time.time() - start_time
 
-print(f"Elapsed time to compute the importances: "
-      f"{elapsed_time:.3f} seconds")
+print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
 
 # %%
 # Let's plot the impurity-based importance.
 import pandas as pd
+
 forest_importances = pd.Series(importances, index=feature_names)
 
 fig, ax = plt.subplots()
@@ -84,10 +89,10 @@
 
 start_time = time.time()
 result = permutation_importance(
-    forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
+    forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
+)
 elapsed_time = time.time() - start_time
-print(f"Elapsed time to compute the importances: "
-      f"{elapsed_time:.3f} seconds")
+print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
 
 forest_importances = pd.Series(result.importances_mean, index=feature_names)
 
diff --git a/examples/ensemble/plot_forest_importances_faces.py b/examples/ensemble/plot_forest_importances_faces.py
index ff2ec6f67ed99..8bf265f345be8 100644
--- a/examples/ensemble/plot_forest_importances_faces.py
+++ b/examples/ensemble/plot_forest_importances_faces.py
@@ -44,8 +44,7 @@
 # A random forest classifier will be fitted to compute the feature importances.
 from sklearn.ensemble import RandomForestClassifier
 
-forest = RandomForestClassifier(
-    n_estimators=750, n_jobs=n_jobs, random_state=42)
+forest = RandomForestClassifier(n_estimators=750, n_jobs=n_jobs, random_state=42)
 
 forest.fit(X, y)
 
@@ -68,8 +67,7 @@
 importances = forest.feature_importances_
 elapsed_time = time.time() - start_time
 
-print(f"Elapsed time to compute the importances: "
-      f"{elapsed_time:.3f} seconds")
+print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
 imp_reshaped = importances.reshape(img_shape)
 plt.matshow(imp_reshaped, cmap=plt.cm.hot)
 plt.title("Pixel importances using impurity values")
diff --git a/examples/ensemble/plot_forest_iris.py b/examples/ensemble/plot_forest_iris.py
index 81cd54a9bb4d3..b2e95ef2ecc81 100644
--- a/examples/ensemble/plot_forest_iris.py
+++ b/examples/ensemble/plot_forest_iris.py
@@ -47,8 +47,11 @@
 from matplotlib.colors import ListedColormap
 
 from sklearn.datasets import load_iris
-from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
-                              AdaBoostClassifier)
+from sklearn.ensemble import (
+    RandomForestClassifier,
+    ExtraTreesClassifier,
+    AdaBoostClassifier,
+)
 from sklearn.tree import DecisionTreeClassifier
 
 # Parameters
@@ -64,11 +67,12 @@
 
 plot_idx = 1
 
-models = [DecisionTreeClassifier(max_depth=None),
-          RandomForestClassifier(n_estimators=n_estimators),
-          ExtraTreesClassifier(n_estimators=n_estimators),
-          AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
-                             n_estimators=n_estimators)]
+models = [
+    DecisionTreeClassifier(max_depth=None),
+    RandomForestClassifier(n_estimators=n_estimators),
+    ExtraTreesClassifier(n_estimators=n_estimators),
+    AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators),
+]
 
 for pair in ([0, 1], [0, 2], [2, 3]):
     for model in models:
@@ -94,15 +98,12 @@
         scores = model.score(X, y)
         # Create a title for each column and the console by using str() and
         # slicing away useless parts of the string
-        model_title = str(type(model)).split(
-            ".")[-1][:-2][:-len("Classifier")]
+        model_title = str(type(model)).split(".")[-1][:-2][: -len("Classifier")]
 
         model_details = model_title
         if hasattr(model, "estimators_"):
-            model_details += " with {} estimators".format(
-                len(model.estimators_))
-        print(model_details + " with features", pair,
-              "has a score of", scores)
+            model_details += " with {} estimators".format(len(model.estimators_))
+        print(model_details + " with features", pair, "has a score of", scores)
 
         plt.subplot(3, 4, plot_idx)
         if plot_idx <= len(models):
@@ -113,8 +114,9 @@
         # filled contour plot
         x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
         y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
-                             np.arange(y_min, y_max, plot_step))
+        xx, yy = np.meshgrid(
+            np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
+        )
 
         # Plot either a single DecisionTreeClassifier or alpha blend the
         # decision surfaces of the ensemble of classifiers
@@ -139,19 +141,30 @@
         # black outline
         xx_coarser, yy_coarser = np.meshgrid(
             np.arange(x_min, x_max, plot_step_coarser),
-            np.arange(y_min, y_max, plot_step_coarser))
-        Z_points_coarser = model.predict(np.c_[xx_coarser.ravel(),
-                                         yy_coarser.ravel()]
-                                         ).reshape(xx_coarser.shape)
-        cs_points = plt.scatter(xx_coarser, yy_coarser, s=15,
-                                c=Z_points_coarser, cmap=cmap,
-                                edgecolors="none")
+            np.arange(y_min, y_max, plot_step_coarser),
+        )
+        Z_points_coarser = model.predict(
+            np.c_[xx_coarser.ravel(), yy_coarser.ravel()]
+        ).reshape(xx_coarser.shape)
+        cs_points = plt.scatter(
+            xx_coarser,
+            yy_coarser,
+            s=15,
+            c=Z_points_coarser,
+            cmap=cmap,
+            edgecolors="none",
+        )
 
         # Plot the training points, these are clustered together and have a
         # black outline
-        plt.scatter(X[:, 0], X[:, 1], c=y,
-                    cmap=ListedColormap(['r', 'y', 'b']),
-                    edgecolor='k', s=20)
+        plt.scatter(
+            X[:, 0],
+            X[:, 1],
+            c=y,
+            cmap=ListedColormap(["r", "y", "b"]),
+            edgecolor="k",
+            s=20,
+        )
         plot_idx += 1  # move on to the next plot in sequence
 
 plt.suptitle("Classifiers on feature subsets of the Iris dataset", fontsize=12)
diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
index 876a1ca21ec4c..3bb406a0ffe86 100644
--- a/examples/ensemble/plot_gradient_boosting_categorical.py
+++ b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -32,8 +32,8 @@
 
 X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)
 
-n_categorical_features = (X.dtypes == 'category').sum()
-n_numerical_features = (X.dtypes == 'float').sum()
+n_categorical_features = (X.dtypes == "category").sum()
+n_numerical_features = (X.dtypes == "float").sum()
 print(f"Number of samples: {X.shape[0]}")
 print(f"Number of features: {X.shape[1]}")
 print(f"Number of categorical features: {n_categorical_features}")
@@ -51,10 +51,9 @@
 from sklearn.compose import make_column_selector
 
 dropper = make_column_transformer(
-    ('drop', make_column_selector(dtype_include='category')),
-    remainder='passthrough')
-hist_dropped = make_pipeline(dropper,
-                             HistGradientBoostingRegressor(random_state=42))
+    ("drop", make_column_selector(dtype_include="category")), remainder="passthrough"
+)
+hist_dropped = make_pipeline(dropper, HistGradientBoostingRegressor(random_state=42))
 
 # %%
 # Gradient boosting estimator with one-hot encoding
@@ -65,12 +64,16 @@
 from sklearn.preprocessing import OneHotEncoder
 
 one_hot_encoder = make_column_transformer(
-    (OneHotEncoder(sparse=False, handle_unknown='ignore'),
-     make_column_selector(dtype_include='category')),
-    remainder='passthrough')
+    (
+        OneHotEncoder(sparse=False, handle_unknown="ignore"),
+        make_column_selector(dtype_include="category"),
+    ),
+    remainder="passthrough",
+)
 
-hist_one_hot = make_pipeline(one_hot_encoder,
-                             HistGradientBoostingRegressor(random_state=42))
+hist_one_hot = make_pipeline(
+    one_hot_encoder, HistGradientBoostingRegressor(random_state=42)
+)
 
 # %%
 # Gradient boosting estimator with ordinal encoding
@@ -83,12 +86,16 @@
 import numpy as np
 
 ordinal_encoder = make_column_transformer(
-    (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
-     make_column_selector(dtype_include='category')),
-    remainder='passthrough')
+    (
+        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
+        make_column_selector(dtype_include="category"),
+    ),
+    remainder="passthrough",
+)
 
-hist_ordinal = make_pipeline(ordinal_encoder,
-                             HistGradientBoostingRegressor(random_state=42))
+hist_ordinal = make_pipeline(
+    ordinal_encoder, HistGradientBoostingRegressor(random_state=42)
+)
 
 # %%
 # Gradient boosting estimator with native categorical support
@@ -107,12 +114,12 @@
 
 # The ordinal encoder will first output the categorical features, and then the
 # continuous (passed-through) features
-categorical_mask = ([True] * n_categorical_features +
-                    [False] * n_numerical_features)
+categorical_mask = [True] * n_categorical_features + [False] * n_numerical_features
 hist_native = make_pipeline(
     ordinal_encoder,
-    HistGradientBoostingRegressor(random_state=42,
-                                  categorical_features=categorical_mask)
+    HistGradientBoostingRegressor(
+        random_state=42, categorical_features=categorical_mask
+    ),
 )
 
 
@@ -136,20 +143,33 @@
 def plot_results(figure_title):
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
 
-    plot_info = [('fit_time', 'Fit times (s)', ax1, None),
-                 ('test_score', 'Mean Absolute Percentage Error', ax2,
-                  (0, 0.20))]
+    plot_info = [
+        ("fit_time", "Fit times (s)", ax1, None),
+        ("test_score", "Mean Absolute Percentage Error", ax2, (0, 0.20)),
+    ]
 
     x, width = np.arange(4), 0.9
     for key, title, ax, y_limit in plot_info:
-        items = [dropped_result[key], one_hot_result[key], ordinal_result[key],
-                 native_result[key]]
-        ax.bar(x, [np.mean(np.abs(item)) for item in items],
-               width, yerr=[np.std(item) for item in items],
-               color=['C0', 'C1', 'C2', 'C3'])
-        ax.set(xlabel='Model', title=title, xticks=x,
-               xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
-               ylim=y_limit)
+        items = [
+            dropped_result[key],
+            one_hot_result[key],
+            ordinal_result[key],
+            native_result[key],
+        ]
+        ax.bar(
+            x,
+            [np.mean(np.abs(item)) for item in items],
+            width,
+            yerr=[np.std(item) for item in items],
+            color=["C0", "C1", "C2", "C3"],
+        )
+        ax.set(
+            xlabel="Model",
+            title=title,
+            xticks=x,
+            xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
+            ylim=y_limit,
+        )
     fig.suptitle(figure_title)
 
 
@@ -194,8 +214,10 @@ def plot_results(figure_title):
 # of trees and the depth of each tree.
 
 for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
-    pipe.set_params(histgradientboostingregressor__max_depth=3,
-                    histgradientboostingregressor__max_iter=15)
+    pipe.set_params(
+        histgradientboostingregressor__max_depth=3,
+        histgradientboostingregressor__max_iter=15,
+    )
 
 dropped_result = cross_validate(hist_dropped, X, y, cv=3, scoring=scoring)
 one_hot_result = cross_validate(hist_one_hot, X, y, cv=3, scoring=scoring)
diff --git a/examples/ensemble/plot_gradient_boosting_early_stopping.py b/examples/ensemble/plot_gradient_boosting_early_stopping.py
index 6f38e57a15ca1..cc4408b6dc255 100644
--- a/examples/ensemble/plot_gradient_boosting_early_stopping.py
+++ b/examples/ensemble/plot_gradient_boosting_early_stopping.py
@@ -49,7 +49,7 @@
 data_list = [datasets.load_iris(), datasets.load_digits()]
 data_list = [(d.data, d.target) for d in data_list]
 data_list += [datasets.make_hastie_10_2()]
-names = ['Iris Data', 'Digits Data', 'Hastie Data']
+names = ["Iris Data", "Digits Data", "Hastie Data"]
 
 n_gb = []
 score_gb = []
@@ -61,17 +61,20 @@
 n_estimators = 500
 
 for X, y in data_list:
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
-                                                        random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=0
+    )
 
     # We specify that if the scores don't improve by at least 0.01 for the last
     # 10 stages, stop fitting additional stages
-    gbes = ensemble.GradientBoostingClassifier(n_estimators=n_estimators,
-                                               validation_fraction=0.2,
-                                               n_iter_no_change=5, tol=0.01,
-                                               random_state=0)
-    gb = ensemble.GradientBoostingClassifier(n_estimators=n_estimators,
-                                             random_state=0)
+    gbes = ensemble.GradientBoostingClassifier(
+        n_estimators=n_estimators,
+        validation_fraction=0.2,
+        n_iter_no_change=5,
+        tol=0.01,
+        random_state=0,
+    )
+    gb = ensemble.GradientBoostingClassifier(n_estimators=n_estimators, random_state=0)
     start = time.time()
     gb.fit(X_train, y_train)
     time_gb.append(time.time() - start)
@@ -97,10 +100,12 @@
 
 plt.figure(figsize=(9, 5))
 
-bar1 = plt.bar(index, score_gb, bar_width, label='Without early stopping',
-               color='crimson')
-bar2 = plt.bar(index + bar_width, score_gbes, bar_width,
-               label='With early stopping', color='coral')
+bar1 = plt.bar(
+    index, score_gb, bar_width, label="Without early stopping", color="crimson"
+)
+bar2 = plt.bar(
+    index + bar_width, score_gbes, bar_width, label="With early stopping", color="coral"
+)
 
 plt.xticks(index + bar_width, names)
 plt.yticks(np.arange(0, 1.3, 0.1))
@@ -111,20 +116,24 @@ def autolabel(rects, n_estimators):
     Attach a text label above each bar displaying n_estimators of each model
     """
     for i, rect in enumerate(rects):
-        plt.text(rect.get_x() + rect.get_width() / 2.,
-                 1.05 * rect.get_height(), 'n_est=%d' % n_estimators[i],
-                 ha='center', va='bottom')
+        plt.text(
+            rect.get_x() + rect.get_width() / 2.0,
+            1.05 * rect.get_height(),
+            "n_est=%d" % n_estimators[i],
+            ha="center",
+            va="bottom",
+        )
 
 
 autolabel(bar1, n_gb)
 autolabel(bar2, n_gbes)
 
 plt.ylim([0, 1.3])
-plt.legend(loc='best')
+plt.legend(loc="best")
 plt.grid(True)
 
-plt.xlabel('Datasets')
-plt.ylabel('Test score')
+plt.xlabel("Datasets")
+plt.ylabel("Test score")
 
 plt.show()
 
@@ -135,10 +144,12 @@ def autolabel(rects, n_estimators):
 
 plt.figure(figsize=(9, 5))
 
-bar1 = plt.bar(index, time_gb, bar_width, label='Without early stopping',
-               color='crimson')
-bar2 = plt.bar(index + bar_width, time_gbes, bar_width,
-               label='With early stopping', color='coral')
+bar1 = plt.bar(
+    index, time_gb, bar_width, label="Without early stopping", color="crimson"
+)
+bar2 = plt.bar(
+    index + bar_width, time_gbes, bar_width, label="With early stopping", color="coral"
+)
 
 max_y = np.amax(np.maximum(time_gb, time_gbes))
 
@@ -149,10 +160,10 @@ def autolabel(rects, n_estimators):
 autolabel(bar2, n_gbes)
 
 plt.ylim([0, 1.3 * max_y])
-plt.legend(loc='best')
+plt.legend(loc="best")
 plt.grid(True)
 
-plt.xlabel('Datasets')
-plt.ylabel('Fit Time')
+plt.xlabel("Datasets")
+plt.ylabel("Fit Time")
 
 plt.show()
diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py
index c3b9321f166be..ce13eb4398403 100644
--- a/examples/ensemble/plot_gradient_boosting_oob.py
+++ b/examples/ensemble/plot_gradient_boosting_oob.py
@@ -51,24 +51,29 @@
 X = np.c_[x1, x2, x3]
 
 X = X.astype(np.float32)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
-                                                    random_state=9)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=9)
 
 # Fit classifier with out-of-bag estimates
-params = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5,
-          'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}
+params = {
+    "n_estimators": 1200,
+    "max_depth": 3,
+    "subsample": 0.5,
+    "learning_rate": 0.01,
+    "min_samples_leaf": 1,
+    "random_state": 3,
+}
 clf = ensemble.GradientBoostingClassifier(**params)
 
 clf.fit(X_train, y_train)
 acc = clf.score(X_test, y_test)
 print("Accuracy: {:.4f}".format(acc))
 
-n_estimators = params['n_estimators']
+n_estimators = params["n_estimators"]
 x = np.arange(n_estimators) + 1
 
 
 def heldout_score(clf, X_test, y_test):
-    """compute deviance scores on ``X_test`` and ``y_test``. """
+    """compute deviance scores on ``X_test`` and ``y_test``."""
     score = np.zeros((n_estimators,), dtype=np.float64)
     for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
         score[i] = clf.loss_(y_test, y_pred)
@@ -112,26 +117,26 @@ def cv_estimate(n_splits=None):
 cv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))
 
 # plot curves and vertical lines for best iterations
-plt.plot(x, cumsum, label='OOB loss', color=oob_color)
-plt.plot(x, test_score, label='Test loss', color=test_color)
-plt.plot(x, cv_score, label='CV loss', color=cv_color)
+plt.plot(x, cumsum, label="OOB loss", color=oob_color)
+plt.plot(x, test_score, label="Test loss", color=test_color)
+plt.plot(x, cv_score, label="CV loss", color=cv_color)
 plt.axvline(x=oob_best_iter, color=oob_color)
 plt.axvline(x=test_best_iter, color=test_color)
 plt.axvline(x=cv_best_iter, color=cv_color)
 
 # add three vertical lines to xticks
 xticks = plt.xticks()
-xticks_pos = np.array(xticks[0].tolist() +
-                      [oob_best_iter, cv_best_iter, test_best_iter])
-xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +
-                        ['OOB', 'CV', 'Test'])
+xticks_pos = np.array(
+    xticks[0].tolist() + [oob_best_iter, cv_best_iter, test_best_iter]
+)
+xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) + ["OOB", "CV", "Test"])
 ind = np.argsort(xticks_pos)
 xticks_pos = xticks_pos[ind]
 xticks_label = xticks_label[ind]
 plt.xticks(xticks_pos, xticks_label)
 
-plt.legend(loc='upper right')
-plt.ylabel('normalized loss')
-plt.xlabel('number of iterations')
+plt.legend(loc="upper right")
+plt.ylabel("normalized loss")
+plt.xlabel("number of iterations")
 
 plt.show()
diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index 67e208ece0b06..93bc70038d3f6 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -66,14 +66,13 @@ def f(x):
     min_samples_split=9,
 )
 for alpha in [0.05, 0.5, 0.95]:
-    gbr = GradientBoostingRegressor(loss='quantile', alpha=alpha,
-                                    **common_params)
+    gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, **common_params)
     all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train)
 
 # %%
 # For the sake of comparison, we also fit a baseline model trained with the
 # usual (mean) squared error (MSE).
-gbr_ls = GradientBoostingRegressor(loss='squared_error', **common_params)
+gbr_ls = GradientBoostingRegressor(loss="squared_error", **common_params)
 all_models["mse"] = gbr_ls.fit(X_train, y_train)
 
 # %%
@@ -88,24 +87,25 @@ def f(x):
 import matplotlib.pyplot as plt
 
 
-y_pred = all_models['mse'].predict(xx)
-y_lower = all_models['q 0.05'].predict(xx)
-y_upper = all_models['q 0.95'].predict(xx)
-y_med = all_models['q 0.50'].predict(xx)
+y_pred = all_models["mse"].predict(xx)
+y_lower = all_models["q 0.05"].predict(xx)
+y_upper = all_models["q 0.95"].predict(xx)
+y_med = all_models["q 0.50"].predict(xx)
 
 fig = plt.figure(figsize=(10, 10))
-plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
-plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
-plt.plot(xx, y_med, 'r-', label='Predicted median', color="orange")
-plt.plot(xx, y_pred, 'r-', label='Predicted mean')
-plt.plot(xx, y_upper, 'k-')
-plt.plot(xx, y_lower, 'k-')
-plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
-                 label='Predicted 90% interval')
-plt.xlabel('$x$')
-plt.ylabel('$f(x)$')
+plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$")
+plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations")
+plt.plot(xx, y_med, "r-", label="Predicted median", color="orange")
+plt.plot(xx, y_pred, "r-", label="Predicted mean")
+plt.plot(xx, y_upper, "k-")
+plt.plot(xx, y_lower, "k-")
+plt.fill_between(
+    xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval"
+)
+plt.xlabel("$x$")
+plt.ylabel("$f(x)$")
 plt.ylim(-10, 25)
-plt.legend(loc='upper left')
+plt.legend(loc="upper left")
 plt.show()
 
 # %%
@@ -129,21 +129,19 @@ def f(x):
 
 def highlight_min(x):
     x_min = x.min()
-    return ['font-weight: bold' if v == x_min else ''
-            for v in x]
+    return ["font-weight: bold" if v == x_min else "" for v in x]
 
 
 results = []
 for name, gbr in sorted(all_models.items()):
-    metrics = {'model': name}
+    metrics = {"model": name}
     y_pred = gbr.predict(X_train)
     for alpha in [0.05, 0.5, 0.95]:
-        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
-            y_train, y_pred, alpha=alpha)
-    metrics['MSE'] = mean_squared_error(y_train, y_pred)
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(y_train, y_pred, alpha=alpha)
+    metrics["MSE"] = mean_squared_error(y_train, y_pred)
     results.append(metrics)
 
-pd.DataFrame(results).set_index('model').style.apply(highlight_min)
+pd.DataFrame(results).set_index("model").style.apply(highlight_min)
 
 # %%
 # One column shows all models evaluated by the same metric. The minimum number
@@ -163,15 +161,14 @@ def highlight_min(x):
 # We then do the same on the test set.
 results = []
 for name, gbr in sorted(all_models.items()):
-    metrics = {'model': name}
+    metrics = {"model": name}
     y_pred = gbr.predict(X_test)
     for alpha in [0.05, 0.5, 0.95]:
-        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
-            y_test, y_pred, alpha=alpha)
-    metrics['MSE'] = mean_squared_error(y_test, y_pred)
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(y_test, y_pred, alpha=alpha)
+    metrics["MSE"] = mean_squared_error(y_test, y_pred)
     results.append(metrics)
 
-pd.DataFrame(results).set_index('model').style.apply(highlight_min)
+pd.DataFrame(results).set_index("model").style.apply(highlight_min)
 
 
 # %%
@@ -199,16 +196,18 @@ def coverage_fraction(y, y_low, y_high):
     return np.mean(np.logical_and(y >= y_low, y <= y_high))
 
 
-coverage_fraction(y_train,
-                  all_models['q 0.05'].predict(X_train),
-                  all_models['q 0.95'].predict(X_train))
+coverage_fraction(
+    y_train,
+    all_models["q 0.05"].predict(X_train),
+    all_models["q 0.95"].predict(X_train),
+)
 
 # %%
 # On the training set the calibration is very close to the expected coverage
 # value for a 90% confidence interval.
-coverage_fraction(y_test,
-                  all_models['q 0.05'].predict(X_test),
-                  all_models['q 0.95'].predict(X_test))
+coverage_fraction(
+    y_test, all_models["q 0.05"].predict(X_test), all_models["q 0.95"].predict(X_test)
+)
 
 
 # %%
@@ -298,16 +297,17 @@ def coverage_fraction(y, y_low, y_high):
 y_upper = search_95p.predict(xx)
 
 fig = plt.figure(figsize=(10, 10))
-plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$')
-plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations')
-plt.plot(xx, y_upper, 'k-')
-plt.plot(xx, y_lower, 'k-')
-plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4,
-                 label='Predicted 90% interval')
-plt.xlabel('$x$')
-plt.ylabel('$f(x)$')
+plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$")
+plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations")
+plt.plot(xx, y_upper, "k-")
+plt.plot(xx, y_lower, "k-")
+plt.fill_between(
+    xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval"
+)
+plt.xlabel("$x$")
+plt.ylabel("$f(x)$")
 plt.ylim(-10, 25)
-plt.legend(loc='upper left')
+plt.legend(loc="upper left")
 plt.title("Prediction with tuned hyper-parameters")
 plt.show()
 
@@ -317,13 +317,9 @@ def coverage_fraction(y, y_low, y_high):
 #
 # We now quantitatively evaluate the joint-calibration of the pair of
 # estimators:
-coverage_fraction(y_train,
-                  search_05p.predict(X_train),
-                  search_95p.predict(X_train))
+coverage_fraction(y_train, search_05p.predict(X_train), search_95p.predict(X_train))
 # %%
-coverage_fraction(y_test,
-                  search_05p.predict(X_test),
-                  search_95p.predict(X_test))
+coverage_fraction(y_test, search_05p.predict(X_test), search_95p.predict(X_test))
 # %%
 # The calibration of the tuned pair is sadly not better on the test set: the
 # width of the estimated confidence interval is still too narrow.
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index 3722f4bf2066f..c258dc13babaf 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -61,13 +61,16 @@
 # :class:`~sklearn.ensemble.GradientBoostingRegressor` ).
 
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=0.1, random_state=13)
+    X, y, test_size=0.1, random_state=13
+)
 
-params = {'n_estimators': 500,
-          'max_depth': 4,
-          'min_samples_split': 5,
-          'learning_rate': 0.01,
-          'loss': 'squared_error'}
+params = {
+    "n_estimators": 500,
+    "max_depth": 4,
+    "min_samples_split": 5,
+    "learning_rate": 0.01,
+    "loss": "squared_error",
+}
 
 # %%
 # Fit regression model
@@ -89,20 +92,25 @@
 # Finally, we will visualize the results. To do that we will first compute the
 # test set deviance and then plot it against boosting iterations.
 
-test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
+test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
 for i, y_pred in enumerate(reg.staged_predict(X_test)):
     test_score[i] = reg.loss_(y_test, y_pred)
 
 fig = plt.figure(figsize=(6, 6))
 plt.subplot(1, 1, 1)
-plt.title('Deviance')
-plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-',
-         label='Training Set Deviance')
-plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
-         label='Test Set Deviance')
-plt.legend(loc='upper right')
-plt.xlabel('Boosting Iterations')
-plt.ylabel('Deviance')
+plt.title("Deviance")
+plt.plot(
+    np.arange(params["n_estimators"]) + 1,
+    reg.train_score_,
+    "b-",
+    label="Training Set Deviance",
+)
+plt.plot(
+    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
+)
+plt.legend(loc="upper right")
+plt.xlabel("Boosting Iterations")
+plt.ylabel("Deviance")
 fig.tight_layout()
 plt.show()
 
@@ -123,19 +131,23 @@
 
 feature_importance = reg.feature_importances_
 sorted_idx = np.argsort(feature_importance)
-pos = np.arange(sorted_idx.shape[0]) + .5
+pos = np.arange(sorted_idx.shape[0]) + 0.5
 fig = plt.figure(figsize=(12, 6))
 plt.subplot(1, 2, 1)
-plt.barh(pos, feature_importance[sorted_idx], align='center')
+plt.barh(pos, feature_importance[sorted_idx], align="center")
 plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
-plt.title('Feature Importance (MDI)')
+plt.title("Feature Importance (MDI)")
 
-result = permutation_importance(reg, X_test, y_test, n_repeats=10,
-                                random_state=42, n_jobs=2)
+result = permutation_importance(
+    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
+)
 sorted_idx = result.importances_mean.argsort()
 plt.subplot(1, 2, 2)
-plt.boxplot(result.importances[sorted_idx].T,
-            vert=False, labels=np.array(diabetes.feature_names)[sorted_idx])
+plt.boxplot(
+    result.importances[sorted_idx].T,
+    vert=False,
+    labels=np.array(diabetes.feature_names)[sorted_idx],
+)
 plt.title("Permutation Importance (test set)")
 fig.tight_layout()
 plt.show()
diff --git a/examples/ensemble/plot_gradient_boosting_regularization.py b/examples/ensemble/plot_gradient_boosting_regularization.py
index 6f2eb893ebe0f..5947233d60860 100644
--- a/examples/ensemble/plot_gradient_boosting_regularization.py
+++ b/examples/ensemble/plot_gradient_boosting_regularization.py
@@ -41,21 +41,31 @@
 X_train, X_test = X[:2000], X[2000:]
 y_train, y_test = y[:2000], y[2000:]
 
-original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4,
-                   'max_depth': None, 'random_state': 2, 'min_samples_split': 5}
+original_params = {
+    "n_estimators": 1000,
+    "max_leaf_nodes": 4,
+    "max_depth": None,
+    "random_state": 2,
+    "min_samples_split": 5,
+}
 
 plt.figure()
 
-for label, color, setting in [('No shrinkage', 'orange',
-                               {'learning_rate': 1.0, 'subsample': 1.0}),
-                              ('learning_rate=0.1', 'turquoise',
-                               {'learning_rate': 0.1, 'subsample': 1.0}),
-                              ('subsample=0.5', 'blue',
-                               {'learning_rate': 1.0, 'subsample': 0.5}),
-                              ('learning_rate=0.1, subsample=0.5', 'gray',
-                               {'learning_rate': 0.1, 'subsample': 0.5}),
-                              ('learning_rate=0.1, max_features=2', 'magenta',
-                               {'learning_rate': 0.1, 'max_features': 2})]:
+for label, color, setting in [
+    ("No shrinkage", "orange", {"learning_rate": 1.0, "subsample": 1.0}),
+    ("learning_rate=0.1", "turquoise", {"learning_rate": 0.1, "subsample": 1.0}),
+    ("subsample=0.5", "blue", {"learning_rate": 1.0, "subsample": 0.5}),
+    (
+        "learning_rate=0.1, subsample=0.5",
+        "gray",
+        {"learning_rate": 0.1, "subsample": 0.5},
+    ),
+    (
+        "learning_rate=0.1, max_features=2",
+        "magenta",
+        {"learning_rate": 0.1, "max_features": 2},
+    ),
+]:
     params = dict(original_params)
     params.update(setting)
 
@@ -63,17 +73,22 @@
     clf.fit(X_train, y_train)
 
     # compute test set deviance
-    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)
+    test_deviance = np.zeros((params["n_estimators"],), dtype=np.float64)
 
     for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
         # clf.loss_ assumes that y_test[i] in {0, 1}
         test_deviance[i] = clf.loss_(y_test, y_pred)
 
-    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
-             '-', color=color, label=label)
-
-plt.legend(loc='upper left')
-plt.xlabel('Boosting Iterations')
-plt.ylabel('Test Set Deviance')
+    plt.plot(
+        (np.arange(test_deviance.shape[0]) + 1)[::5],
+        test_deviance[::5],
+        "-",
+        color=color,
+        label=label,
+    )
+
+plt.legend(loc="upper left")
+plt.xlabel("Boosting Iterations")
+plt.ylabel("Test Set Deviance")
 
 plt.show()
diff --git a/examples/ensemble/plot_isolation_forest.py b/examples/ensemble/plot_isolation_forest.py
index 5370f3af3ef97..12a2bfd846279 100644
--- a/examples/ensemble/plot_isolation_forest.py
+++ b/examples/ensemble/plot_isolation_forest.py
@@ -54,17 +54,15 @@
 plt.title("IsolationForest")
 plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
 
-b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white',
-                 s=20, edgecolor='k')
-b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green',
-                 s=20, edgecolor='k')
-c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red',
-                s=20, edgecolor='k')
-plt.axis('tight')
+b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=20, edgecolor="k")
+b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="green", s=20, edgecolor="k")
+c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="red", s=20, edgecolor="k")
+plt.axis("tight")
 plt.xlim((-5, 5))
 plt.ylim((-5, 5))
-plt.legend([b1, b2, c],
-           ["training observations",
-            "new regular observations", "new abnormal observations"],
-           loc="upper left")
+plt.legend(
+    [b1, b2, c],
+    ["training observations", "new regular observations", "new abnormal observations"],
+    loc="upper left",
+)
 plt.show()
diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py
index 6146a3bb72db1..1039f69b6fbe9 100644
--- a/examples/ensemble/plot_monotonic_constraints.py
+++ b/examples/ensemble/plot_monotonic_constraints.py
@@ -33,9 +33,7 @@
 f_1 = rng.rand(n_samples)  # negative correlation with y
 X = np.c_[f_0, f_1]
 noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
-y = (5 * f_0 + np.sin(10 * np.pi * f_0) -
-     5 * f_1 - np.cos(10 * np.pi * f_1) +
-     noise)
+y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise
 
 fig, ax = plt.subplots()
 
diff --git a/examples/ensemble/plot_random_forest_embedding.py b/examples/ensemble/plot_random_forest_embedding.py
index 4d0ccd4502c31..339dab440502d 100644
--- a/examples/ensemble/plot_random_forest_embedding.py
+++ b/examples/ensemble/plot_random_forest_embedding.py
@@ -57,23 +57,24 @@
 fig = plt.figure(figsize=(9, 8))
 
 ax = plt.subplot(221)
-ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k')
+ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor="k")
 ax.set_title("Original Data (2d)")
 ax.set_xticks(())
 ax.set_yticks(())
 
 ax = plt.subplot(222)
-ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50, edgecolor='k')
-ax.set_title("Truncated SVD reduction (2d) of transformed data (%dd)" %
-             X_transformed.shape[1])
+ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50, edgecolor="k")
+ax.set_title(
+    "Truncated SVD reduction (2d) of transformed data (%dd)" % X_transformed.shape[1]
+)
 ax.set_xticks(())
 ax.set_yticks(())
 
 # Plot the decision in original space. For that, we will assign a color
 # to each point in the mesh [x_min, x_max]x[y_min, y_max].
-h = .01
-x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
-y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+h = 0.01
+x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
+y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
 # transform grid using RandomTreesEmbedding
@@ -83,7 +84,7 @@
 ax = plt.subplot(223)
 ax.set_title("Naive Bayes on Transformed data")
 ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
-ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k')
+ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor="k")
 ax.set_ylim(-1.4, 1.4)
 ax.set_xlim(-1.4, 1.4)
 ax.set_xticks(())
@@ -95,7 +96,7 @@
 ax = plt.subplot(224)
 ax.set_title("ExtraTrees predictions")
 ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
-ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k')
+ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor="k")
 ax.set_ylim(-1.4, 1.4)
 ax.set_xlim(-1.4, 1.4)
 ax.set_xticks(())
diff --git a/examples/ensemble/plot_random_forest_regression_multioutput.py b/examples/ensemble/plot_random_forest_regression_multioutput.py
index 8b7803361a60a..220f759ba40b1 100644
--- a/examples/ensemble/plot_random_forest_regression_multioutput.py
+++ b/examples/ensemble/plot_random_forest_regression_multioutput.py
@@ -37,19 +37,19 @@
 rng = np.random.RandomState(1)
 X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
 y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
-y += (0.5 - rng.rand(*y.shape))
+y += 0.5 - rng.rand(*y.shape)
 
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, train_size=400, test_size=200, random_state=4)
+    X, y, train_size=400, test_size=200, random_state=4
+)
 
 max_depth = 30
-regr_multirf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100,
-                                                          max_depth=max_depth,
-                                                          random_state=0))
+regr_multirf = MultiOutputRegressor(
+    RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)
+)
 regr_multirf.fit(X_train, y_train)
 
-regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth,
-                                random_state=2)
+regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=2)
 regr_rf.fit(X_train, y_train)
 
 # Predict on new data
@@ -60,14 +60,35 @@
 plt.figure()
 s = 50
 a = 0.4
-plt.scatter(y_test[:, 0], y_test[:, 1], edgecolor='k',
-            c="navy", s=s, marker="s", alpha=a, label="Data")
-plt.scatter(y_multirf[:, 0], y_multirf[:, 1], edgecolor='k',
-            c="cornflowerblue", s=s, alpha=a,
-            label="Multi RF score=%.2f" % regr_multirf.score(X_test, y_test))
-plt.scatter(y_rf[:, 0], y_rf[:, 1], edgecolor='k',
-            c="c", s=s, marker="^", alpha=a,
-            label="RF score=%.2f" % regr_rf.score(X_test, y_test))
+plt.scatter(
+    y_test[:, 0],
+    y_test[:, 1],
+    edgecolor="k",
+    c="navy",
+    s=s,
+    marker="s",
+    alpha=a,
+    label="Data",
+)
+plt.scatter(
+    y_multirf[:, 0],
+    y_multirf[:, 1],
+    edgecolor="k",
+    c="cornflowerblue",
+    s=s,
+    alpha=a,
+    label="Multi RF score=%.2f" % regr_multirf.score(X_test, y_test),
+)
+plt.scatter(
+    y_rf[:, 0],
+    y_rf[:, 1],
+    edgecolor="k",
+    c="c",
+    s=s,
+    marker="^",
+    alpha=a,
+    label="RF score=%.2f" % regr_rf.score(X_test, y_test),
+)
 plt.xlim([-6, 6])
 plt.ylim([-6, 6])
 plt.xlabel("target 1")
diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py
index afa48c62d8d0b..1b48d50f2c40f 100644
--- a/examples/ensemble/plot_stack_predictors.py
+++ b/examples/ensemble/plot_stack_predictors.py
@@ -23,7 +23,8 @@
 print(__doc__)
 
 from sklearn import set_config
-set_config(display='diagram')
+
+set_config(display="diagram")
 
 # %%
 # Download the dataset
@@ -54,11 +55,28 @@ def load_ames_housing():
     X = df.data
     y = df.target
 
-    features = ['YrSold', 'HeatingQC', 'Street', 'YearRemodAdd', 'Heating',
-                'MasVnrType', 'BsmtUnfSF', 'Foundation', 'MasVnrArea',
-                'MSSubClass', 'ExterQual', 'Condition2', 'GarageCars',
-                'GarageType', 'OverallQual', 'TotalBsmtSF', 'BsmtFinSF1',
-                'HouseStyle', 'MiscFeature', 'MoSold']
+    features = [
+        "YrSold",
+        "HeatingQC",
+        "Street",
+        "YearRemodAdd",
+        "Heating",
+        "MasVnrType",
+        "BsmtUnfSF",
+        "Foundation",
+        "MasVnrArea",
+        "MSSubClass",
+        "ExterQual",
+        "Condition2",
+        "GarageCars",
+        "GarageType",
+        "OverallQual",
+        "TotalBsmtSF",
+        "BsmtFinSF1",
+        "HouseStyle",
+        "MiscFeature",
+        "MoSold",
+    ]
 
     X = X[features]
     X, y = shuffle(X, y, random_state=0)
@@ -105,11 +123,13 @@ def load_ames_housing():
 from sklearn.preprocessing import OrdinalEncoder
 
 cat_tree_processor = OrdinalEncoder(
-    handle_unknown="use_encoded_value", unknown_value=-1)
+    handle_unknown="use_encoded_value", unknown_value=-1
+)
 num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)
 
 tree_preprocessor = make_column_transformer(
-    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector))
+    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector)
+)
 tree_preprocessor
 
 # %%
@@ -121,10 +141,12 @@ def load_ames_housing():
 
 cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
 num_linear_processor = make_pipeline(
-    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True))
+    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
+)
 
 linear_preprocessor = make_column_transformer(
-    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector))
+    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector)
+)
 linear_preprocessor
 
 # %%
@@ -155,27 +177,28 @@ def load_ames_housing():
 # %%
 from sklearn.ensemble import RandomForestRegressor
 
-rf_pipeline = make_pipeline(
-    tree_preprocessor, RandomForestRegressor(random_state=42))
+rf_pipeline = make_pipeline(tree_preprocessor, RandomForestRegressor(random_state=42))
 rf_pipeline
 
 # %%
 from sklearn.ensemble import HistGradientBoostingRegressor
 
 gbdt_pipeline = make_pipeline(
-    tree_preprocessor, HistGradientBoostingRegressor(random_state=0))
+    tree_preprocessor, HistGradientBoostingRegressor(random_state=0)
+)
 gbdt_pipeline
 
 # %%
 from sklearn.ensemble import StackingRegressor
 from sklearn.linear_model import RidgeCV
 
-estimators = [('Random Forest', rf_pipeline),
-              ('Lasso', lasso_pipeline),
-              ('Gradient Boosting', gbdt_pipeline)]
+estimators = [
+    ("Random Forest", rf_pipeline),
+    ("Lasso", lasso_pipeline),
+    ("Gradient Boosting", gbdt_pipeline),
+]
 
-stacking_regressor = StackingRegressor(
-    estimators=estimators, final_estimator=RidgeCV())
+stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
 stacking_regressor
 
 # %%
@@ -197,52 +220,58 @@ def load_ames_housing():
 
 def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time):
     """Scatter plot of the predicted vs true targets."""
-    ax.plot([y_true.min(), y_true.max()],
-            [y_true.min(), y_true.max()],
-            '--r', linewidth=2)
+    ax.plot(
+        [y_true.min(), y_true.max()], [y_true.min(), y_true.max()], "--r", linewidth=2
+    )
     ax.scatter(y_true, y_pred, alpha=0.2)
 
-    ax.spines['top'].set_visible(False)
-    ax.spines['right'].set_visible(False)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
     ax.get_xaxis().tick_bottom()
     ax.get_yaxis().tick_left()
-    ax.spines['left'].set_position(('outward', 10))
-    ax.spines['bottom'].set_position(('outward', 10))
+    ax.spines["left"].set_position(("outward", 10))
+    ax.spines["bottom"].set_position(("outward", 10))
     ax.set_xlim([y_true.min(), y_true.max()])
     ax.set_ylim([y_true.min(), y_true.max()])
-    ax.set_xlabel('Measured')
-    ax.set_ylabel('Predicted')
-    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
-                          edgecolor='none', linewidth=0)
-    ax.legend([extra], [scores], loc='upper left')
-    title = title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time)
+    ax.set_xlabel("Measured")
+    ax.set_ylabel("Predicted")
+    extra = plt.Rectangle(
+        (0, 0), 0, 0, fc="w", fill=False, edgecolor="none", linewidth=0
+    )
+    ax.legend([extra], [scores], loc="upper left")
+    title = title + "\n Evaluation in {:.2f} seconds".format(elapsed_time)
     ax.set_title(title)
 
 
 fig, axs = plt.subplots(2, 2, figsize=(9, 7))
 axs = np.ravel(axs)
 
-for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor',
-                                               stacking_regressor)]):
+for ax, (name, est) in zip(
+    axs, estimators + [("Stacking Regressor", stacking_regressor)]
+):
     start_time = time.time()
-    score = cross_validate(est, X, y,
-                           scoring=['r2', 'neg_mean_absolute_error'],
-                           n_jobs=-1, verbose=0)
+    score = cross_validate(
+        est, X, y, scoring=["r2", "neg_mean_absolute_error"], n_jobs=-1, verbose=0
+    )
     elapsed_time = time.time() - start_time
 
     y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
 
     plot_regression_results(
-        ax, y, y_pred,
+        ax,
+        y,
+        y_pred,
         name,
-        (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$')
-        .format(np.mean(score['test_r2']),
-                np.std(score['test_r2']),
-                -np.mean(score['test_neg_mean_absolute_error']),
-                np.std(score['test_neg_mean_absolute_error'])),
-        elapsed_time)
-
-plt.suptitle('Single predictors versus stacked predictors')
+        (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format(
+            np.mean(score["test_r2"]),
+            np.std(score["test_r2"]),
+            -np.mean(score["test_neg_mean_absolute_error"]),
+            np.std(score["test_neg_mean_absolute_error"]),
+        ),
+        elapsed_time,
+    )
+
+plt.suptitle("Single predictors versus stacked predictors")
 plt.tight_layout()
 plt.subplots_adjust(top=0.9)
 plt.show()
diff --git a/examples/ensemble/plot_voting_decision_regions.py b/examples/ensemble/plot_voting_decision_regions.py
index fdfda74947f5f..7b2cb278c9035 100644
--- a/examples/ensemble/plot_voting_decision_regions.py
+++ b/examples/ensemble/plot_voting_decision_regions.py
@@ -43,10 +43,12 @@
 # Training classifiers
 clf1 = DecisionTreeClassifier(max_depth=4)
 clf2 = KNeighborsClassifier(n_neighbors=7)
-clf3 = SVC(gamma=.1, kernel='rbf', probability=True)
-eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
-                                    ('svc', clf3)],
-                        voting='soft', weights=[2, 1, 2])
+clf3 = SVC(gamma=0.1, kernel="rbf", probability=True)
+eclf = VotingClassifier(
+    estimators=[("dt", clf1), ("knn", clf2), ("svc", clf3)],
+    voting="soft",
+    weights=[2, 1, 2],
+)
 
 clf1.fit(X, y)
 clf2.fit(X, y)
@@ -56,22 +58,21 @@
 # Plotting decision regions
 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
-                     np.arange(y_min, y_max, 0.1))
+xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
 
-f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
+f, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8))
 
-for idx, clf, tt in zip(product([0, 1], [0, 1]),
-                        [clf1, clf2, clf3, eclf],
-                        ['Decision Tree (depth=4)', 'KNN (k=7)',
-                         'Kernel SVM', 'Soft Voting']):
+for idx, clf, tt in zip(
+    product([0, 1], [0, 1]),
+    [clf1, clf2, clf3, eclf],
+    ["Decision Tree (depth=4)", "KNN (k=7)", "Kernel SVM", "Soft Voting"],
+):
 
     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
     Z = Z.reshape(xx.shape)
 
     axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
-    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
-                                  s=20, edgecolor='k')
+    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
     axarr[idx[0], idx[1]].set_title(tt)
 
 plt.show()
diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py
index 1e65e7d725964..311539f36fc75 100644
--- a/examples/ensemble/plot_voting_probas.py
+++ b/examples/ensemble/plot_voting_probas.py
@@ -38,9 +38,11 @@
 X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
 y = np.array([1, 1, 2, 2])
 
-eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
-                        voting='soft',
-                        weights=[1, 1, 5])
+eclf = VotingClassifier(
+    estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
+    voting="soft",
+    weights=[1, 1, 5],
+)
 
 # predict class probabilities for all classifiers
 probas = [c.fit(X, y).predict_proba(X) for c in (clf1, clf2, clf3, eclf)]
@@ -59,28 +61,36 @@
 fig, ax = plt.subplots()
 
 # bars for classifier 1-3
-p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width,
-            color='green', edgecolor='k')
-p2 = ax.bar(ind + width, np.hstack(([class2_1[:-1], [0]])), width,
-            color='lightgreen', edgecolor='k')
+p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width, color="green", edgecolor="k")
+p2 = ax.bar(
+    ind + width,
+    np.hstack(([class2_1[:-1], [0]])),
+    width,
+    color="lightgreen",
+    edgecolor="k",
+)
 
 # bars for VotingClassifier
-p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width,
-            color='blue', edgecolor='k')
-p4 = ax.bar(ind + width, [0, 0, 0, class2_1[-1]], width,
-            color='steelblue', edgecolor='k')
+p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width, color="blue", edgecolor="k")
+p4 = ax.bar(
+    ind + width, [0, 0, 0, class2_1[-1]], width, color="steelblue", edgecolor="k"
+)
 
 # plot annotations
-plt.axvline(2.8, color='k', linestyle='dashed')
+plt.axvline(2.8, color="k", linestyle="dashed")
 ax.set_xticks(ind + width)
-ax.set_xticklabels(['LogisticRegression\nweight 1',
-                    'GaussianNB\nweight 1',
-                    'RandomForestClassifier\nweight 5',
-                    'VotingClassifier\n(average probabilities)'],
-                   rotation=40,
-                   ha='right')
+ax.set_xticklabels(
+    [
+        "LogisticRegression\nweight 1",
+        "GaussianNB\nweight 1",
+        "RandomForestClassifier\nweight 5",
+        "VotingClassifier\n(average probabilities)",
+    ],
+    rotation=40,
+    ha="right",
+)
 plt.ylim([0, 1])
-plt.title('Class probabilities for sample 1 by different classifiers')
-plt.legend([p1[0], p2[0]], ['class 1', 'class 2'], loc='upper left')
+plt.title("Class probabilities for sample 1 by different classifiers")
+plt.legend([p1[0], p2[0]], ["class 1", "class 2"], loc="upper left")
 plt.tight_layout()
 plt.show()
diff --git a/examples/ensemble/plot_voting_regressor.py b/examples/ensemble/plot_voting_regressor.py
index 106efbac2461d..bc0c0b059e9e6 100644
--- a/examples/ensemble/plot_voting_regressor.py
+++ b/examples/ensemble/plot_voting_regressor.py
@@ -51,7 +51,7 @@
 reg2.fit(X, y)
 reg3.fit(X, y)
 
-ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
+ereg = VotingRegressor([("gb", reg1), ("rf", reg2), ("lr", reg3)])
 ereg.fit(X, y)
 
 # %%
@@ -75,16 +75,15 @@
 # prediction made by :class:`~ensemble.VotingRegressor`.
 
 plt.figure()
-plt.plot(pred1, 'gd', label='GradientBoostingRegressor')
-plt.plot(pred2, 'b^', label='RandomForestRegressor')
-plt.plot(pred3, 'ys', label='LinearRegression')
-plt.plot(pred4, 'r*', ms=10, label='VotingRegressor')
-
-plt.tick_params(axis='x', which='both', bottom=False, top=False,
-                labelbottom=False)
-plt.ylabel('predicted')
-plt.xlabel('training samples')
+plt.plot(pred1, "gd", label="GradientBoostingRegressor")
+plt.plot(pred2, "b^", label="RandomForestRegressor")
+plt.plot(pred3, "ys", label="LinearRegression")
+plt.plot(pred4, "r*", ms=10, label="VotingRegressor")
+
+plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
+plt.ylabel("predicted")
+plt.xlabel("training samples")
 plt.legend(loc="best")
-plt.title('Regressor predictions and their average')
+plt.title("Regressor predictions and their average")
 
 plt.show()
diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py
index d71abd8fe6455..aa836111f081d 100644
--- a/examples/exercises/plot_cv_diabetes.py
+++ b/examples/exercises/plot_cv_diabetes.py
@@ -27,28 +27,28 @@
 lasso = Lasso(random_state=0, max_iter=10000)
 alphas = np.logspace(-4, -0.5, 30)
 
-tuned_parameters = [{'alpha': alphas}]
+tuned_parameters = [{"alpha": alphas}]
 n_folds = 5
 
 clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
 clf.fit(X, y)
-scores = clf.cv_results_['mean_test_score']
-scores_std = clf.cv_results_['std_test_score']
+scores = clf.cv_results_["mean_test_score"]
+scores_std = clf.cv_results_["std_test_score"]
 plt.figure().set_size_inches(8, 6)
 plt.semilogx(alphas, scores)
 
 # plot error lines showing +/- std. errors of the scores
 std_error = scores_std / np.sqrt(n_folds)
 
-plt.semilogx(alphas, scores + std_error, 'b--')
-plt.semilogx(alphas, scores - std_error, 'b--')
+plt.semilogx(alphas, scores + std_error, "b--")
+plt.semilogx(alphas, scores - std_error, "b--")
 
 # alpha=0.2 controls the translucency of the fill color
 plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)
 
-plt.ylabel('CV score +/- std error')
-plt.xlabel('alpha')
-plt.axhline(np.max(scores), linestyle='--', color='.5')
+plt.ylabel("CV score +/- std error")
+plt.xlabel("alpha")
+plt.axhline(np.max(scores), linestyle="--", color=".5")
 plt.xlim([alphas[0], alphas[-1]])
 
 # #############################################################################
@@ -62,15 +62,17 @@
 lasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=10000)
 k_fold = KFold(3)
 
-print("Answer to the bonus question:",
-      "how much can you trust the selection of alpha?")
+print("Answer to the bonus question:", "how much can you trust the selection of alpha?")
 print()
 print("Alpha parameters maximising the generalization score on different")
 print("subsets of the data:")
 for k, (train, test) in enumerate(k_fold.split(X, y)):
     lasso_cv.fit(X[train], y[train])
-    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
-          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))
+    print(
+        "[fold {0}] alpha: {1:.5f}, score: {2:.5f}".format(
+            k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])
+        )
+    )
 print()
 print("Answer: Not very much since we obtained different alphas for different")
 print("subsets of the data and moreover, the scores for these alphas differ")
diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py
index 4f4ef8cc761e6..f350444395be7 100644
--- a/examples/exercises/plot_cv_digits.py
+++ b/examples/exercises/plot_cv_digits.py
@@ -17,7 +17,7 @@
 
 X, y = datasets.load_digits(return_X_y=True)
 
-svc = svm.SVC(kernel='linear')
+svc = svm.SVC(kernel="linear")
 C_s = np.logspace(-10, 0, 10)
 
 scores = list()
@@ -30,13 +30,14 @@
 
 # Do the plotting
 import matplotlib.pyplot as plt
+
 plt.figure()
 plt.semilogx(C_s, scores)
-plt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')
-plt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')
+plt.semilogx(C_s, np.array(scores) + np.array(scores_std), "b--")
+plt.semilogx(C_s, np.array(scores) - np.array(scores_std), "b--")
 locs, labels = plt.yticks()
 plt.yticks(locs, list(map(lambda x: "%g" % x, locs)))
-plt.ylabel('CV score')
-plt.xlabel('Parameter C')
+plt.ylabel("CV score")
+plt.xlabel("Parameter C")
 plt.ylim(0, 1.1)
 plt.show()
diff --git a/examples/exercises/plot_digits_classification_exercise.py b/examples/exercises/plot_digits_classification_exercise.py
index f5f01687d03eb..638631f80d066 100644
--- a/examples/exercises/plot_digits_classification_exercise.py
+++ b/examples/exercises/plot_digits_classification_exercise.py
@@ -19,14 +19,16 @@
 
 n_samples = len(X_digits)
 
-X_train = X_digits[:int(.9 * n_samples)]
-y_train = y_digits[:int(.9 * n_samples)]
-X_test = X_digits[int(.9 * n_samples):]
-y_test = y_digits[int(.9 * n_samples):]
+X_train = X_digits[: int(0.9 * n_samples)]
+y_train = y_digits[: int(0.9 * n_samples)]
+X_test = X_digits[int(0.9 * n_samples) :]
+y_test = y_digits[int(0.9 * n_samples) :]
 
 knn = neighbors.KNeighborsClassifier()
 logistic = linear_model.LogisticRegression(max_iter=1000)
 
-print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
-print('LogisticRegression score: %f'
-      % logistic.fit(X_train, y_train).score(X_test, y_test))
+print("KNN score: %f" % knn.fit(X_train, y_train).score(X_test, y_test))
+print(
+    "LogisticRegression score: %f"
+    % logistic.fit(X_train, y_train).score(X_test, y_test)
+)
diff --git a/examples/exercises/plot_iris_exercise.py b/examples/exercises/plot_iris_exercise.py
index 39723a2271f5d..98586c311697e 100644
--- a/examples/exercises/plot_iris_exercise.py
+++ b/examples/exercises/plot_iris_exercise.py
@@ -29,26 +29,28 @@
 X = X[order]
 y = y[order].astype(float)
 
-X_train = X[:int(.9 * n_sample)]
-y_train = y[:int(.9 * n_sample)]
-X_test = X[int(.9 * n_sample):]
-y_test = y[int(.9 * n_sample):]
+X_train = X[: int(0.9 * n_sample)]
+y_train = y[: int(0.9 * n_sample)]
+X_test = X[int(0.9 * n_sample) :]
+y_test = y[int(0.9 * n_sample) :]
 
 # fit the model
-for kernel in ('linear', 'rbf', 'poly'):
+for kernel in ("linear", "rbf", "poly"):
     clf = svm.SVC(kernel=kernel, gamma=10)
     clf.fit(X_train, y_train)
 
     plt.figure()
     plt.clf()
-    plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired,
-                edgecolor='k', s=20)
+    plt.scatter(
+        X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired, edgecolor="k", s=20
+    )
 
     # Circle out the test data
-    plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none',
-                zorder=10, edgecolor='k')
+    plt.scatter(
+        X_test[:, 0], X_test[:, 1], s=80, facecolors="none", zorder=10, edgecolor="k"
+    )
 
-    plt.axis('tight')
+    plt.axis("tight")
     x_min = X[:, 0].min()
     x_max = X[:, 0].max()
     y_min = X[:, 1].min()
@@ -60,8 +62,14 @@
     # Put the result into a color plot
     Z = Z.reshape(XX.shape)
     plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
-    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
-                linestyles=['--', '-', '--'], levels=[-.5, 0, .5])
+    plt.contour(
+        XX,
+        YY,
+        Z,
+        colors=["k", "k", "k"],
+        linestyles=["--", "-", "--"],
+        levels=[-0.5, 0, 0.5],
+    )
 
     plt.title(kernel)
 plt.show()
diff --git a/examples/feature_selection/plot_f_test_vs_mi.py b/examples/feature_selection/plot_f_test_vs_mi.py
index d9359380bfa96..d6fef3e62da0d 100644
--- a/examples/feature_selection/plot_f_test_vs_mi.py
+++ b/examples/feature_selection/plot_f_test_vs_mi.py
@@ -40,10 +40,9 @@
 plt.figure(figsize=(15, 5))
 for i in range(3):
     plt.subplot(1, 3, i + 1)
-    plt.scatter(X[:, i], y, edgecolor='black', s=20)
+    plt.scatter(X[:, i], y, edgecolor="black", s=20)
     plt.xlabel("$x_{}$".format(i + 1), fontsize=14)
     if i == 0:
         plt.ylabel("$y$", fontsize=14)
-    plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]),
-              fontsize=16)
+    plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), fontsize=16)
 plt.show()
diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py
index 1e4ef6a81bba8..a68019568e015 100644
--- a/examples/feature_selection/plot_feature_selection.py
+++ b/examples/feature_selection/plot_feature_selection.py
@@ -44,9 +44,7 @@
 X = np.hstack((X, E))
 
 # Split dataset to select feature and evaluate the classifier
-X_train, X_test, y_train, y_test = train_test_split(
-        X, y, stratify=y, random_state=0
-)
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
 
 plt.figure(1)
 plt.clf()
@@ -61,38 +59,47 @@
 selector.fit(X_train, y_train)
 scores = -np.log10(selector.pvalues_)
 scores /= scores.max()
-plt.bar(X_indices - .45, scores, width=.2,
-        label=r'Univariate score ($-Log(p_{value})$)')
+plt.bar(
+    X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
+)
 
 # #############################################################################
 # Compare to the weights of an SVM
 clf = make_pipeline(MinMaxScaler(), LinearSVC())
 clf.fit(X_train, y_train)
-print('Classification accuracy without selecting features: {:.3f}'
-      .format(clf.score(X_test, y_test)))
+print(
+    "Classification accuracy without selecting features: {:.3f}".format(
+        clf.score(X_test, y_test)
+    )
+)
 
 svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
 svm_weights /= svm_weights.sum()
 
-plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight')
+plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")
 
-clf_selected = make_pipeline(
-        SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC()
-)
+clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
 clf_selected.fit(X_train, y_train)
-print('Classification accuracy after univariate feature selection: {:.3f}'
-      .format(clf_selected.score(X_test, y_test)))
+print(
+    "Classification accuracy after univariate feature selection: {:.3f}".format(
+        clf_selected.score(X_test, y_test)
+    )
+)
 
 svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
 svm_weights_selected /= svm_weights_selected.sum()
 
-plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
-        width=.2, label='SVM weights after selection')
+plt.bar(
+    X_indices[selector.get_support()] - 0.05,
+    svm_weights_selected,
+    width=0.2,
+    label="SVM weights after selection",
+)
 
 
 plt.title("Comparing feature selection")
-plt.xlabel('Feature number')
+plt.xlabel("Feature number")
 plt.yticks(())
-plt.axis('tight')
-plt.legend(loc='upper right')
+plt.axis("tight")
+plt.legend(loc="upper right")
 plt.show()
diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py
index 871c894ee0711..a9a426a0e7b47 100644
--- a/examples/feature_selection/plot_feature_selection_pipeline.py
+++ b/examples/feature_selection/plot_feature_selection_pipeline.py
@@ -11,7 +11,8 @@
 
 print(__doc__)
 from sklearn import set_config
-set_config(display='diagram')
+
+set_config(display="diagram")
 
 # %%
 # We will start by generating a binary classification dataset. Subsequently, we
@@ -21,8 +22,13 @@
 from sklearn.model_selection import train_test_split
 
 X, y = make_classification(
-    n_features=20, n_informative=3, n_redundant=0, n_classes=2,
-    n_clusters_per_class=2, random_state=42)
+    n_features=20,
+    n_informative=3,
+    n_redundant=0,
+    n_classes=2,
+    n_clusters_per_class=2,
+    random_state=42,
+)
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
 
 # %%
diff --git a/examples/feature_selection/plot_rfe_with_cross_validation.py b/examples/feature_selection/plot_rfe_with_cross_validation.py
index 71acf5eace22b..160fd5a826376 100644
--- a/examples/feature_selection/plot_rfe_with_cross_validation.py
+++ b/examples/feature_selection/plot_rfe_with_cross_validation.py
@@ -15,18 +15,29 @@
 from sklearn.datasets import make_classification
 
 # Build a classification task using 3 informative features
-X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
-                           n_redundant=2, n_repeated=0, n_classes=8,
-                           n_clusters_per_class=1, random_state=0)
+X, y = make_classification(
+    n_samples=1000,
+    n_features=25,
+    n_informative=3,
+    n_redundant=2,
+    n_repeated=0,
+    n_classes=8,
+    n_clusters_per_class=1,
+    random_state=0,
+)
 
 # Create the RFE object and compute a cross-validated score.
 svc = SVC(kernel="linear")
 # The "accuracy" scoring shows the proportion of correct classifications
 
 min_features_to_select = 1  # Minimum number of features to consider
-rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
-              scoring='accuracy',
-              min_features_to_select=min_features_to_select)
+rfecv = RFECV(
+    estimator=svc,
+    step=1,
+    cv=StratifiedKFold(2),
+    scoring="accuracy",
+    min_features_to_select=min_features_to_select,
+)
 rfecv.fit(X, y)
 
 print("Optimal number of features : %d" % rfecv.n_features_)
@@ -35,7 +46,8 @@
 plt.figure()
 plt.xlabel("Number of features selected")
 plt.ylabel("Cross validation score (accuracy)")
-plt.plot(range(min_features_to_select,
-               len(rfecv.grid_scores_) + min_features_to_select),
-         rfecv.grid_scores_)
+plt.plot(
+    range(min_features_to_select, len(rfecv.grid_scores_) + min_features_to_select),
+    rfecv.grid_scores_,
+)
 plt.show()
diff --git a/examples/feature_selection/plot_select_from_model_diabetes.py b/examples/feature_selection/plot_select_from_model_diabetes.py
index fdc57dc867f9e..353774acd113e 100644
--- a/examples/feature_selection/plot_select_from_model_diabetes.py
+++ b/examples/feature_selection/plot_select_from_model_diabetes.py
@@ -77,8 +77,7 @@
 tic = time()
 sfm = SelectFromModel(lasso, threshold=threshold).fit(X, y)
 toc = time()
-print("Features selected by SelectFromModel: "
-      f"{feature_names[sfm.get_support()]}")
+print(f"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}")
 print(f"Done in {toc - tic:.3f}s")
 
 # %%
@@ -100,20 +99,26 @@
 from sklearn.feature_selection import SequentialFeatureSelector
 
 tic_fwd = time()
-sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select=2,
-                                        direction='forward').fit(X, y)
+sfs_forward = SequentialFeatureSelector(
+    lasso, n_features_to_select=2, direction="forward"
+).fit(X, y)
 toc_fwd = time()
 
 tic_bwd = time()
-sfs_backward = SequentialFeatureSelector(lasso, n_features_to_select=2,
-                                         direction='backward').fit(X, y)
+sfs_backward = SequentialFeatureSelector(
+    lasso, n_features_to_select=2, direction="backward"
+).fit(X, y)
 toc_bwd = time()
 
-print("Features selected by forward sequential selection: "
-      f"{feature_names[sfs_forward.get_support()]}")
+print(
+    "Features selected by forward sequential selection: "
+    f"{feature_names[sfs_forward.get_support()]}"
+)
 print(f"Done in {toc_fwd - tic_fwd:.3f}s")
-print("Features selected by backward sequential selection: "
-      f"{feature_names[sfs_backward.get_support()]}")
+print(
+    "Features selected by backward sequential selection: "
+    f"{feature_names[sfs_backward.get_support()]}"
+)
 print(f"Done in {toc_bwd - tic_bwd:.3f}s")
 
 # %%
diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py
index 1eb771673b0d6..dd60fa2d6dda8 100644
--- a/examples/gaussian_process/plot_compare_gpr_krr.py
+++ b/examples/gaussian_process/plot_compare_gpr_krr.py
@@ -69,17 +69,20 @@
 y += 3 * (0.5 - rng.rand(X.shape[0]))  # add noise
 
 # Fit KernelRidge with parameter selection based on 5-fold cross validation
-param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3],
-              "kernel": [ExpSineSquared(l, p)
-                         for l in np.logspace(-2, 2, 10)
-                         for p in np.logspace(0, 2, 10)]}
+param_grid = {
+    "alpha": [1e0, 1e-1, 1e-2, 1e-3],
+    "kernel": [
+        ExpSineSquared(l, p)
+        for l in np.logspace(-2, 2, 10)
+        for p in np.logspace(0, 2, 10)
+    ],
+}
 kr = GridSearchCV(KernelRidge(), param_grid=param_grid)
 stime = time.time()
 kr.fit(X, y)
 print("Time for KRR fitting: %.3f" % (time.time() - stime))
 
-gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \
-    + WhiteKernel(1e-1)
+gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) + WhiteKernel(1e-1)
 gpr = GaussianProcessRegressor(kernel=gp_kernel)
 stime = time.time()
 gpr.fit(X, y)
@@ -98,24 +101,22 @@
 
 stime = time.time()
 y_gpr, y_std = gpr.predict(X_plot, return_std=True)
-print("Time for GPR prediction with standard-deviation: %.3f"
-      % (time.time() - stime))
+print("Time for GPR prediction with standard-deviation: %.3f" % (time.time() - stime))
 
 # Plot results
 plt.figure(figsize=(10, 5))
 lw = 2
-plt.scatter(X, y, c='k', label='data')
-plt.plot(X_plot, np.sin(X_plot), color='navy', lw=lw, label='True')
-plt.plot(X_plot, y_kr, color='turquoise', lw=lw,
-         label='KRR (%s)' % kr.best_params_)
-plt.plot(X_plot, y_gpr, color='darkorange', lw=lw,
-         label='GPR (%s)' % gpr.kernel_)
-plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='darkorange',
-                 alpha=0.2)
-plt.xlabel('data')
-plt.ylabel('target')
+plt.scatter(X, y, c="k", label="data")
+plt.plot(X_plot, np.sin(X_plot), color="navy", lw=lw, label="True")
+plt.plot(X_plot, y_kr, color="turquoise", lw=lw, label="KRR (%s)" % kr.best_params_)
+plt.plot(X_plot, y_gpr, color="darkorange", lw=lw, label="GPR (%s)" % gpr.kernel_)
+plt.fill_between(
+    X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color="darkorange", alpha=0.2
+)
+plt.xlabel("data")
+plt.ylabel("target")
 plt.xlim(0, 20)
 plt.ylim(-4, 4)
-plt.title('GPR versus Kernel Ridge')
-plt.legend(loc="best",  scatterpoints=1, prop={'size': 8})
+plt.title("GPR versus Kernel Ridge")
+plt.legend(loc="best", scatterpoints=1, prop={"size": 8})
 plt.show()
diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index 1d00b9f330400..ab22134e2048c 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -42,37 +42,58 @@
 y = np.array(X[:, 0] > 2.5, dtype=int)
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
-gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
-                                   optimizer=None)
+gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), optimizer=None)
 gp_fix.fit(X[:train_size], y[:train_size])
 
 gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
 gp_opt.fit(X[:train_size], y[:train_size])
 
-print("Log Marginal Likelihood (initial): %.3f"
-      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
-print("Log Marginal Likelihood (optimized): %.3f"
-      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))
-
-print("Accuracy: %.3f (initial) %.3f (optimized)"
-      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
-         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
-print("Log-loss: %.3f (initial) %.3f (optimized)"
-      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
-         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))
+print(
+    "Log Marginal Likelihood (initial): %.3f"
+    % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)
+)
+print(
+    "Log Marginal Likelihood (optimized): %.3f"
+    % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)
+)
+
+print(
+    "Accuracy: %.3f (initial) %.3f (optimized)"
+    % (
+        accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
+        accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])),
+    )
+)
+print(
+    "Log-loss: %.3f (initial) %.3f (optimized)"
+    % (
+        log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
+        log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]),
+    )
+)
 
 
 # Plot posteriors
 plt.figure()
-plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data",
-            edgecolors=(0, 0, 0))
-plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
-            edgecolors=(0, 0, 0))
+plt.scatter(
+    X[:train_size, 0], y[:train_size], c="k", label="Train data", edgecolors=(0, 0, 0)
+)
+plt.scatter(
+    X[train_size:, 0], y[train_size:], c="g", label="Test data", edgecolors=(0, 0, 0)
+)
 X_ = np.linspace(0, 5, 100)
-plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
-         label="Initial kernel: %s" % gp_fix.kernel_)
-plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
-         label="Optimized kernel: %s" % gp_opt.kernel_)
+plt.plot(
+    X_,
+    gp_fix.predict_proba(X_[:, np.newaxis])[:, 1],
+    "r",
+    label="Initial kernel: %s" % gp_fix.kernel_,
+)
+plt.plot(
+    X_,
+    gp_opt.predict_proba(X_[:, np.newaxis])[:, 1],
+    "b",
+    label="Optimized kernel: %s" % gp_opt.kernel_,
+)
 plt.xlabel("Feature")
 plt.ylabel("Class 1 probability")
 plt.xlim(0, 5)
@@ -84,13 +105,20 @@
 theta0 = np.logspace(0, 8, 30)
 theta1 = np.logspace(-1, 1, 29)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
-LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]]))
-        for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
+LML = [
+    [
+        gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]]))
+        for i in range(Theta0.shape[0])
+    ]
+    for j in range(Theta0.shape[1])
+]
 LML = np.array(LML).T
-plt.plot(np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1],
-         'ko', zorder=10)
-plt.plot(np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1],
-         'ko', zorder=10)
+plt.plot(
+    np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1], "ko", zorder=10
+)
+plt.plot(
+    np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1], "ko", zorder=10
+)
 plt.pcolor(Theta0, Theta1, LML)
 plt.xscale("log")
 plt.yscale("log")
diff --git a/examples/gaussian_process/plot_gpc_iris.py b/examples/gaussian_process/plot_gpc_iris.py
index fe1030131709e..4aa4121664ece 100644
--- a/examples/gaussian_process/plot_gpc_iris.py
+++ b/examples/gaussian_process/plot_gpc_iris.py
@@ -21,7 +21,7 @@
 X = iris.data[:, :2]  # we only take the first two features.
 y = np.array(iris.target, dtype=int)
 
-h = .02  # step size in the mesh
+h = 0.02  # step size in the mesh
 
 kernel = 1.0 * RBF([1.0])
 gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
@@ -31,8 +31,7 @@
 # create a mesh to plot in
 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                     np.arange(y_min, y_max, h))
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
 titles = ["Isotropic RBF", "Anisotropic RBF"]
 plt.figure(figsize=(10, 5))
@@ -48,16 +47,16 @@
     plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower")
 
     # Plot also the training points
-    plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y],
-                edgecolors=(0, 0, 0))
-    plt.xlabel('Sepal length')
-    plt.ylabel('Sepal width')
+    plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y], edgecolors=(0, 0, 0))
+    plt.xlabel("Sepal length")
+    plt.ylabel("Sepal width")
     plt.xlim(xx.min(), xx.max())
     plt.ylim(yy.min(), yy.max())
     plt.xticks(())
     plt.yticks(())
-    plt.title("%s, LML: %.3f" %
-              (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta)))
+    plt.title(
+        "%s, LML: %.3f" % (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta))
+    )
 
 plt.tight_layout()
 plt.show()
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index 7b2a14cf4fc41..cd0fd740e2fc3 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -31,18 +31,22 @@
 def g(x):
     """The function to predict (classification will then consist in predicting
     whether g(x) <= 0 or not)"""
-    return 5. - x[:, 1] - .5 * x[:, 0] ** 2.
+    return 5.0 - x[:, 1] - 0.5 * x[:, 0] ** 2.0
 
 
 # Design of experiments
-X = np.array([[-4.61611719, -6.00099547],
-              [4.10469096, 5.32782448],
-              [0.00000000, -0.50000000],
-              [-6.17289014, -4.6984743],
-              [1.3109306, -6.93271427],
-              [-5.03823144, 3.10584743],
-              [-2.87600388, 6.74310541],
-              [5.21301203, 4.26386883]])
+X = np.array(
+    [
+        [-4.61611719, -6.00099547],
+        [4.10469096, 5.32782448],
+        [0.00000000, -0.50000000],
+        [-6.17289014, -4.6984743],
+        [1.3109306, -6.93271427],
+        [-5.03823144, 3.10584743],
+        [-2.87600388, 6.74310541],
+        [5.21301203, 4.26386883],
+    ]
+)
 
 # Observations
 y = np.array(g(X) > 0, dtype=int)
@@ -55,8 +59,7 @@ def g(x):
 
 # Evaluate real function and the predicted probability
 res = 50
-x1, x2 = np.meshgrid(np.linspace(- lim, lim, res),
-                     np.linspace(- lim, lim, res))
+x1, x2 = np.meshgrid(np.linspace(-lim, lim, res), np.linspace(-lim, lim, res))
 xx = np.vstack([x1.reshape(x1.size), x2.reshape(x2.size)]).T
 
 y_true = g(xx)
@@ -67,37 +70,33 @@ def g(x):
 # Plot the probabilistic classification iso-values
 fig = plt.figure(1)
 ax = fig.gca()
-ax.axes.set_aspect('equal')
+ax.axes.set_aspect("equal")
 plt.xticks([])
 plt.yticks([])
 ax.set_xticklabels([])
 ax.set_yticklabels([])
-plt.xlabel('$x_1$')
-plt.ylabel('$x_2$')
-
-cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8,
-                 extent=(-lim, lim, -lim, lim))
-norm = plt.matplotlib.colors.Normalize(vmin=0., vmax=0.9)
-cb = plt.colorbar(cax, ticks=[0., 0.2, 0.4, 0.6, 0.8, 1.], norm=norm)
-cb.set_label(r'${\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) \leq 0\right]$')
+plt.xlabel("$x_1$")
+plt.ylabel("$x_2$")
+
+cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8, extent=(-lim, lim, -lim, lim))
+norm = plt.matplotlib.colors.Normalize(vmin=0.0, vmax=0.9)
+cb = plt.colorbar(cax, ticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0], norm=norm)
+cb.set_label(r"${\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) \leq 0\right]$")
 plt.clim(0, 1)
 
-plt.plot(X[y <= 0, 0], X[y <= 0, 1], 'r.', markersize=12)
+plt.plot(X[y <= 0, 0], X[y <= 0, 1], "r.", markersize=12)
 
-plt.plot(X[y > 0, 0], X[y > 0, 1], 'b.', markersize=12)
+plt.plot(X[y > 0, 0], X[y > 0, 1], "b.", markersize=12)
 
-plt.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot')
+plt.contour(x1, x2, y_true, [0.0], colors="k", linestyles="dashdot")
 
-cs = plt.contour(x1, x2, y_prob, [0.666], colors='b',
-                 linestyles='solid')
+cs = plt.contour(x1, x2, y_prob, [0.666], colors="b", linestyles="solid")
 plt.clabel(cs, fontsize=11)
 
-cs = plt.contour(x1, x2, y_prob, [0.5], colors='k',
-                 linestyles='dashed')
+cs = plt.contour(x1, x2, y_prob, [0.5], colors="k", linestyles="dashed")
 plt.clabel(cs, fontsize=11)
 
-cs = plt.contour(x1, x2, y_prob, [0.334], colors='r',
-                 linestyles='solid')
+cs = plt.contour(x1, x2, y_prob, [0.334], colors="r", linestyles="solid")
 plt.clabel(cs, fontsize=11)
 
 plt.show()
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 04f014e13e8ae..011f36ada1021 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -22,15 +22,14 @@
 from sklearn.gaussian_process.kernels import RBF, DotProduct
 
 
-xx, yy = np.meshgrid(np.linspace(-3, 3, 50),
-                     np.linspace(-3, 3, 50))
+xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50))
 rng = np.random.RandomState(0)
 X = rng.randn(200, 2)
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
 plt.figure(figsize=(10, 5))
-kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]
+kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0) ** 2]
 for i, kernel in enumerate(kernels):
     clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)
 
@@ -39,20 +38,25 @@
     Z = Z.reshape(xx.shape)
 
     plt.subplot(1, 2, i + 1)
-    image = plt.imshow(Z, interpolation='nearest',
-                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-                       aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
-    contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2,
-                           colors=['k'])
-    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
-                edgecolors=(0, 0, 0))
+    image = plt.imshow(
+        Z,
+        interpolation="nearest",
+        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+        aspect="auto",
+        origin="lower",
+        cmap=plt.cm.PuOr_r,
+    )
+    contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, colors=["k"])
+    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors=(0, 0, 0))
     plt.xticks(())
     plt.yticks(())
     plt.axis([-3, 3, -3, 3])
     plt.colorbar(image)
-    plt.title("%s\n Log-Marginal-Likelihood:%.3f"
-              % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)),
-              fontsize=12)
+    plt.title(
+        "%s\n Log-Marginal-Likelihood:%.3f"
+        % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)),
+        fontsize=12,
+    )
 
 plt.tight_layout()
 plt.show()
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 2cc751438cbd4..7afe41d77cc63 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -67,8 +67,12 @@
 from matplotlib import pyplot as plt
 from sklearn.datasets import fetch_openml
 from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels \
-    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
+from sklearn.gaussian_process.kernels import (
+    RBF,
+    WhiteKernel,
+    RationalQuadratic,
+    ExpSineSquared,
+)
 
 print(__doc__)
 
@@ -102,52 +106,54 @@ def load_mauna_loa_atmospheric_co2():
 X, y = load_mauna_loa_atmospheric_co2()
 
 # Kernel with parameters given in GPML book
-k1 = 66.0**2 * RBF(length_scale=67.0)  # long term smooth rising trend
-k2 = 2.4**2 * RBF(length_scale=90.0) \
-    * ExpSineSquared(length_scale=1.3, periodicity=1.0)  # seasonal component
+k1 = 66.0 ** 2 * RBF(length_scale=67.0)  # long term smooth rising trend
+k2 = (
+    2.4 ** 2
+    * RBF(length_scale=90.0)
+    * ExpSineSquared(length_scale=1.3, periodicity=1.0)
+)  # seasonal component
 # medium term irregularity
-k3 = 0.66**2 \
-    * RationalQuadratic(length_scale=1.2, alpha=0.78)
-k4 = 0.18**2 * RBF(length_scale=0.134) \
-    + WhiteKernel(noise_level=0.19**2)  # noise terms
+k3 = 0.66 ** 2 * RationalQuadratic(length_scale=1.2, alpha=0.78)
+k4 = 0.18 ** 2 * RBF(length_scale=0.134) + WhiteKernel(
+    noise_level=0.19 ** 2
+)  # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0,
-                              optimizer=None, normalize_y=True)
+gp = GaussianProcessRegressor(
+    kernel=kernel_gpml, alpha=0, optimizer=None, normalize_y=True
+)
 gp.fit(X, y)
 
 print("GPML kernel: %s" % gp.kernel_)
-print("Log-marginal-likelihood: %.3f"
-      % gp.log_marginal_likelihood(gp.kernel_.theta))
+print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.kernel_.theta))
 
 # Kernel with optimized parameters
-k1 = 50.0**2 * RBF(length_scale=50.0)  # long term smooth rising trend
-k2 = 2.0**2 * RBF(length_scale=100.0) \
-    * ExpSineSquared(length_scale=1.0, periodicity=1.0,
-                     periodicity_bounds="fixed")  # seasonal component
+k1 = 50.0 ** 2 * RBF(length_scale=50.0)  # long term smooth rising trend
+k2 = (
+    2.0 ** 2
+    * RBF(length_scale=100.0)
+    * ExpSineSquared(length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed")
+)  # seasonal component
 # medium term irregularities
-k3 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
-k4 = 0.1**2 * RBF(length_scale=0.1) \
-    + WhiteKernel(noise_level=0.1**2,
-                  noise_level_bounds=(1e-5, np.inf))  # noise terms
+k3 = 0.5 ** 2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
+k4 = 0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel(
+    noise_level=0.1 ** 2, noise_level_bounds=(1e-5, np.inf)
+)  # noise terms
 kernel = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel, alpha=0,
-                              normalize_y=True)
+gp = GaussianProcessRegressor(kernel=kernel, alpha=0, normalize_y=True)
 gp.fit(X, y)
 
 print("\nLearned kernel: %s" % gp.kernel_)
-print("Log-marginal-likelihood: %.3f"
-      % gp.log_marginal_likelihood(gp.kernel_.theta))
+print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.kernel_.theta))
 
 X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
 y_pred, y_std = gp.predict(X_, return_std=True)
 
 # Illustration
-plt.scatter(X, y, c='k')
+plt.scatter(X, y, c="k")
 plt.plot(X_, y_pred)
-plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std,
-                 alpha=0.5, color='k')
+plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std, alpha=0.5, color="k")
 plt.xlim(X_.min(), X_.max())
 plt.xlabel("Year")
 plt.ylabel(r"CO$_2$ in ppm")
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index 5f8ce2cd0fe96..0bba4827cd685 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -36,40 +36,50 @@
 
 # First run
 plt.figure()
-kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) \
-    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
-gp = GaussianProcessRegressor(kernel=kernel,
-                              alpha=0.0).fit(X, y)
+kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
+    noise_level=1, noise_level_bounds=(1e-10, 1e1)
+)
+gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
-plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
-plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                 y_mean + np.sqrt(np.diag(y_cov)),
-                 alpha=0.5, color='k')
-plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
-plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0))
-plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-          % (kernel, gp.kernel_,
-             gp.log_marginal_likelihood(gp.kernel_.theta)))
+plt.plot(X_, y_mean, "k", lw=3, zorder=9)
+plt.fill_between(
+    X_,
+    y_mean - np.sqrt(np.diag(y_cov)),
+    y_mean + np.sqrt(np.diag(y_cov)),
+    alpha=0.5,
+    color="k",
+)
+plt.plot(X_, 0.5 * np.sin(3 * X_), "r", lw=3, zorder=9)
+plt.scatter(X[:, 0], y, c="r", s=50, zorder=10, edgecolors=(0, 0, 0))
+plt.title(
+    "Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
+    % (kernel, gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta))
+)
 plt.tight_layout()
 
 # Second run
 plt.figure()
-kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
-    + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-10, 1e+1))
-gp = GaussianProcessRegressor(kernel=kernel,
-                              alpha=0.0).fit(X, y)
+kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
+    noise_level=1e-5, noise_level_bounds=(1e-10, 1e1)
+)
+gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
-plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
-plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                 y_mean + np.sqrt(np.diag(y_cov)),
-                 alpha=0.5, color='k')
-plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
-plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0))
-plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-          % (kernel, gp.kernel_,
-             gp.log_marginal_likelihood(gp.kernel_.theta)))
+plt.plot(X_, y_mean, "k", lw=3, zorder=9)
+plt.fill_between(
+    X_,
+    y_mean - np.sqrt(np.diag(y_cov)),
+    y_mean + np.sqrt(np.diag(y_cov)),
+    alpha=0.5,
+    color="k",
+)
+plt.plot(X_, 0.5 * np.sin(3 * X_), "r", lw=3, zorder=9)
+plt.scatter(X[:, 0], y, c="r", s=50, zorder=10, edgecolors=(0, 0, 0))
+plt.title(
+    "Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
+    % (kernel, gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta))
+)
 plt.tight_layout()
 
 # Plot LML landscape
@@ -77,15 +87,19 @@
 theta0 = np.logspace(-2, 3, 49)
 theta1 = np.logspace(-2, 0, 50)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
-LML = [[gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]]))
-        for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
+LML = [
+    [
+        gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]]))
+        for i in range(Theta0.shape[0])
+    ]
+    for j in range(Theta0.shape[1])
+]
 LML = np.array(LML).T
 
 vmin, vmax = (-LML).min(), (-LML).max()
 vmax = 50
 level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), 50), decimals=1)
-plt.contour(Theta0, Theta1, -LML,
-            levels=level, norm=LogNorm(vmin=vmin, vmax=vmax))
+plt.contour(Theta0, Theta1, -LML, levels=level, norm=LogNorm(vmin=vmin, vmax=vmax))
 plt.colorbar()
 plt.xscale("log")
 plt.yscale("log")
diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index e11071cec5bfd..0058589f7a786 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -41,7 +41,7 @@ def f(x):
 
 # ----------------------------------------------------------------------
 #  First the noiseless case
-X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
+X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T
 
 # Observations
 y = f(X).ravel()
@@ -63,17 +63,21 @@ def f(x):
 # Plot the function, the prediction and the 95% confidence interval based on
 # the MSE
 plt.figure()
-plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$')
-plt.plot(X, y, 'r.', markersize=10, label='Observations')
-plt.plot(x, y_pred, 'b-', label='Prediction')
-plt.fill(np.concatenate([x, x[::-1]]),
-         np.concatenate([y_pred - 1.9600 * sigma,
-                        (y_pred + 1.9600 * sigma)[::-1]]),
-         alpha=.5, fc='b', ec='None', label='95% confidence interval')
-plt.xlabel('$x$')
-plt.ylabel('$f(x)$')
+plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$")
+plt.plot(X, y, "r.", markersize=10, label="Observations")
+plt.plot(x, y_pred, "b-", label="Prediction")
+plt.fill(
+    np.concatenate([x, x[::-1]]),
+    np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]),
+    alpha=0.5,
+    fc="b",
+    ec="None",
+    label="95% confidence interval",
+)
+plt.xlabel("$x$")
+plt.ylabel("$f(x)$")
 plt.ylim(-10, 20)
-plt.legend(loc='upper left')
+plt.legend(loc="upper left")
 
 # ----------------------------------------------------------------------
 # now the noisy case
@@ -87,8 +91,7 @@ def f(x):
 y += noise
 
 # Instantiate a Gaussian Process model
-gp = GaussianProcessRegressor(kernel=kernel, alpha=dy ** 2,
-                              n_restarts_optimizer=10)
+gp = GaussianProcessRegressor(kernel=kernel, alpha=dy ** 2, n_restarts_optimizer=10)
 
 # Fit to data using Maximum Likelihood Estimation of the parameters
 gp.fit(X, y)
@@ -99,16 +102,20 @@ def f(x):
 # Plot the function, the prediction and the 95% confidence interval based on
 # the MSE
 plt.figure()
-plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$')
-plt.errorbar(X.ravel(), y, dy, fmt='r.', markersize=10, label='Observations')
-plt.plot(x, y_pred, 'b-', label='Prediction')
-plt.fill(np.concatenate([x, x[::-1]]),
-         np.concatenate([y_pred - 1.9600 * sigma,
-                        (y_pred + 1.9600 * sigma)[::-1]]),
-         alpha=.5, fc='b', ec='None', label='95% confidence interval')
-plt.xlabel('$x$')
-plt.ylabel('$f(x)$')
+plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$")
+plt.errorbar(X.ravel(), y, dy, fmt="r.", markersize=10, label="Observations")
+plt.plot(x, y_pred, "b-", label="Prediction")
+plt.fill(
+    np.concatenate([x, x[::-1]]),
+    np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]),
+    alpha=0.5,
+    fc="b",
+    ec="None",
+    label="95% confidence interval",
+)
+plt.xlabel("$x$")
+plt.ylabel("$f(x)$")
 plt.ylim(-10, 20)
-plt.legend(loc='upper left')
+plt.legend(loc="upper left")
 
 plt.show()
diff --git a/examples/gaussian_process/plot_gpr_on_structured_data.py b/examples/gaussian_process/plot_gpr_on_structured_data.py
index 64a84ab38647a..59af31664cb74 100644
--- a/examples/gaussian_process/plot_gpr_on_structured_data.py
+++ b/examples/gaussian_process/plot_gpr_on_structured_data.py
@@ -48,44 +48,43 @@
 
 
 class SequenceKernel(GenericKernelMixin, Kernel):
-    '''
+    """
     A minimal (but valid) convolutional kernel for sequences of variable
-    lengths.'''
-    def __init__(self,
-                 baseline_similarity=0.5,
-                 baseline_similarity_bounds=(1e-5, 1)):
+    lengths."""
+
+    def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)):
         self.baseline_similarity = baseline_similarity
         self.baseline_similarity_bounds = baseline_similarity_bounds
 
     @property
     def hyperparameter_baseline_similarity(self):
-        return Hyperparameter("baseline_similarity",
-                              "numeric",
-                              self.baseline_similarity_bounds)
+        return Hyperparameter(
+            "baseline_similarity", "numeric", self.baseline_similarity_bounds
+        )
 
     def _f(self, s1, s2):
-        '''
+        """
         kernel value between a pair of sequences
-        '''
-        return sum([1.0 if c1 == c2 else self.baseline_similarity
-                   for c1 in s1
-                   for c2 in s2])
+        """
+        return sum(
+            [1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2]
+        )
 
     def _g(self, s1, s2):
-        '''
+        """
         kernel derivative between a pair of sequences
-        '''
-        return sum([0.0 if c1 == c2 else 1.0
-                    for c1 in s1
-                    for c2 in s2])
+        """
+        return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])
 
     def __call__(self, X, Y=None, eval_gradient=False):
         if Y is None:
             Y = X
 
         if eval_gradient:
-            return (np.array([[self._f(x, y) for y in Y] for x in X]),
-                    np.array([[[self._g(x, y)] for y in Y] for x in X]))
+            return (
+                np.array([[self._f(x, y) for y in Y] for x in X]),
+                np.array([[[self._g(x, y)] for y in Y] for x in X]),
+            )
         else:
             return np.array([[self._f(x, y) for y in Y] for x in X])
 
@@ -103,28 +102,28 @@ def clone_with_theta(self, theta):
 
 kernel = SequenceKernel()
 
-'''
+"""
 Sequence similarity matrix under the kernel
 ===========================================
-'''
+"""
 
-X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])
+X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"])
 
 K = kernel(X)
 D = kernel.diag(X)
 
 plt.figure(figsize=(8, 5))
-plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))
+plt.imshow(np.diag(D ** -0.5).dot(K).dot(np.diag(D ** -0.5)))
 plt.xticks(np.arange(len(X)), X)
 plt.yticks(np.arange(len(X)), X)
-plt.title('Sequence similarity under the kernel')
+plt.title("Sequence similarity under the kernel")
 
-'''
+"""
 Regression
 ==========
-'''
+"""
 
-X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])
+X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"])
 Y = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])
 
 training_idx = [0, 1, 3, 4]
@@ -132,43 +131,58 @@ def clone_with_theta(self, theta):
 gp.fit(X[training_idx], Y[training_idx])
 
 plt.figure(figsize=(8, 5))
-plt.bar(np.arange(len(X)), gp.predict(X), color='b', label='prediction')
-plt.bar(training_idx, Y[training_idx], width=0.2, color='r',
-        alpha=1, label='training')
+plt.bar(np.arange(len(X)), gp.predict(X), color="b", label="prediction")
+plt.bar(training_idx, Y[training_idx], width=0.2, color="r", alpha=1, label="training")
 plt.xticks(np.arange(len(X)), X)
-plt.title('Regression on sequences')
+plt.title("Regression on sequences")
 plt.legend()
 
-'''
+"""
 Classification
 ==============
-'''
+"""
 
-X_train = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT'])
+X_train = np.array(["AGCT", "CGA", "TAAC", "TCG", "CTTT", "TGCT"])
 # whether there are 'A's in the sequence
 Y_train = np.array([True, True, True, False, False, False])
 
 gp = GaussianProcessClassifier(kernel)
 gp.fit(X_train, Y_train)
 
-X_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C']
+X_test = ["AAA", "ATAG", "CTC", "CT", "C"]
 Y_test = [True, True, False, False, False]
 
 plt.figure(figsize=(8, 5))
-plt.scatter(np.arange(len(X_train)), [1.0 if c else -1.0 for c in Y_train],
-            s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0),
-            label='training')
-plt.scatter(len(X_train) + np.arange(len(X_test)),
-            [1.0 if c else -1.0 for c in Y_test],
-            s=100, marker='o', edgecolor='none', facecolor='r', label='truth')
-plt.scatter(len(X_train) + np.arange(len(X_test)),
-            [1.0 if c else -1.0 for c in gp.predict(X_test)],
-            s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2,
-            label='prediction')
-plt.xticks(np.arange(len(X_train) + len(X_test)),
-           np.concatenate((X_train, X_test)))
+plt.scatter(
+    np.arange(len(X_train)),
+    [1.0 if c else -1.0 for c in Y_train],
+    s=100,
+    marker="o",
+    edgecolor="none",
+    facecolor=(1, 0.75, 0),
+    label="training",
+)
+plt.scatter(
+    len(X_train) + np.arange(len(X_test)),
+    [1.0 if c else -1.0 for c in Y_test],
+    s=100,
+    marker="o",
+    edgecolor="none",
+    facecolor="r",
+    label="truth",
+)
+plt.scatter(
+    len(X_train) + np.arange(len(X_test)),
+    [1.0 if c else -1.0 for c in gp.predict(X_test)],
+    s=100,
+    marker="x",
+    edgecolor=(0, 1.0, 0.3),
+    linewidth=2,
+    label="prediction",
+)
+plt.xticks(np.arange(len(X_train) + len(X_test)), np.concatenate((X_train, X_test)))
 plt.yticks([-1, 1], [False, True])
-plt.title('Classification on sequences')
+plt.title("Classification on sequences")
 plt.legend()
 
 plt.show()
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index 877cfd37c0067..77f015fd9ae12 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -71,10 +71,9 @@
 br_estimator = BayesianRidge()
 score_full_data = pd.DataFrame(
     cross_val_score(
-        br_estimator, X_full, y_full, scoring='neg_mean_squared_error',
-        cv=N_SPLITS
+        br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
     ),
-    columns=['Full Data']
+    columns=["Full Data"],
 )
 
 # Add a single missing value to each row
@@ -86,39 +85,35 @@
 
 # Estimate the score after imputation (mean and median strategies)
 score_simple_imputer = pd.DataFrame()
-for strategy in ('mean', 'median'):
+for strategy in ("mean", "median"):
     estimator = make_pipeline(
-        SimpleImputer(missing_values=np.nan, strategy=strategy),
-        br_estimator
+        SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator
     )
     score_simple_imputer[strategy] = cross_val_score(
-        estimator, X_missing, y_missing, scoring='neg_mean_squared_error',
-        cv=N_SPLITS
+        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
     )
 
 # Estimate the score after iterative imputation of the missing values
 # with different estimators
 estimators = [
     BayesianRidge(),
-    DecisionTreeRegressor(max_features='sqrt', random_state=0),
+    DecisionTreeRegressor(max_features="sqrt", random_state=0),
     ExtraTreesRegressor(n_estimators=10, random_state=0),
-    KNeighborsRegressor(n_neighbors=15)
+    KNeighborsRegressor(n_neighbors=15),
 ]
 score_iterative_imputer = pd.DataFrame()
 for impute_estimator in estimators:
     estimator = make_pipeline(
-        IterativeImputer(random_state=0, estimator=impute_estimator),
-        br_estimator
+        IterativeImputer(random_state=0, estimator=impute_estimator), br_estimator
+    )
+    score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(
+        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
     )
-    score_iterative_imputer[impute_estimator.__class__.__name__] = \
-        cross_val_score(
-            estimator, X_missing, y_missing, scoring='neg_mean_squared_error',
-            cv=N_SPLITS
-        )
 
 scores = pd.concat(
     [score_full_data, score_simple_imputer, score_iterative_imputer],
-    keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1
+    keys=["Original", "SimpleImputer", "IterativeImputer"],
+    axis=1,
 )
 
 # plot california housing results
@@ -126,8 +121,8 @@
 means = -scores.mean()
 errors = scores.std()
 means.plot.barh(xerr=errors, ax=ax)
-ax.set_title('California Housing Regression with Different Imputation Methods')
-ax.set_xlabel('MSE (smaller is better)')
+ax.set_title("California Housing Regression with Different Imputation Methods")
+ax.set_xlabel("MSE (smaller is better)")
 ax.set_yticks(np.arange(means.shape[0]))
 ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
 plt.tight_layout(pad=1)
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 3ea5c61427ff0..ec9d881af85b1 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -65,7 +65,7 @@ def add_missing_values(X_full, y_full):
     n_missing_samples = int(n_samples * missing_rate)
 
     missing_samples = np.zeros(n_samples, dtype=bool)
-    missing_samples[: n_missing_samples] = True
+    missing_samples[:n_missing_samples] = True
 
     rng.shuffle(missing_samples)
     missing_features = rng.randint(0, n_features, n_missing_samples)
@@ -76,11 +76,9 @@ def add_missing_values(X_full, y_full):
     return X_missing, y_missing
 
 
-X_miss_california, y_miss_california = add_missing_values(
-    X_california, y_california)
+X_miss_california, y_miss_california = add_missing_values(X_california, y_california)
 
-X_miss_diabetes, y_miss_diabetes = add_missing_values(
-    X_diabetes, y_diabetes)
+X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes)
 
 
 # %%
@@ -115,9 +113,9 @@ def add_missing_values(X_full, y_full):
 
 def get_scores_for_imputer(imputer, X_missing, y_missing):
     estimator = make_pipeline(imputer, regressor)
-    impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                    scoring='neg_mean_squared_error',
-                                    cv=N_SPLITS)
+    impute_scores = cross_val_score(
+        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
+    )
     return impute_scores
 
 
@@ -136,16 +134,15 @@ def get_scores_for_imputer(imputer, X_missing, y_missing):
 
 
 def get_full_score(X_full, y_full):
-    full_scores = cross_val_score(regressor, X_full, y_full,
-                                  scoring='neg_mean_squared_error',
-                                  cv=N_SPLITS)
+    full_scores = cross_val_score(
+        regressor, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
+    )
     return full_scores.mean(), full_scores.std()
 
 
-mses_california[0], stds_california[0] = get_full_score(X_california,
-                                                        y_california)
+mses_california[0], stds_california[0] = get_full_score(X_california, y_california)
 mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes)
-x_labels.append('Full data')
+x_labels.append("Full data")
 
 
 # %%
@@ -159,17 +156,20 @@ def get_full_score(X_full, y_full):
 
 def get_impute_zero_score(X_missing, y_missing):
 
-    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True,
-                            strategy='constant', fill_value=0)
+    imputer = SimpleImputer(
+        missing_values=np.nan, add_indicator=True, strategy="constant", fill_value=0
+    )
     zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
     return zero_impute_scores.mean(), zero_impute_scores.std()
 
 
 mses_california[1], stds_california[1] = get_impute_zero_score(
-    X_miss_california, y_miss_california)
-mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes,
-                                                           y_miss_diabetes)
-x_labels.append('Zero imputation')
+    X_miss_california, y_miss_california
+)
+mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(
+    X_miss_diabetes, y_miss_diabetes
+)
+x_labels.append("Zero imputation")
 
 
 # %%
@@ -179,6 +179,7 @@ def get_impute_zero_score(X_missing, y_missing):
 # :class:`~sklearn.impute.KNNImputer` imputes missing values using the weighted
 # or unweighted mean of the desired number of nearest neighbors.
 
+
 def get_impute_knn_score(X_missing, y_missing):
     imputer = KNNImputer(missing_values=np.nan, add_indicator=True)
     knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
@@ -186,10 +187,12 @@ def get_impute_knn_score(X_missing, y_missing):
 
 
 mses_california[2], stds_california[2] = get_impute_knn_score(
-    X_miss_california, y_miss_california)
-mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes,
-                                                          y_miss_diabetes)
-x_labels.append('KNN Imputation')
+    X_miss_california, y_miss_california
+)
+mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(
+    X_miss_diabetes, y_miss_diabetes
+)
+x_labels.append("KNN Imputation")
 
 
 # %%
@@ -197,18 +200,18 @@ def get_impute_knn_score(X_missing, y_missing):
 # -------------------------------
 #
 
+
 def get_impute_mean(X_missing, y_missing):
-    imputer = SimpleImputer(missing_values=np.nan, strategy="mean",
-                            add_indicator=True)
+    imputer = SimpleImputer(missing_values=np.nan, strategy="mean", add_indicator=True)
     mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
     return mean_impute_scores.mean(), mean_impute_scores.std()
 
 
-mses_california[3], stds_california[3] = get_impute_mean(X_miss_california,
-                                                         y_miss_california)
-mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes,
-                                                     y_miss_diabetes)
-x_labels.append('Mean Imputation')
+mses_california[3], stds_california[3] = get_impute_mean(
+    X_miss_california, y_miss_california
+)
+mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, y_miss_diabetes)
+x_labels.append("Mean Imputation")
 
 
 # %%
@@ -223,21 +226,26 @@ def get_impute_mean(X_missing, y_missing):
 # to potentially improve performance.
 #
 
+
 def get_impute_iterative(X_missing, y_missing):
-    imputer = IterativeImputer(missing_values=np.nan, add_indicator=True,
-                               random_state=0, n_nearest_features=5,
-                               sample_posterior=True)
-    iterative_impute_scores = get_scores_for_imputer(imputer,
-                                                     X_missing,
-                                                     y_missing)
+    imputer = IterativeImputer(
+        missing_values=np.nan,
+        add_indicator=True,
+        random_state=0,
+        n_nearest_features=5,
+        sample_posterior=True,
+    )
+    iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
     return iterative_impute_scores.mean(), iterative_impute_scores.std()
 
 
 mses_california[4], stds_california[4] = get_impute_iterative(
-    X_miss_california, y_miss_california)
-mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes,
-                                                          y_miss_diabetes)
-x_labels.append('Iterative Imputation')
+    X_miss_california, y_miss_california
+)
+mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(
+    X_miss_diabetes, y_miss_diabetes
+)
+x_labels.append("Iterative Imputation")
 
 mses_diabetes = mses_diabetes * -1
 mses_california = mses_california * -1
@@ -255,34 +263,45 @@ def get_impute_iterative(X_missing, y_missing):
 n_bars = len(mses_diabetes)
 xval = np.arange(n_bars)
 
-colors = ['r', 'g', 'b', 'orange', 'black']
+colors = ["r", "g", "b", "orange", "black"]
 
 # plot diabetes results
 plt.figure(figsize=(12, 6))
 ax1 = plt.subplot(121)
 for j in xval:
-    ax1.barh(j, mses_diabetes[j], xerr=stds_diabetes[j],
-             color=colors[j], alpha=0.6, align='center')
-
-ax1.set_title('Imputation Techniques with Diabetes Data')
-ax1.set_xlim(left=np.min(mses_diabetes) * 0.9,
-             right=np.max(mses_diabetes) * 1.1)
+    ax1.barh(
+        j,
+        mses_diabetes[j],
+        xerr=stds_diabetes[j],
+        color=colors[j],
+        alpha=0.6,
+        align="center",
+    )
+
+ax1.set_title("Imputation Techniques with Diabetes Data")
+ax1.set_xlim(left=np.min(mses_diabetes) * 0.9, right=np.max(mses_diabetes) * 1.1)
 ax1.set_yticks(xval)
-ax1.set_xlabel('MSE')
+ax1.set_xlabel("MSE")
 ax1.invert_yaxis()
 ax1.set_yticklabels(x_labels)
 
 # plot california dataset results
 ax2 = plt.subplot(122)
 for j in xval:
-    ax2.barh(j, mses_california[j], xerr=stds_california[j],
-             color=colors[j], alpha=0.6, align='center')
-
-ax2.set_title('Imputation Techniques with California Data')
+    ax2.barh(
+        j,
+        mses_california[j],
+        xerr=stds_california[j],
+        color=colors[j],
+        alpha=0.6,
+        align="center",
+    )
+
+ax2.set_title("Imputation Techniques with California Data")
 ax2.set_yticks(xval)
-ax2.set_xlabel('MSE')
+ax2.set_xlabel("MSE")
 ax2.invert_yaxis()
-ax2.set_yticklabels([''] * n_bars)
+ax2.set_yticklabels([""] * n_bars)
 
 plt.show()
 
diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py
index 766a01fbeb12d..c67b24326e128 100644
--- a/examples/inspection/plot_permutation_importance.py
+++ b/examples/inspection/plot_permutation_importance.py
@@ -52,30 +52,32 @@
 #   values).
 X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
 rng = np.random.RandomState(seed=42)
-X['random_cat'] = rng.randint(3, size=X.shape[0])
-X['random_num'] = rng.randn(X.shape[0])
+X["random_cat"] = rng.randint(3, size=X.shape[0])
+X["random_num"] = rng.randn(X.shape[0])
 
-categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat']
-numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num']
+categorical_columns = ["pclass", "sex", "embarked", "random_cat"]
+numerical_columns = ["age", "sibsp", "parch", "fare", "random_num"]
 
 X = X[categorical_columns + numerical_columns]
 
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, stratify=y, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
 
-categorical_encoder = OneHotEncoder(handle_unknown='ignore')
-numerical_pipe = Pipeline([
-    ('imputer', SimpleImputer(strategy='mean'))
-])
+categorical_encoder = OneHotEncoder(handle_unknown="ignore")
+numerical_pipe = Pipeline([("imputer", SimpleImputer(strategy="mean"))])
 
 preprocessing = ColumnTransformer(
-    [('cat', categorical_encoder, categorical_columns),
-     ('num', numerical_pipe, numerical_columns)])
-
-rf = Pipeline([
-    ('preprocess', preprocessing),
-    ('classifier', RandomForestClassifier(random_state=42))
-])
+    [
+        ("cat", categorical_encoder, categorical_columns),
+        ("num", numerical_pipe, numerical_columns),
+    ]
+)
+
+rf = Pipeline(
+    [
+        ("preprocess", preprocessing),
+        ("classifier", RandomForestClassifier(random_state=42)),
+    ]
+)
 rf.fit(X_train, y_train)
 
 # %%
@@ -118,13 +120,11 @@
 #   therefore do not reflect the ability of feature to be useful to make
 #   predictions that generalize to the test set (when the model has enough
 #   capacity).
-ohe = (rf.named_steps['preprocess']
-         .named_transformers_['cat'])
+ohe = rf.named_steps["preprocess"].named_transformers_["cat"]
 feature_names = ohe.get_feature_names_out(categorical_columns)
 feature_names = np.r_[feature_names, numerical_columns]
 
-tree_feature_importances = (
-    rf.named_steps['classifier'].feature_importances_)
+tree_feature_importances = rf.named_steps["classifier"].feature_importances_
 sorted_idx = tree_feature_importances.argsort()
 
 y_ticks = np.arange(0, len(feature_names))
@@ -144,13 +144,15 @@
 #
 # Also note that both random features have very low importances (close to 0) as
 # expected.
-result = permutation_importance(rf, X_test, y_test, n_repeats=10,
-                                random_state=42, n_jobs=2)
+result = permutation_importance(
+    rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
+)
 sorted_idx = result.importances_mean.argsort()
 
 fig, ax = plt.subplots()
-ax.boxplot(result.importances[sorted_idx].T,
-           vert=False, labels=X_test.columns[sorted_idx])
+ax.boxplot(
+    result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx]
+)
 ax.set_title("Permutation Importances (test set)")
 fig.tight_layout()
 plt.show()
@@ -162,13 +164,15 @@
 # plots is a confirmation that the RF model has enough capacity to use that
 # random numerical feature to overfit. You can further confirm this by
 # re-running this example with constrained RF with min_samples_leaf=10.
-result = permutation_importance(rf, X_train, y_train, n_repeats=10,
-                                random_state=42, n_jobs=2)
+result = permutation_importance(
+    rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
+)
 sorted_idx = result.importances_mean.argsort()
 
 fig, ax = plt.subplots()
-ax.boxplot(result.importances[sorted_idx].T,
-           vert=False, labels=X_train.columns[sorted_idx])
+ax.boxplot(
+    result.importances[sorted_idx].T, vert=False, labels=X_train.columns[sorted_idx]
+)
 ax.set_title("Permutation Importances (train set)")
 fig.tight_layout()
 plt.show()
diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py
index 12fcd5ebf7bcd..23edc95583483 100644
--- a/examples/inspection/plot_permutation_importance_multicollinear.py
+++ b/examples/inspection/plot_permutation_importance_multicollinear.py
@@ -51,21 +51,22 @@
 # computed above: some feature must be important. The permutation importance
 # is calculated on the training set to show how much the model relies on each
 # feature during training.
-result = permutation_importance(clf, X_train, y_train, n_repeats=10,
-                                random_state=42)
+result = permutation_importance(clf, X_train, y_train, n_repeats=10, random_state=42)
 perm_sorted_idx = result.importances_mean.argsort()
 
 tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
 tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5
 
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
-ax1.barh(tree_indices,
-         clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
+ax1.barh(tree_indices, clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
 ax1.set_yticks(tree_indices)
 ax1.set_yticklabels(data.feature_names[tree_importance_sorted_idx])
 ax1.set_ylim((0, len(clf.feature_importances_)))
-ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False,
-            labels=data.feature_names[perm_sorted_idx])
+ax2.boxplot(
+    result.importances[perm_sorted_idx].T,
+    vert=False,
+    labels=data.feature_names[perm_sorted_idx],
+)
 fig.tight_layout()
 plt.show()
 
@@ -82,7 +83,7 @@
 corr = spearmanr(X).correlation
 
 # Ensure the correlation matrix is symmetric
-corr = (corr + corr.T)/2
+corr = (corr + corr.T) / 2
 np.fill_diagonal(corr, 1)
 
 # We convert the correlation matrix to a distance matrix before performing
@@ -92,13 +93,13 @@
 dendro = hierarchy.dendrogram(
     dist_linkage, labels=data.feature_names.tolist(), ax=ax1, leaf_rotation=90
 )
-dendro_idx = np.arange(0, len(dendro['ivl']))
+dendro_idx = np.arange(0, len(dendro["ivl"]))
 
-ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
+ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
 ax2.set_xticks(dendro_idx)
 ax2.set_yticks(dendro_idx)
-ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
-ax2.set_yticklabels(dendro['ivl'])
+ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
+ax2.set_yticklabels(dendro["ivl"])
 fig.tight_layout()
 plt.show()
 
@@ -108,7 +109,7 @@
 # keep, select those features from our dataset, and train a new random forest.
 # The test accuracy of the new random forest did not change much compared to
 # the random forest trained on the complete dataset.
-cluster_ids = hierarchy.fcluster(dist_linkage, 1, criterion='distance')
+cluster_ids = hierarchy.fcluster(dist_linkage, 1, criterion="distance")
 cluster_id_to_feature_ids = defaultdict(list)
 for idx, cluster_id in enumerate(cluster_ids):
     cluster_id_to_feature_ids[cluster_id].append(idx)
@@ -119,5 +120,8 @@
 
 clf_sel = RandomForestClassifier(n_estimators=100, random_state=42)
 clf_sel.fit(X_train_sel, y_train)
-print("Accuracy on test data with features removed: {:.2f}".format(
-      clf_sel.score(X_test_sel, y_test)))
+print(
+    "Accuracy on test data with features removed: {:.2f}".format(
+        clf_sel.score(X_test_sel, y_test)
+    )
+)
diff --git a/examples/kernel_approximation/plot_scalable_poly_kernels.py b/examples/kernel_approximation/plot_scalable_poly_kernels.py
index 7d026dbcf16d6..c52b9fd59668d 100644
--- a/examples/kernel_approximation/plot_scalable_poly_kernels.py
+++ b/examples/kernel_approximation/plot_scalable_poly_kernels.py
@@ -50,9 +50,9 @@
 # To actually reproduce the results in the original Tensor Sketch paper,
 # select 100,000 for training.
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5_000,
-                                                    test_size=10_000,
-                                                    random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, train_size=5_000, test_size=10_000, random_state=42
+)
 
 # %%
 # Now scale features to the range [0, 1] to match the format of the dataset in
@@ -104,11 +104,15 @@
     ps_lsvm_score = 0
     for _ in range(n_runs):
 
-        pipeline = Pipeline(steps=[("kernel_approximator",
-                                    PolynomialCountSketch(
-                                        n_components=n_components,
-                                        degree=4)),
-                                   ("linear_classifier", LinearSVC())])
+        pipeline = Pipeline(
+            steps=[
+                (
+                    "kernel_approximator",
+                    PolynomialCountSketch(n_components=n_components, degree=4),
+                ),
+                ("linear_classifier", LinearSVC()),
+            ]
+        )
 
         start = time.time()
         pipeline.fit(X_train, y_train)
@@ -119,10 +123,13 @@
     ps_lsvm_score /= n_runs
 
     results[f"LSVM + PS({n_components})"] = {
-        "time": ps_lsvm_time, "score": ps_lsvm_score
+        "time": ps_lsvm_time,
+        "score": ps_lsvm_score,
     }
-    print(f"Linear SVM score on {n_components} PolynomialCountSketch " +
-          f"features: {ps_lsvm_score:.2f}%")
+    print(
+        f"Linear SVM score on {n_components} PolynomialCountSketch "
+        + f"features: {ps_lsvm_score:.2f}%"
+    )
 
 # %%
 # Train a kernelized SVM to see how well :class:`PolynomialCountSketch`
@@ -132,7 +139,7 @@
 
 from sklearn.svm import SVC
 
-ksvm = SVC(C=500., kernel="poly", degree=4, coef0=0, gamma=1.)
+ksvm = SVC(C=500.0, kernel="poly", degree=4, coef0=0, gamma=1.0)
 
 start = time.time()
 ksvm.fit(X_train, y_train)
@@ -151,23 +158,59 @@
 N_COMPONENTS = [250, 500, 1000, 2000]
 
 fig, ax = plt.subplots(figsize=(7, 7))
-ax.scatter([results["LSVM"]["time"], ], [results["LSVM"]["score"], ],
-           label="Linear SVM", c="green", marker="^")
-
-ax.scatter([results["LSVM + PS(250)"]["time"], ],
-           [results["LSVM + PS(250)"]["score"], ],
-           label="Linear SVM + PolynomialCountSketch", c="blue")
+ax.scatter(
+    [
+        results["LSVM"]["time"],
+    ],
+    [
+        results["LSVM"]["score"],
+    ],
+    label="Linear SVM",
+    c="green",
+    marker="^",
+)
+
+ax.scatter(
+    [
+        results["LSVM + PS(250)"]["time"],
+    ],
+    [
+        results["LSVM + PS(250)"]["score"],
+    ],
+    label="Linear SVM + PolynomialCountSketch",
+    c="blue",
+)
 for n_components in N_COMPONENTS:
-    ax.scatter([results[f"LSVM + PS({n_components})"]["time"], ],
-               [results[f"LSVM + PS({n_components})"]["score"], ],
-               c="blue")
-    ax.annotate(f"n_comp.={n_components}",
-                (results[f"LSVM + PS({n_components})"]["time"],
-                 results[f"LSVM + PS({n_components})"]["score"]),
-                xytext=(-30, 10), textcoords="offset pixels")
-
-ax.scatter([results["KSVM"]["time"], ], [results["KSVM"]["score"], ],
-           label="Kernel SVM", c="red", marker="x")
+    ax.scatter(
+        [
+            results[f"LSVM + PS({n_components})"]["time"],
+        ],
+        [
+            results[f"LSVM + PS({n_components})"]["score"],
+        ],
+        c="blue",
+    )
+    ax.annotate(
+        f"n_comp.={n_components}",
+        (
+            results[f"LSVM + PS({n_components})"]["time"],
+            results[f"LSVM + PS({n_components})"]["score"],
+        ),
+        xytext=(-30, 10),
+        textcoords="offset pixels",
+    )
+
+ax.scatter(
+    [
+        results["KSVM"]["time"],
+    ],
+    [
+        results["KSVM"]["score"],
+    ],
+    label="Kernel SVM",
+    c="red",
+    marker="x",
+)
 
 ax.set_xlabel("Training time (s)")
 ax.set_ylabel("Accuracy (%)")
diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py
index 177bd8ce24ad1..58529fe37a2cc 100644
--- a/examples/linear_model/plot_ard.py
+++ b/examples/linear_model/plot_ard.py
@@ -39,15 +39,15 @@
 # Create Gaussian data
 X = np.random.randn(n_samples, n_features)
 # Create weights with a precision lambda_ of 4.
-lambda_ = 4.
+lambda_ = 4.0
 w = np.zeros(n_features)
 # Only keep 10 weights of interest
 relevant_features = np.random.randint(0, n_features, 10)
 for i in relevant_features:
-    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
+    w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_))
 # Create noise with a precision alpha of 50.
-alpha_ = 50.
-noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
+alpha_ = 50.0
+noise = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(alpha_), size=n_samples)
 # Create the target
 y = np.dot(X, w) + noise
 
@@ -64,27 +64,32 @@
 # weights, and predictions with standard deviations
 plt.figure(figsize=(6, 5))
 plt.title("Weights of the model")
-plt.plot(clf.coef_, color='darkblue', linestyle='-', linewidth=2,
-         label="ARD estimate")
-plt.plot(ols.coef_, color='yellowgreen', linestyle=':', linewidth=2,
-         label="OLS estimate")
-plt.plot(w, color='orange', linestyle='-', linewidth=2, label="Ground truth")
+plt.plot(clf.coef_, color="darkblue", linestyle="-", linewidth=2, label="ARD estimate")
+plt.plot(
+    ols.coef_, color="yellowgreen", linestyle=":", linewidth=2, label="OLS estimate"
+)
+plt.plot(w, color="orange", linestyle="-", linewidth=2, label="Ground truth")
 plt.xlabel("Features")
 plt.ylabel("Values of the weights")
 plt.legend(loc=1)
 
 plt.figure(figsize=(6, 5))
 plt.title("Histogram of the weights")
-plt.hist(clf.coef_, bins=n_features, color='navy', log=True)
-plt.scatter(clf.coef_[relevant_features], np.full(len(relevant_features), 5.),
-            color='gold', marker='o', label="Relevant features")
+plt.hist(clf.coef_, bins=n_features, color="navy", log=True)
+plt.scatter(
+    clf.coef_[relevant_features],
+    np.full(len(relevant_features), 5.0),
+    color="gold",
+    marker="o",
+    label="Relevant features",
+)
 plt.ylabel("Features")
 plt.xlabel("Values of the weights")
 plt.legend(loc=1)
 
 plt.figure(figsize=(6, 5))
 plt.title("Marginal log-likelihood")
-plt.plot(clf.scores_, color='navy', linewidth=2)
+plt.plot(clf.scores_, color="navy", linewidth=2)
 plt.ylabel("Score")
 plt.xlabel("Iterations")
 
@@ -106,10 +111,8 @@ def f(x, noise_amount):
 y_plot = f(X_plot, noise_amount=0)
 y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
 plt.figure(figsize=(6, 5))
-plt.errorbar(X_plot, y_mean, y_std, color='navy',
-             label="Polynomial ARD", linewidth=2)
-plt.plot(X_plot, y_plot, color='gold', linewidth=2,
-         label="Ground Truth")
+plt.errorbar(X_plot, y_mean, y_std, color="navy", label="Polynomial ARD", linewidth=2)
+plt.plot(X_plot, y_plot, color="gold", linewidth=2, label="Ground Truth")
 plt.ylabel("Output y")
 plt.xlabel("Feature X")
 plt.legend(loc="lower left")
diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py
index 43925e72c591c..a9593a5b41061 100644
--- a/examples/linear_model/plot_bayesian_ridge.py
+++ b/examples/linear_model/plot_bayesian_ridge.py
@@ -36,15 +36,15 @@
 n_samples, n_features = 100, 100
 X = np.random.randn(n_samples, n_features)  # Create Gaussian data
 # Create weights with a precision lambda_ of 4.
-lambda_ = 4.
+lambda_ = 4.0
 w = np.zeros(n_features)
 # Only keep 10 weights of interest
 relevant_features = np.random.randint(0, n_features, 10)
 for i in relevant_features:
-    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
+    w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_))
 # Create noise with a precision alpha of 50.
-alpha_ = 50.
-noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
+alpha_ = 50.0
+noise = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(alpha_), size=n_samples)
 # Create the target
 y = np.dot(X, w) + noise
 
@@ -62,27 +62,29 @@
 lw = 2
 plt.figure(figsize=(6, 5))
 plt.title("Weights of the model")
-plt.plot(clf.coef_, color='lightgreen', linewidth=lw,
-         label="Bayesian Ridge estimate")
-plt.plot(w, color='gold', linewidth=lw, label="Ground truth")
-plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
+plt.plot(clf.coef_, color="lightgreen", linewidth=lw, label="Bayesian Ridge estimate")
+plt.plot(w, color="gold", linewidth=lw, label="Ground truth")
+plt.plot(ols.coef_, color="navy", linestyle="--", label="OLS estimate")
 plt.xlabel("Features")
 plt.ylabel("Values of the weights")
 plt.legend(loc="best", prop=dict(size=12))
 
 plt.figure(figsize=(6, 5))
 plt.title("Histogram of the weights")
-plt.hist(clf.coef_, bins=n_features, color='gold', log=True,
-         edgecolor='black')
-plt.scatter(clf.coef_[relevant_features], np.full(len(relevant_features), 5.),
-            color='navy', label="Relevant features")
+plt.hist(clf.coef_, bins=n_features, color="gold", log=True, edgecolor="black")
+plt.scatter(
+    clf.coef_[relevant_features],
+    np.full(len(relevant_features), 5.0),
+    color="navy",
+    label="Relevant features",
+)
 plt.ylabel("Features")
 plt.xlabel("Values of the weights")
 plt.legend(loc="upper left")
 
 plt.figure(figsize=(6, 5))
 plt.title("Marginal log-likelihood")
-plt.plot(clf.scores_, color='navy', linewidth=lw)
+plt.plot(clf.scores_, color="navy", linewidth=lw)
 plt.ylabel("Score")
 plt.xlabel("Iterations")
 
@@ -104,10 +106,15 @@ def f(x, noise_amount):
 y_plot = f(X_plot, noise_amount=0)
 y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
 plt.figure(figsize=(6, 5))
-plt.errorbar(X_plot, y_mean, y_std, color='navy',
-             label="Polynomial Bayesian Ridge Regression", linewidth=lw)
-plt.plot(X_plot, y_plot, color='gold', linewidth=lw,
-         label="Ground Truth")
+plt.errorbar(
+    X_plot,
+    y_mean,
+    y_std,
+    color="navy",
+    label="Polynomial Bayesian Ridge Regression",
+    linewidth=lw,
+)
+plt.plot(X_plot, y_plot, color="gold", linewidth=lw, label="Ground Truth")
 plt.ylabel("Output y")
 plt.xlabel("Feature X")
 plt.legend(loc="lower left")
diff --git a/examples/linear_model/plot_bayesian_ridge_curvefit.py b/examples/linear_model/plot_bayesian_ridge_curvefit.py
index 2f4a36d47d9a6..fc4f9e6cc384c 100644
--- a/examples/linear_model/plot_bayesian_ridge_curvefit.py
+++ b/examples/linear_model/plot_bayesian_ridge_curvefit.py
@@ -34,16 +34,17 @@
 from sklearn.linear_model import BayesianRidge
 
 
-def func(x): return np.sin(2*np.pi*x)
+def func(x):
+    return np.sin(2 * np.pi * x)
 
 
 # #############################################################################
 # Generate sinusoidal data with noise
 size = 25
 rng = np.random.RandomState(1234)
-x_train = rng.uniform(0., 1., size)
+x_train = rng.uniform(0.0, 1.0, size)
 y_train = func(x_train) + rng.normal(scale=0.1, size=size)
-x_test = np.linspace(0., 1., 100)
+x_test = np.linspace(0.0, 1.0, 100)
 
 
 # #############################################################################
@@ -59,9 +60,9 @@ def func(x): return np.sin(2*np.pi*x)
 for i, ax in enumerate(axes):
     # Bayesian ridge regression with different initial value pairs
     if i == 0:
-        init = [1 / np.var(y_train), 1.]  # Default values
+        init = [1 / np.var(y_train), 1.0]  # Default values
     elif i == 1:
-        init = [1., 1e-3]
+        init = [1.0, 1e-3]
         reg.set_params(alpha_init=init[0], lambda_init=init[1])
     reg.fit(X_train, y_train)
     ymean, ystd = reg.predict(X_test, return_std=True)
@@ -69,17 +70,18 @@ def func(x): return np.sin(2*np.pi*x)
     ax.plot(x_test, func(x_test), color="blue", label="sin($2\\pi x$)")
     ax.scatter(x_train, y_train, s=50, alpha=0.5, label="observation")
     ax.plot(x_test, ymean, color="red", label="predict mean")
-    ax.fill_between(x_test, ymean-ystd, ymean+ystd,
-                    color="pink", alpha=0.5, label="predict std")
+    ax.fill_between(
+        x_test, ymean - ystd, ymean + ystd, color="pink", alpha=0.5, label="predict std"
+    )
     ax.set_ylim(-1.3, 1.3)
     ax.legend()
-    title = "$\\alpha$_init$={:.2f},\\ \\lambda$_init$={}$".format(
-            init[0], init[1])
+    title = "$\\alpha$_init$={:.2f},\\ \\lambda$_init$={}$".format(init[0], init[1])
     if i == 0:
         title += " (Default)"
     ax.set_title(title, fontsize=12)
     text = "$\\alpha={:.1f}$\n$\\lambda={:.3f}$\n$L={:.1f}$".format(
-           reg.alpha_, reg.lambda_, reg.scores_[-1])
+        reg.alpha_, reg.lambda_, reg.scores_[-1]
+    )
     ax.text(0.05, -1.0, text, fontsize=12)
 
 plt.tight_layout()
diff --git a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py
index 852ea545c5fd6..977759bc66b4c 100644
--- a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py
+++ b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py
@@ -38,7 +38,7 @@
 # weights, we must first center the design matrix,  and rescale it by the
 # normalized weights prior to computing the gram matrix.
 X_offset = np.average(X, axis=0, weights=normalized_weights)
-X_centered = (X - np.average(X, axis=0, weights=normalized_weights))
+X_centered = X - np.average(X, axis=0, weights=normalized_weights)
 X_scaled = X_centered * np.sqrt(normalized_weights)[:, np.newaxis]
 gram = np.dot(X_scaled.T, X_scaled)
 
diff --git a/examples/linear_model/plot_huber_vs_ridge.py b/examples/linear_model/plot_huber_vs_ridge.py
index e5f71cc861d88..d3137f3ac26cb 100644
--- a/examples/linear_model/plot_huber_vs_ridge.py
+++ b/examples/linear_model/plot_huber_vs_ridge.py
@@ -25,22 +25,23 @@
 
 # Generate toy data.
 rng = np.random.RandomState(0)
-X, y = make_regression(n_samples=20, n_features=1, random_state=0, noise=4.0,
-                       bias=100.0)
+X, y = make_regression(
+    n_samples=20, n_features=1, random_state=0, noise=4.0, bias=100.0
+)
 
 # Add four strong outliers to the dataset.
 X_outliers = rng.normal(0, 0.5, size=(4, 1))
 y_outliers = rng.normal(0, 2.0, size=4)
-X_outliers[:2, :] += X.max() + X.mean() / 4.
-X_outliers[2:, :] += X.min() - X.mean() / 4.
-y_outliers[:2] += y.min() - y.mean() / 4.
-y_outliers[2:] += y.max() + y.mean() / 4.
+X_outliers[:2, :] += X.max() + X.mean() / 4.0
+X_outliers[2:, :] += X.min() - X.mean() / 4.0
+y_outliers[:2] += y.min() - y.mean() / 4.0
+y_outliers[2:] += y.max() + y.mean() / 4.0
 X = np.vstack((X, X_outliers))
 y = np.concatenate((y, y_outliers))
-plt.plot(X, y, 'b.')
+plt.plot(X, y, "b.")
 
 # Fit the huber regressor over a series of epsilon values.
-colors = ['r-', 'b-', 'y-', 'm-']
+colors = ["r-", "b-", "y-", "m-"]
 
 x = np.linspace(X.min(), X.max(), 7)
 epsilon_values = [1, 1.5, 1.75, 1.9]
@@ -55,7 +56,7 @@
 ridge.fit(X, y)
 coef_ridge = ridge.coef_
 coef_ = ridge.coef_ * x + ridge.intercept_
-plt.plot(x, coef_, 'g-', label="ridge regression")
+plt.plot(x, coef_, "g-", label="ridge regression")
 
 plt.title("Comparison of HuberRegressor vs Ridge")
 plt.xlabel("X")
diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py
index b04094a647e90..63853198f2adc 100644
--- a/examples/linear_model/plot_iris_logistic.py
+++ b/examples/linear_model/plot_iris_logistic.py
@@ -34,9 +34,9 @@
 
 # Plot the decision boundary. For that, we will assign a color to each
 # point in the mesh [x_min, x_max]x[y_min, y_max].
-x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
-y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
-h = .02  # step size in the mesh
+x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
+y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
+h = 0.02  # step size in the mesh
 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])
 
@@ -46,9 +46,9 @@
 plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
 
 # Plot also the training points
-plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
-plt.xlabel('Sepal length')
-plt.ylabel('Sepal width')
+plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors="k", cmap=plt.cm.Paired)
+plt.xlabel("Sepal length")
+plt.ylabel("Sepal width")
 
 plt.xlim(xx.min(), xx.max())
 plt.ylim(yy.min(), yy.max())
diff --git a/examples/linear_model/plot_lasso_and_elasticnet.py b/examples/linear_model/plot_lasso_and_elasticnet.py
index 8934ddc76395c..6a4b481048d8b 100644
--- a/examples/linear_model/plot_lasso_and_elasticnet.py
+++ b/examples/linear_model/plot_lasso_and_elasticnet.py
@@ -33,8 +33,8 @@
 
 # Split data in train set and test set
 n_samples = X.shape[0]
-X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
-X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
+X_train, y_train = X[: n_samples // 2], y[: n_samples // 2]
+X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :]
 
 # #############################################################################
 # Lasso
@@ -59,18 +59,32 @@
 print(enet)
 print("r^2 on test data : %f" % r2_score_enet)
 
-m, s, _ = plt.stem(np.where(enet.coef_)[0], enet.coef_[enet.coef_ != 0],
-                   markerfmt='x', label='Elastic net coefficients',
-                   use_line_collection=True)
+m, s, _ = plt.stem(
+    np.where(enet.coef_)[0],
+    enet.coef_[enet.coef_ != 0],
+    markerfmt="x",
+    label="Elastic net coefficients",
+    use_line_collection=True,
+)
 plt.setp([m, s], color="#2ca02c")
-m, s, _ = plt.stem(np.where(lasso.coef_)[0], lasso.coef_[lasso.coef_ != 0],
-                   markerfmt='x', label='Lasso coefficients',
-                   use_line_collection=True)
-plt.setp([m, s], color='#ff7f0e')
-plt.stem(np.where(coef)[0], coef[coef != 0], label='true coefficients',
-         markerfmt='bx', use_line_collection=True)
-
-plt.legend(loc='best')
-plt.title("Lasso $R^2$: %.3f, Elastic Net $R^2$: %.3f"
-          % (r2_score_lasso, r2_score_enet))
+m, s, _ = plt.stem(
+    np.where(lasso.coef_)[0],
+    lasso.coef_[lasso.coef_ != 0],
+    markerfmt="x",
+    label="Lasso coefficients",
+    use_line_collection=True,
+)
+plt.setp([m, s], color="#ff7f0e")
+plt.stem(
+    np.where(coef)[0],
+    coef[coef != 0],
+    label="true coefficients",
+    markerfmt="bx",
+    use_line_collection=True,
+)
+
+plt.legend(loc="best")
+plt.title(
+    "Lasso $R^2$: %.3f, Elastic Net $R^2$: %.3f" % (r2_score_lasso, r2_score_enet)
+)
 plt.show()
diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
index ba8f4f7bb4089..a7e8c22042459 100644
--- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
+++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
@@ -36,8 +36,10 @@
 dense_lasso.fit(X, y)
 print("Dense Lasso done in %fs" % (time() - t0))
 
-print("Distance between coefficients : %s"
-      % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_))
+print(
+    "Distance between coefficients : %s"
+    % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
+)
 
 # #############################################################################
 # The two Lasso implementations on Sparse data
@@ -62,5 +64,7 @@
 dense_lasso.fit(Xs.toarray(), y)
 print("Dense Lasso done in %fs" % (time() - t0))
 
-print("Distance between coefficients : %s"
-      % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_))
+print(
+    "Distance between coefficients : %s"
+    % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
+)
diff --git a/examples/linear_model/plot_lasso_lars.py b/examples/linear_model/plot_lasso_lars.py
index 06d4c94bbed70..2ebeb46037072 100644
--- a/examples/linear_model/plot_lasso_lars.py
+++ b/examples/linear_model/plot_lasso_lars.py
@@ -25,16 +25,16 @@
 X, y = datasets.load_diabetes(return_X_y=True)
 
 print("Computing regularization path using the LARS ...")
-_, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)
+_, _, coefs = linear_model.lars_path(X, y, method="lasso", verbose=True)
 
 xx = np.sum(np.abs(coefs.T), axis=1)
 xx /= xx[-1]
 
 plt.plot(xx, coefs.T)
 ymin, ymax = plt.ylim()
-plt.vlines(xx, ymin, ymax, linestyle='dashed')
-plt.xlabel('|coef| / max|coef|')
-plt.ylabel('Coefficients')
-plt.title('LASSO Path')
-plt.axis('tight')
+plt.vlines(xx, ymin, ymax, linestyle="dashed")
+plt.xlabel("|coef| / max|coef|")
+plt.ylabel("Coefficients")
+plt.title("LASSO Path")
+plt.axis("tight")
 plt.show()
diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py
index 845a86c3bda4a..2bbc4a9e8d921 100644
--- a/examples/linear_model/plot_lasso_model_selection.py
+++ b/examples/linear_model/plot_lasso_model_selection.py
@@ -68,33 +68,42 @@
 # #############################################################################
 # LassoLarsIC: least angle regression with BIC/AIC criterion
 
-model_bic = LassoLarsIC(criterion='bic', normalize=False)
+model_bic = LassoLarsIC(criterion="bic", normalize=False)
 t1 = time.time()
 model_bic.fit(X, y)
 t_bic = time.time() - t1
 alpha_bic_ = model_bic.alpha_
 
-model_aic = LassoLarsIC(criterion='aic', normalize=False)
+model_aic = LassoLarsIC(criterion="aic", normalize=False)
 model_aic.fit(X, y)
 alpha_aic_ = model_aic.alpha_
 
 
 def plot_ic_criterion(model, name, color):
     criterion_ = model.criterion_
-    plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color,
-                 linewidth=3, label='%s criterion' % name)
-    plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3,
-                label='alpha: %s estimate' % name)
-    plt.xlabel(r'$\alpha$')
-    plt.ylabel('criterion')
+    plt.semilogx(
+        model.alphas_ + EPSILON,
+        criterion_,
+        "--",
+        color=color,
+        linewidth=3,
+        label="%s criterion" % name,
+    )
+    plt.axvline(
+        model.alpha_ + EPSILON,
+        color=color,
+        linewidth=3,
+        label="alpha: %s estimate" % name,
+    )
+    plt.xlabel(r"$\alpha$")
+    plt.ylabel("criterion")
 
 
 plt.figure()
-plot_ic_criterion(model_aic, 'AIC', 'b')
-plot_ic_criterion(model_bic, 'BIC', 'r')
+plot_ic_criterion(model_aic, "AIC", "b")
+plot_ic_criterion(model_bic, "BIC", "r")
 plt.legend()
-plt.title('Information-criterion for model selection (training time %.3fs)'
-          % t_bic)
+plt.title("Information-criterion for model selection (training time %.3fs)" % t_bic)
 
 # #############################################################################
 # LassoCV: coordinate descent
@@ -108,19 +117,27 @@ def plot_ic_criterion(model, name, color):
 # Display results
 plt.figure()
 ymin, ymax = 2300, 3800
-plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':')
-plt.plot(model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k',
-         label='Average across the folds', linewidth=2)
-plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k',
-            label='alpha: CV estimate')
+plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ":")
+plt.plot(
+    model.alphas_ + EPSILON,
+    model.mse_path_.mean(axis=-1),
+    "k",
+    label="Average across the folds",
+    linewidth=2,
+)
+plt.axvline(
+    model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate"
+)
 
 plt.legend()
 
-plt.xlabel(r'$\alpha$')
-plt.ylabel('Mean square error')
-plt.title('Mean square error on each fold: coordinate descent '
-          '(train time: %.2fs)' % t_lasso_cv)
-plt.axis('tight')
+plt.xlabel(r"$\alpha$")
+plt.ylabel("Mean square error")
+plt.title(
+    "Mean square error on each fold: coordinate descent (train time: %.2fs)"
+    % t_lasso_cv
+)
+plt.axis("tight")
 plt.ylim(ymin, ymax)
 
 # #############################################################################
@@ -134,18 +151,21 @@ def plot_ic_criterion(model, name, color):
 
 # Display results
 plt.figure()
-plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':')
-plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k',
-             label='Average across the folds', linewidth=2)
-plt.axvline(model.alpha_, linestyle='--', color='k',
-            label='alpha CV')
+plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ":")
+plt.semilogx(
+    model.cv_alphas_ + EPSILON,
+    model.mse_path_.mean(axis=-1),
+    "k",
+    label="Average across the folds",
+    linewidth=2,
+)
+plt.axvline(model.alpha_, linestyle="--", color="k", label="alpha CV")
 plt.legend()
 
-plt.xlabel(r'$\alpha$')
-plt.ylabel('Mean square error')
-plt.title('Mean square error on each fold: Lars (train time: %.2fs)'
-          % t_lasso_lars_cv)
-plt.axis('tight')
+plt.xlabel(r"$\alpha$")
+plt.ylabel("Mean square error")
+plt.title("Mean square error on each fold: Lars (train time: %.2fs)" % t_lasso_lars_cv)
+plt.axis("tight")
 plt.ylim(ymin, ymax)
 
 plt.show()
diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py
index 6b843007624a9..49806a775328f 100644
--- a/examples/linear_model/plot_logistic.py
+++ b/examples/linear_model/plot_logistic.py
@@ -31,7 +31,7 @@
 X = np.random.normal(size=n_samples)
 y = (X > 0).astype(float)
 X[X > 0] *= 4
-X += .3 * np.random.normal(size=n_samples)
+X += 0.3 * np.random.normal(size=n_samples)
 
 X = X[:, np.newaxis]
 
@@ -42,24 +42,27 @@
 # and plot the result
 plt.figure(1, figsize=(4, 3))
 plt.clf()
-plt.scatter(X.ravel(), y, color='black', zorder=20)
+plt.scatter(X.ravel(), y, color="black", zorder=20)
 X_test = np.linspace(-5, 10, 300)
 
 loss = expit(X_test * clf.coef_ + clf.intercept_).ravel()
-plt.plot(X_test, loss, color='red', linewidth=3)
+plt.plot(X_test, loss, color="red", linewidth=3)
 
 ols = linear_model.LinearRegression()
 ols.fit(X, y)
 plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1)
-plt.axhline(.5, color='.5')
+plt.axhline(0.5, color=".5")
 
-plt.ylabel('y')
-plt.xlabel('X')
+plt.ylabel("y")
+plt.xlabel("X")
 plt.xticks(range(-5, 10))
 plt.yticks([0, 0.5, 1])
-plt.ylim(-.25, 1.25)
+plt.ylim(-0.25, 1.25)
 plt.xlim(-4, 10)
-plt.legend(('Logistic Regression Model', 'Linear Regression Model'),
-           loc="lower right", fontsize='small')
+plt.legend(
+    ("Logistic Regression Model", "Linear Regression Model"),
+    loc="lower right",
+    fontsize="small",
+)
 plt.tight_layout()
 plt.show()
diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
index 3e518e8ec1e7a..bd42966604075 100644
--- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py
+++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
@@ -42,10 +42,11 @@
 # Set regularization parameter
 for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)):
     # turn down tolerance for short training time
-    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='saga')
-    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='saga')
-    clf_en_LR = LogisticRegression(C=C, penalty='elasticnet', solver='saga',
-                                   l1_ratio=l1_ratio, tol=0.01)
+    clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga")
+    clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga")
+    clf_en_LR = LogisticRegression(
+        C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01
+    )
     clf_l1_LR.fit(X, y)
     clf_l2_LR.fit(X, y)
     clf_en_LR.fit(X, y)
@@ -63,15 +64,13 @@
 
     print("C=%.2f" % C)
     print("{:<40} {:.2f}%".format("Sparsity with L1 penalty:", sparsity_l1_LR))
-    print("{:<40} {:.2f}%".format("Sparsity with Elastic-Net penalty:",
-                                  sparsity_en_LR))
+    print("{:<40} {:.2f}%".format("Sparsity with Elastic-Net penalty:", sparsity_en_LR))
     print("{:<40} {:.2f}%".format("Sparsity with L2 penalty:", sparsity_l2_LR))
-    print("{:<40} {:.2f}".format("Score with L1 penalty:",
-                                 clf_l1_LR.score(X, y)))
-    print("{:<40} {:.2f}".format("Score with Elastic-Net penalty:",
-                                 clf_en_LR.score(X, y)))
-    print("{:<40} {:.2f}".format("Score with L2 penalty:",
-                                 clf_l2_LR.score(X, y)))
+    print("{:<40} {:.2f}".format("Score with L1 penalty:", clf_l1_LR.score(X, y)))
+    print(
+        "{:<40} {:.2f}".format("Score with Elastic-Net penalty:", clf_en_LR.score(X, y))
+    )
+    print("{:<40} {:.2f}".format("Score with L2 penalty:", clf_l2_LR.score(X, y)))
 
     if i == 0:
         axes_row[0].set_title("L1 penalty")
@@ -79,11 +78,16 @@
         axes_row[2].set_title("L2 penalty")
 
     for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]):
-        ax.imshow(np.abs(coefs.reshape(8, 8)), interpolation='nearest',
-                  cmap='binary', vmax=1, vmin=0)
+        ax.imshow(
+            np.abs(coefs.reshape(8, 8)),
+            interpolation="nearest",
+            cmap="binary",
+            vmax=1,
+            vmin=0,
+        )
         ax.set_xticks(())
         ax.set_yticks(())
 
-    axes_row[0].set_ylabel('C = %s' % C)
+    axes_row[0].set_ylabel("C = %s" % C)
 
 plt.show()
diff --git a/examples/linear_model/plot_logistic_multinomial.py b/examples/linear_model/plot_logistic_multinomial.py
index 518a2aeade61c..ec22777f71e8a 100644
--- a/examples/linear_model/plot_logistic_multinomial.py
+++ b/examples/linear_model/plot_logistic_multinomial.py
@@ -22,19 +22,19 @@
 transformation = [[0.4, 0.2], [-0.4, 1.2]]
 X = np.dot(X, transformation)
 
-for multi_class in ('multinomial', 'ovr'):
-    clf = LogisticRegression(solver='sag', max_iter=100, random_state=42,
-                             multi_class=multi_class).fit(X, y)
+for multi_class in ("multinomial", "ovr"):
+    clf = LogisticRegression(
+        solver="sag", max_iter=100, random_state=42, multi_class=multi_class
+    ).fit(X, y)
 
     # print the training scores
     print("training score : %.3f (%s)" % (clf.score(X, y), multi_class))
 
     # create a mesh to plot in
-    h = .02  # step size in the mesh
+    h = 0.02  # step size in the mesh
     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                         np.arange(y_min, y_max, h))
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
     # Plot the decision boundary. For that, we will assign a color to each
     # point in the mesh [x_min, x_max]x[y_min, y_max].
@@ -44,14 +44,15 @@
     plt.figure()
     plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
     plt.title("Decision surface of LogisticRegression (%s)" % multi_class)
-    plt.axis('tight')
+    plt.axis("tight")
 
     # Plot also the training points
     colors = "bry"
     for i, color in zip(clf.classes_, colors):
         idx = np.where(y == i)
-        plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired,
-                    edgecolor='black', s=20)
+        plt.scatter(
+            X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired, edgecolor="black", s=20
+        )
 
     # Plot the three one-against-all classifiers
     xmin, xmax = plt.xlim()
@@ -62,8 +63,8 @@
     def plot_hyperplane(c, color):
         def line(x0):
             return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
-        plt.plot([xmin, xmax], [line(xmin), line(xmax)],
-                 ls="--", color=color)
+
+        plt.plot([xmin, xmax], [line(xmin), line(xmax)], ls="--", color=color)
 
     for i, color in zip(clf.classes_, colors):
         plot_hyperplane(i, color)
diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py
index 7aead065f3445..eacb74f22cf75 100644
--- a/examples/linear_model/plot_logistic_path.py
+++ b/examples/linear_model/plot_logistic_path.py
@@ -50,15 +50,19 @@
 # #############################################################################
 # Demo path functions
 
-cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 16)
+cs = l1_min_c(X, y, loss="log") * np.logspace(0, 7, 16)
 
 
 print("Computing regularization path ...")
 start = time()
-clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear',
-                                      tol=1e-6, max_iter=int(1e6),
-                                      warm_start=True,
-                                      intercept_scaling=10000.)
+clf = linear_model.LogisticRegression(
+    penalty="l1",
+    solver="liblinear",
+    tol=1e-6,
+    max_iter=int(1e6),
+    warm_start=True,
+    intercept_scaling=10000.0,
+)
 coefs_ = []
 for c in cs:
     clf.set_params(C=c)
@@ -67,10 +71,10 @@
 print("This took %0.3fs" % (time() - start))
 
 coefs_ = np.array(coefs_)
-plt.plot(np.log10(cs), coefs_, marker='o')
+plt.plot(np.log10(cs), coefs_, marker="o")
 ymin, ymax = plt.ylim()
-plt.xlabel('log(C)')
-plt.ylabel('Coefficients')
-plt.title('Logistic Regression Path')
-plt.axis('tight')
+plt.xlabel("log(C)")
+plt.ylabel("Coefficients")
+plt.title("Logistic Regression Path")
+plt.axis("tight")
 plt.show()
diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py
index c7a9536383bc2..3802971ccf345 100644
--- a/examples/linear_model/plot_multi_task_lasso_support.py
+++ b/examples/linear_model/plot_multi_task_lasso_support.py
@@ -31,39 +31,43 @@
 coef = np.zeros((n_tasks, n_features))
 times = np.linspace(0, 2 * np.pi, n_tasks)
 for k in range(n_relevant_features):
-    coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1))
+    coef[:, k] = np.sin((1.0 + rng.randn(1)) * times + 3 * rng.randn(1))
 
 X = rng.randn(n_samples, n_features)
 Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks)
 
 coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
-coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_
+coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.0).fit(X, Y).coef_
 
 # #############################################################################
 # Plot support and time series
 fig = plt.figure(figsize=(8, 5))
 plt.subplot(1, 2, 1)
 plt.spy(coef_lasso_)
-plt.xlabel('Feature')
-plt.ylabel('Time (or Task)')
-plt.text(10, 5, 'Lasso')
+plt.xlabel("Feature")
+plt.ylabel("Time (or Task)")
+plt.text(10, 5, "Lasso")
 plt.subplot(1, 2, 2)
 plt.spy(coef_multi_task_lasso_)
-plt.xlabel('Feature')
-plt.ylabel('Time (or Task)')
-plt.text(10, 5, 'MultiTaskLasso')
-fig.suptitle('Coefficient non-zero location')
+plt.xlabel("Feature")
+plt.ylabel("Time (or Task)")
+plt.text(10, 5, "MultiTaskLasso")
+fig.suptitle("Coefficient non-zero location")
 
 feature_to_plot = 0
 plt.figure()
 lw = 2
-plt.plot(coef[:, feature_to_plot], color='seagreen', linewidth=lw,
-         label='Ground truth')
-plt.plot(coef_lasso_[:, feature_to_plot], color='cornflowerblue', linewidth=lw,
-         label='Lasso')
-plt.plot(coef_multi_task_lasso_[:, feature_to_plot], color='gold', linewidth=lw,
-         label='MultiTaskLasso')
-plt.legend(loc='upper center')
-plt.axis('tight')
+plt.plot(coef[:, feature_to_plot], color="seagreen", linewidth=lw, label="Ground truth")
+plt.plot(
+    coef_lasso_[:, feature_to_plot], color="cornflowerblue", linewidth=lw, label="Lasso"
+)
+plt.plot(
+    coef_multi_task_lasso_[:, feature_to_plot],
+    color="gold",
+    linewidth=lw,
+    label="MultiTaskLasso",
+)
+plt.legend(loc="upper center")
+plt.axis("tight")
 plt.ylim([-1.1, 1.1])
 plt.show()
diff --git a/examples/linear_model/plot_nnls.py b/examples/linear_model/plot_nnls.py
index 56f357c4214a6..02a6dade30cae 100644
--- a/examples/linear_model/plot_nnls.py
+++ b/examples/linear_model/plot_nnls.py
@@ -24,7 +24,7 @@
 y = np.dot(X, true_coef)
 
 # Add some noise
-y += 5 * np.random.normal(size=(n_samples, ))
+y += 5 * np.random.normal(size=(n_samples,))
 
 # %%
 # Split the data in train set and test set
@@ -62,6 +62,6 @@
 low_y, high_y = ax.get_ylim()
 low = max(low_x, low_y)
 high = min(high_x, high_y)
-ax.plot([low, high], [low, high], ls="--", c=".3", alpha=.5)
+ax.plot([low, high], [low, high], ls="--", c=".3", alpha=0.5)
 ax.set_xlabel("OLS regression coefficients", fontweight="bold")
 ax.set_ylabel("NNLS regression coefficients", fontweight="bold")
diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py
index 5f8c0079c3582..567596e82a8f1 100644
--- a/examples/linear_model/plot_ols.py
+++ b/examples/linear_model/plot_ols.py
@@ -51,17 +51,15 @@
 diabetes_y_pred = regr.predict(diabetes_X_test)
 
 # The coefficients
-print('Coefficients: \n', regr.coef_)
+print("Coefficients: \n", regr.coef_)
 # The mean squared error
-print('Mean squared error: %.2f'
-      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
+print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, diabetes_y_pred))
 # The coefficient of determination: 1 is perfect prediction
-print('Coefficient of determination: %.2f'
-      % r2_score(diabetes_y_test, diabetes_y_pred))
+print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))
 
 # Plot outputs
-plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
-plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)
+plt.scatter(diabetes_X_test, diabetes_y_test, color="black")
+plt.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3)
 
 plt.xticks(())
 plt.yticks(())
diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py
index aa34b3b680202..34b5a76954730 100644
--- a/examples/linear_model/plot_ols_3d.py
+++ b/examples/linear_model/plot_ols_3d.py
@@ -44,16 +44,18 @@ def plot_figs(fig_num, elev, azim, X_train, clf):
     plt.clf()
     ax = Axes3D(fig, elev=elev, azim=azim)
 
-    ax.scatter(X_train[:, 0], X_train[:, 1], y_train, c='k', marker='+')
-    ax.plot_surface(np.array([[-.1, -.1], [.15, .15]]),
-                    np.array([[-.1, .15], [-.1, .15]]),
-                    clf.predict(np.array([[-.1, -.1, .15, .15],
-                                          [-.1, .15, -.1, .15]]).T
-                                ).reshape((2, 2)),
-                    alpha=.5)
-    ax.set_xlabel('X_1')
-    ax.set_ylabel('X_2')
-    ax.set_zlabel('Y')
+    ax.scatter(X_train[:, 0], X_train[:, 1], y_train, c="k", marker="+")
+    ax.plot_surface(
+        np.array([[-0.1, -0.1], [0.15, 0.15]]),
+        np.array([[-0.1, 0.15], [-0.1, 0.15]]),
+        clf.predict(
+            np.array([[-0.1, -0.1, 0.15, 0.15], [-0.1, 0.15, -0.1, 0.15]]).T
+        ).reshape((2, 2)),
+        alpha=0.5,
+    )
+    ax.set_xlabel("X_1")
+    ax.set_ylabel("X_2")
+    ax.set_zlabel("Y")
     ax.w_xaxis.set_ticklabels([])
     ax.w_yaxis.set_ticklabels([])
     ax.w_zaxis.set_ticklabels([])
@@ -64,11 +66,11 @@ def plot_figs(fig_num, elev, azim, X_train, clf):
 azim = -110
 plot_figs(1, elev, azim, X_train, ols)
 
-elev = -.5
+elev = -0.5
 azim = 0
 plot_figs(2, elev, azim, X_train, ols)
 
-elev = -.5
+elev = -0.5
 azim = 90
 plot_figs(3, elev, azim, X_train, ols)
 
diff --git a/examples/linear_model/plot_ols_ridge_variance.py b/examples/linear_model/plot_ols_ridge_variance.py
index ba5f65575f927..e94979077a230 100644
--- a/examples/linear_model/plot_ols_ridge_variance.py
+++ b/examples/linear_model/plot_ols_ridge_variance.py
@@ -33,34 +33,35 @@
 
 from sklearn import linear_model
 
-X_train = np.c_[.5, 1].T
-y_train = [.5, 1]
+X_train = np.c_[0.5, 1].T
+y_train = [0.5, 1]
 X_test = np.c_[0, 2].T
 
 np.random.seed(0)
 
-classifiers = dict(ols=linear_model.LinearRegression(),
-                   ridge=linear_model.Ridge(alpha=.1))
+classifiers = dict(
+    ols=linear_model.LinearRegression(), ridge=linear_model.Ridge(alpha=0.1)
+)
 
 for name, clf in classifiers.items():
     fig, ax = plt.subplots(figsize=(4, 3))
 
     for _ in range(6):
-        this_X = .1 * np.random.normal(size=(2, 1)) + X_train
+        this_X = 0.1 * np.random.normal(size=(2, 1)) + X_train
         clf.fit(this_X, y_train)
 
-        ax.plot(X_test, clf.predict(X_test), color='gray')
-        ax.scatter(this_X, y_train, s=3, c='gray', marker='o', zorder=10)
+        ax.plot(X_test, clf.predict(X_test), color="gray")
+        ax.scatter(this_X, y_train, s=3, c="gray", marker="o", zorder=10)
 
     clf.fit(X_train, y_train)
-    ax.plot(X_test, clf.predict(X_test), linewidth=2, color='blue')
-    ax.scatter(X_train, y_train, s=30, c='red', marker='+', zorder=10)
+    ax.plot(X_test, clf.predict(X_test), linewidth=2, color="blue")
+    ax.scatter(X_train, y_train, s=30, c="red", marker="+", zorder=10)
 
     ax.set_title(name)
     ax.set_xlim(0, 2)
     ax.set_ylim((0, 1.6))
-    ax.set_xlabel('X')
-    ax.set_ylabel('y')
+    ax.set_xlabel("X")
+    ax.set_ylabel("y")
 
     fig.tight_layout()
 
diff --git a/examples/linear_model/plot_omp.py b/examples/linear_model/plot_omp.py
index 6052942fe9f48..2431ca10f5fb7 100644
--- a/examples/linear_model/plot_omp.py
+++ b/examples/linear_model/plot_omp.py
@@ -22,13 +22,15 @@
 # y = Xw
 # |x|_0 = n_nonzero_coefs
 
-y, X, w = make_sparse_coded_signal(n_samples=1,
-                                   n_components=n_components,
-                                   n_features=n_features,
-                                   n_nonzero_coefs=n_nonzero_coefs,
-                                   random_state=0)
+y, X, w = make_sparse_coded_signal(
+    n_samples=1,
+    n_components=n_components,
+    n_features=n_features,
+    n_nonzero_coefs=n_nonzero_coefs,
+    random_state=0,
+)
 
-idx, = w.nonzero()
+(idx,) = w.nonzero()
 
 # distort the clean signal
 y_noisy = y + 0.05 * np.random.randn(len(y))
@@ -41,11 +43,10 @@
 plt.stem(idx, w[idx], use_line_collection=True)
 
 # plot the noise-free reconstruction
-omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs,
-                                normalize=False)
+omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs, normalize=False)
 omp.fit(X, y)
 coef = omp.coef_
-idx_r, = coef.nonzero()
+(idx_r,) = coef.nonzero()
 plt.subplot(4, 1, 2)
 plt.xlim(0, 512)
 plt.title("Recovered signal from noise-free measurements")
@@ -54,7 +55,7 @@
 # plot the noisy reconstruction
 omp.fit(X, y_noisy)
 coef = omp.coef_
-idx_r, = coef.nonzero()
+(idx_r,) = coef.nonzero()
 plt.subplot(4, 1, 3)
 plt.xlim(0, 512)
 plt.title("Recovered signal from noisy measurements")
@@ -64,13 +65,12 @@
 omp_cv = OrthogonalMatchingPursuitCV(normalize=False)
 omp_cv.fit(X, y_noisy)
 coef = omp_cv.coef_
-idx_r, = coef.nonzero()
+(idx_r,) = coef.nonzero()
 plt.subplot(4, 1, 4)
 plt.xlim(0, 512)
 plt.title("Recovered signal from noisy measurements with CV")
 plt.stem(idx_r, coef[idx_r], use_line_collection=True)
 
 plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38)
-plt.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit',
-             fontsize=16)
+plt.suptitle("Sparse signal recovery with Orthogonal Matching Pursuit", fontsize=16)
 plt.show()
diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 570baee9e1f67..7c9abd261937d 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -70,12 +70,15 @@
 
 df["Frequency"] = df["ClaimNb"] / df["Exposure"]
 
-print("Average Frequency = {}"
-      .format(np.average(df["Frequency"], weights=df["Exposure"])))
+print(
+    "Average Frequency = {}".format(np.average(df["Frequency"], weights=df["Exposure"]))
+)
 
-print("Fraction of exposure with zero claims = {0:.1%}"
-      .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() /
-              df["Exposure"].sum()))
+print(
+    "Fraction of exposure with zero claims = {0:.1%}".format(
+        df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / df["Exposure"].sum()
+    )
+)
 
 fig, (ax0, ax1, ax2) = plt.subplots(ncols=3, figsize=(16, 4))
 ax0.set_title("Number of claims")
@@ -100,20 +103,19 @@
 
 
 log_scale_transformer = make_pipeline(
-    FunctionTransformer(np.log, validate=False),
-    StandardScaler()
+    FunctionTransformer(np.log, validate=False), StandardScaler()
 )
 
 linear_model_preprocessor = ColumnTransformer(
     [
-        ("passthrough_numeric", "passthrough",
-            ["BonusMalus"]),
-        ("binned_numeric", KBinsDiscretizer(n_bins=10),
-            ["VehAge", "DrivAge"]),
-        ("log_scaled_numeric", log_scale_transformer,
-            ["Density"]),
-        ("onehot_categorical", OneHotEncoder(),
-            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
+        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
+        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
+        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
+        (
+            "onehot_categorical",
+            OneHotEncoder(),
+            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
+        ),
     ],
     remainder="drop",
 )
@@ -137,11 +139,12 @@
 
 df_train, df_test = train_test_split(df, test_size=0.33, random_state=0)
 
-dummy = Pipeline([
-    ("preprocessor", linear_model_preprocessor),
-    ("regressor", DummyRegressor(strategy='mean')),
-]).fit(df_train, df_train["Frequency"],
-       regressor__sample_weight=df_train["Exposure"])
+dummy = Pipeline(
+    [
+        ("preprocessor", linear_model_preprocessor),
+        ("regressor", DummyRegressor(strategy="mean")),
+    ]
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 
 ##############################################################################
@@ -157,26 +160,38 @@ def score_estimator(estimator, df_test):
     """Score an estimator on the test set."""
     y_pred = estimator.predict(df_test)
 
-    print("MSE: %.3f" %
-          mean_squared_error(df_test["Frequency"], y_pred,
-                             sample_weight=df_test["Exposure"]))
-    print("MAE: %.3f" %
-          mean_absolute_error(df_test["Frequency"], y_pred,
-                              sample_weight=df_test["Exposure"]))
+    print(
+        "MSE: %.3f"
+        % mean_squared_error(
+            df_test["Frequency"], y_pred, sample_weight=df_test["Exposure"]
+        )
+    )
+    print(
+        "MAE: %.3f"
+        % mean_absolute_error(
+            df_test["Frequency"], y_pred, sample_weight=df_test["Exposure"]
+        )
+    )
 
     # Ignore non-positive predictions, as they are invalid for
     # the Poisson deviance.
     mask = y_pred > 0
     if (~mask).any():
         n_masked, n_samples = (~mask).sum(), mask.shape[0]
-        print(f"WARNING: Estimator yields invalid, non-positive predictions "
-              f" for {n_masked} samples out of {n_samples}. These predictions "
-              f"are ignored when computing the Poisson deviance.")
+        print(
+            "WARNING: Estimator yields invalid, non-positive predictions "
+            f" for {n_masked} samples out of {n_samples}. These predictions "
+            "are ignored when computing the Poisson deviance."
+        )
 
-    print("mean Poisson deviance: %.3f" %
-          mean_poisson_deviance(df_test["Frequency"][mask],
-                                y_pred[mask],
-                                sample_weight=df_test["Exposure"][mask]))
+    print(
+        "mean Poisson deviance: %.3f"
+        % mean_poisson_deviance(
+            df_test["Frequency"][mask],
+            y_pred[mask],
+            sample_weight=df_test["Exposure"][mask],
+        )
+    )
 
 
 print("Constant mean frequency evaluation:")
@@ -194,11 +209,12 @@ def score_estimator(estimator, df_test):
 from sklearn.linear_model import Ridge
 
 
-ridge_glm = Pipeline([
-    ("preprocessor", linear_model_preprocessor),
-    ("regressor", Ridge(alpha=1e-6)),
-]).fit(df_train, df_train["Frequency"],
-       regressor__sample_weight=df_train["Exposure"])
+ridge_glm = Pipeline(
+    [
+        ("preprocessor", linear_model_preprocessor),
+        ("regressor", Ridge(alpha=1e-6)),
+    ]
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 # %%
 # The Poisson deviance cannot be computed on non-positive values predicted by
@@ -227,12 +243,15 @@ def score_estimator(estimator, df_test):
 
 n_samples = df_train.shape[0]
 
-poisson_glm = Pipeline([
-    ("preprocessor", linear_model_preprocessor),
-    ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300))
-])
-poisson_glm.fit(df_train, df_train["Frequency"],
-                regressor__sample_weight=df_train["Exposure"])
+poisson_glm = Pipeline(
+    [
+        ("preprocessor", linear_model_preprocessor),
+        ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)),
+    ]
+)
+poisson_glm.fit(
+    df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]
+)
 
 print("PoissonRegressor evaluation:")
 score_estimator(poisson_glm, df_test)
@@ -264,20 +283,27 @@ def score_estimator(estimator, df_test):
 
 tree_preprocessor = ColumnTransformer(
     [
-        ("categorical", OrdinalEncoder(),
-            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
-        ("numeric", "passthrough",
-            ["VehAge", "DrivAge", "BonusMalus", "Density"]),
+        (
+            "categorical",
+            OrdinalEncoder(),
+            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
+        ),
+        ("numeric", "passthrough", ["VehAge", "DrivAge", "BonusMalus", "Density"]),
     ],
     remainder="drop",
 )
-poisson_gbrt = Pipeline([
-    ("preprocessor", tree_preprocessor),
-    ("regressor", HistGradientBoostingRegressor(loss="poisson",
-                                                max_leaf_nodes=128)),
-])
-poisson_gbrt.fit(df_train, df_train["Frequency"],
-                 regressor__sample_weight=df_train["Exposure"])
+poisson_gbrt = Pipeline(
+    [
+        ("preprocessor", tree_preprocessor),
+        (
+            "regressor",
+            HistGradientBoostingRegressor(loss="poisson", max_leaf_nodes=128),
+        ),
+    ]
+)
+poisson_gbrt.fit(
+    df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]
+)
 
 print("Poisson Gradient Boosted Trees evaluation:")
 score_estimator(poisson_gbrt, df_test)
@@ -298,14 +324,11 @@ def score_estimator(estimator, df_test):
 fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True)
 fig.subplots_adjust(bottom=0.2)
 n_bins = 20
-for row_idx, label, df in zip(range(2),
-                              ["train", "test"],
-                              [df_train, df_test]):
-    df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins),
-                         ax=axes[row_idx, 0])
+for row_idx, label, df in zip(range(2), ["train", "test"], [df_train, df_test]):
+    df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), ax=axes[row_idx, 0])
 
     axes[row_idx, 0].set_title("Data")
-    axes[row_idx, 0].set_yscale('log')
+    axes[row_idx, 0].set_yscale("log")
     axes[row_idx, 0].set_xlabel("y (observed Frequency)")
     axes[row_idx, 0].set_ylim([1e1, 5e5])
     axes[row_idx, 0].set_ylabel(label + " samples")
@@ -313,12 +336,13 @@ def score_estimator(estimator, df_test):
     for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
         y_pred = model.predict(df)
 
-        pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins),
-                               ax=axes[row_idx, idx+1])
+        pd.Series(y_pred).hist(
+            bins=np.linspace(-1, 4, n_bins), ax=axes[row_idx, idx + 1]
+        )
         axes[row_idx, idx + 1].set(
             title=model[-1].__class__.__name__,
-            yscale='log',
-            xlabel="y_pred (predicted expected Frequency)"
+            yscale="log",
+            xlabel="y_pred (predicted expected Frequency)",
         )
 plt.tight_layout()
 
@@ -361,8 +385,7 @@ def score_estimator(estimator, df_test):
 from sklearn.utils import gen_even_slices
 
 
-def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
-                                  n_bins=100):
+def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100):
     """Compare predictions and observations for bins ordered by y_pred.
 
     We order the samples by ``y_pred`` and split it in bins.
@@ -389,19 +412,14 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
         average y_pred for each bin
     """
     idx_sort = np.argsort(y_pred)
-    bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins
+    bin_centers = np.arange(0, 1, 1 / n_bins) + 0.5 / n_bins
     y_pred_bin = np.zeros(n_bins)
     y_true_bin = np.zeros(n_bins)
 
     for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)):
         weights = sample_weight[idx_sort][sl]
-        y_pred_bin[n] = np.average(
-            y_pred[idx_sort][sl], weights=weights
-        )
-        y_true_bin[n] = np.average(
-            y_true[idx_sort][sl],
-            weights=weights
-        )
+        y_pred_bin[n] = np.average(y_pred[idx_sort][sl], weights=weights)
+        y_true_bin[n] = np.average(y_true[idx_sort][sl], weights=weights)
     return bin_centers, y_true_bin, y_pred_bin
 
 
@@ -409,27 +427,26 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
 fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
 plt.subplots_adjust(wspace=0.3)
 
-for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt,
-                                   dummy]):
+for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, dummy]):
     y_pred = model.predict(df_test)
     y_true = df_test["Frequency"].values
     exposure = df_test["Exposure"].values
     q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
-        y_true, y_pred, sample_weight=exposure, n_bins=10)
+        y_true, y_pred, sample_weight=exposure, n_bins=10
+    )
 
     # Name of the model after the estimator used in the last step of the
     # pipeline.
-    print(f"Predicted number of claims by {model[-1]}: "
-          f"{np.sum(y_pred * exposure):.1f}")
+    print(f"Predicted number of claims by {model[-1]}: {np.sum(y_pred * exposure):.1f}")
 
-    axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions")
-    axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations")
+    axi.plot(q, y_pred_seg, marker="x", linestyle="--", label="predictions")
+    axi.plot(q, y_true_seg, marker="o", linestyle="--", label="observations")
     axi.set_xlim(0, 1.0)
     axi.set_ylim(0, 0.5)
     axi.set(
         title=model[-1],
-        xlabel='Fraction of samples sorted by y_pred',
-        ylabel='Mean Frequency (y_pred)'
+        xlabel="Fraction of samples sorted by y_pred",
+        ylabel="Mean Frequency (y_pred)",
     )
     axi.legend()
 plt.tight_layout()
@@ -489,27 +506,27 @@ def lorenz_curve(y_true, y_pred, exposure):
 
 for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
     y_pred = model.predict(df_test)
-    cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred,
-                                            df_test["Exposure"])
+    cum_exposure, cum_claims = lorenz_curve(
+        df_test["Frequency"], y_pred, df_test["Exposure"]
+    )
     gini = 1 - 2 * auc(cum_exposure, cum_claims)
     label = "{} (Gini: {:.2f})".format(model[-1], gini)
     ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)
 
 # Oracle model: y_pred == y_test
-cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"],
-                                        df_test["Frequency"],
-                                        df_test["Exposure"])
+cum_exposure, cum_claims = lorenz_curve(
+    df_test["Frequency"], df_test["Frequency"], df_test["Exposure"]
+)
 gini = 1 - 2 * auc(cum_exposure, cum_claims)
 label = "Oracle (Gini: {:.2f})".format(gini)
 ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)
 
 # Random Baseline
-ax.plot([0, 1], [0, 1], linestyle="--", color="black",
-        label="Random baseline")
+ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
 ax.set(
     title="Lorenz curves by model",
-    xlabel='Cumulative proportion of exposure (from safest to riskiest)',
-    ylabel='Cumulative proportion of claims'
+    xlabel="Cumulative proportion of exposure (from safest to riskiest)",
+    ylabel="Cumulative proportion of claims",
 )
 ax.legend(loc="upper left")
 
diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py
index c6cd1f9d591bd..db1666168c88e 100644
--- a/examples/linear_model/plot_polynomial_interpolation.py
+++ b/examples/linear_model/plot_polynomial_interpolation.py
@@ -54,6 +54,7 @@
 # We start by defining a function that we intent to approximate and prepare
 # plotting it.
 
+
 def f(x):
     """Function to be approximated by polynomial interpolation."""
     return x * np.sin(x)
@@ -81,9 +82,9 @@ def f(x):
 # plot function
 lw = 2
 fig, ax = plt.subplots()
-ax.set_prop_cycle(color=[
-    "black", "teal", "yellowgreen", "gold", "darkorange", "tomato"
-])
+ax.set_prop_cycle(
+    color=["black", "teal", "yellowgreen", "gold", "darkorange", "tomato"]
+)
 ax.plot(x_plot, f(x_plot), linewidth=lw, label="ground truth")
 
 # plot training points
@@ -97,13 +98,12 @@ def f(x):
     ax.plot(x_plot, y_plot, label=f"degree {degree}")
 
 # B-spline with 4 + 3 - 1 = 6 basis functions
-model = make_pipeline(SplineTransformer(n_knots=4, degree=3),
-                      Ridge(alpha=1e-3))
+model = make_pipeline(SplineTransformer(n_knots=4, degree=3), Ridge(alpha=1e-3))
 model.fit(X_train, y_train)
 
 y_plot = model.predict(X_plot)
 ax.plot(x_plot, y_plot, label="B-spline")
-ax.legend(loc='lower center')
+ax.legend(loc="lower center")
 ax.set_ylim(-20, 10)
 plt.show()
 
@@ -133,7 +133,7 @@ def f(x):
 
 # plot knots of spline
 knots = splt.bsplines_[0].t
-axes[1].vlines(knots[3:-3], ymin=0, ymax=0.8, linestyles='dashed')
+axes[1].vlines(knots[3:-3], ymin=0, ymax=0.8, linestyles="dashed")
 plt.show()
 
 # %%
@@ -187,12 +187,15 @@ def g(x):
 ax.scatter(x_train, y_train, label="training points")
 
 for transformer, label in [
-  (SplineTransformer(degree=3, n_knots=10), "spline"),
-  (SplineTransformer(
-      degree=3,
-      knots=np.linspace(0, 2 * np.pi, 10)[:, None],
-      extrapolation="periodic"
-  ), "periodic spline")
+    (SplineTransformer(degree=3, n_knots=10), "spline"),
+    (
+        SplineTransformer(
+            degree=3,
+            knots=np.linspace(0, 2 * np.pi, 10)[:, None],
+            extrapolation="periodic",
+        ),
+        "periodic spline",
+    ),
 ]:
     model = make_pipeline(transformer, Ridge(alpha=1e-3))
     model.fit(X_train, y_train)
@@ -205,11 +208,9 @@ def g(x):
 # %% We again plot the underlying splines.
 fig, ax = plt.subplots()
 knots = np.linspace(0, 2 * np.pi, 4)
-splt = SplineTransformer(
-  knots=knots[:, None],
-  degree=3,
-  extrapolation="periodic"
-).fit(X_train)
+splt = SplineTransformer(knots=knots[:, None], degree=3, extrapolation="periodic").fit(
+    X_train
+)
 ax.plot(x_plot_ext, splt.transform(X_plot_ext))
 ax.legend(ax.lines, [f"spline {n}" for n in range(3)])
 plt.show()
diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py
index e1f1d484bf6b5..9fae4720499af 100644
--- a/examples/linear_model/plot_quantile_regression.py
+++ b/examples/linear_model/plot_quantile_regression.py
@@ -41,9 +41,7 @@
 #
 # - in the first case, a heteroscedastic Normal noise is added;
 # - in the second case, an asymmetric Pareto noise is added.
-y_normal = y_true_mean + rng.normal(
-    loc=0, scale=0.5 + 0.5 * x, size=x.shape[0]
-)
+y_normal = y_true_mean + rng.normal(loc=0, scale=0.5 + 0.5 * x, size=x.shape[0])
 a = 5
 y_pareto = y_true_mean + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1))
 
@@ -52,21 +50,15 @@
 # residuals `y - mean(y)`.
 import matplotlib.pyplot as plt
 
-_, axs = plt.subplots(
-    nrows=2, ncols=2, figsize=(15, 11), sharex="row", sharey="row"
-)
+_, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 11), sharex="row", sharey="row")
 
 axs[0, 0].plot(x, y_true_mean, label="True mean")
-axs[0, 0].scatter(
-    x, y_normal, color="black", alpha=0.5, label="Observations"
-)
+axs[0, 0].scatter(x, y_normal, color="black", alpha=0.5, label="Observations")
 axs[1, 0].hist(y_true_mean - y_normal, edgecolor="black")
 
 
 axs[0, 1].plot(x, y_true_mean, label="True mean")
-axs[0, 1].scatter(
-    x, y_pareto, color="black", alpha=0.5, label="Observations"
-)
+axs[0, 1].scatter(x, y_pareto, color="black", alpha=0.5, label="Observations")
 axs[1, 1].hist(y_true_mean - y_pareto, edgecolor="black")
 
 axs[0, 0].set_title("Dataset with heteroscedastic Normal distributed targets")
@@ -74,9 +66,7 @@
 axs[1, 0].set_title(
     "Residuals distribution for heteroscedastic Normal distributed targets"
 )
-axs[1, 1].set_title(
-    "Residuals distribution for asymmetric Pareto distributed target"
-)
+axs[1, 1].set_title("Residuals distribution for asymmetric Pareto distributed target")
 axs[0, 0].legend()
 axs[0, 1].legend()
 axs[0, 0].set_ylabel("y")
diff --git a/examples/linear_model/plot_ransac.py b/examples/linear_model/plot_ransac.py
index 0bafe4ee4a394..0c4070daf2fe9 100644
--- a/examples/linear_model/plot_ransac.py
+++ b/examples/linear_model/plot_ransac.py
@@ -17,9 +17,14 @@
 n_outliers = 50
 
 
-X, y, coef = datasets.make_regression(n_samples=n_samples, n_features=1,
-                                      n_informative=1, noise=10,
-                                      coef=True, random_state=0)
+X, y, coef = datasets.make_regression(
+    n_samples=n_samples,
+    n_features=1,
+    n_informative=1,
+    noise=10,
+    coef=True,
+    random_state=0,
+)
 
 # Add outlier data
 np.random.seed(0)
@@ -46,14 +51,21 @@
 print(coef, lr.coef_, ransac.estimator_.coef_)
 
 lw = 2
-plt.scatter(X[inlier_mask], y[inlier_mask], color='yellowgreen', marker='.',
-            label='Inliers')
-plt.scatter(X[outlier_mask], y[outlier_mask], color='gold', marker='.',
-            label='Outliers')
-plt.plot(line_X, line_y, color='navy', linewidth=lw, label='Linear regressor')
-plt.plot(line_X, line_y_ransac, color='cornflowerblue', linewidth=lw,
-         label='RANSAC regressor')
-plt.legend(loc='lower right')
+plt.scatter(
+    X[inlier_mask], y[inlier_mask], color="yellowgreen", marker=".", label="Inliers"
+)
+plt.scatter(
+    X[outlier_mask], y[outlier_mask], color="gold", marker=".", label="Outliers"
+)
+plt.plot(line_X, line_y, color="navy", linewidth=lw, label="Linear regressor")
+plt.plot(
+    line_X,
+    line_y_ransac,
+    color="cornflowerblue",
+    linewidth=lw,
+    label="RANSAC regressor",
+)
+plt.legend(loc="lower right")
 plt.xlabel("Input")
 plt.ylabel("Response")
 plt.show()
diff --git a/examples/linear_model/plot_ridge_coeffs.py b/examples/linear_model/plot_ridge_coeffs.py
index 3f3e574708d48..0dd395baf5fcd 100644
--- a/examples/linear_model/plot_ridge_coeffs.py
+++ b/examples/linear_model/plot_ridge_coeffs.py
@@ -50,8 +50,9 @@
 
 clf = Ridge()
 
-X, y, w = make_regression(n_samples=10, n_features=10, coef=True,
-                          random_state=1, bias=3.5)
+X, y, w = make_regression(
+    n_samples=10, n_features=10, coef=True, random_state=1, bias=3.5
+)
 
 coefs = []
 errors = []
@@ -71,19 +72,19 @@
 plt.subplot(121)
 ax = plt.gca()
 ax.plot(alphas, coefs)
-ax.set_xscale('log')
-plt.xlabel('alpha')
-plt.ylabel('weights')
-plt.title('Ridge coefficients as a function of the regularization')
-plt.axis('tight')
+ax.set_xscale("log")
+plt.xlabel("alpha")
+plt.ylabel("weights")
+plt.title("Ridge coefficients as a function of the regularization")
+plt.axis("tight")
 
 plt.subplot(122)
 ax = plt.gca()
 ax.plot(alphas, errors)
-ax.set_xscale('log')
-plt.xlabel('alpha')
-plt.ylabel('error')
-plt.title('Coefficient error as a function of the regularization')
-plt.axis('tight')
+ax.set_xscale("log")
+plt.xlabel("alpha")
+plt.ylabel("error")
+plt.title("Coefficient error as a function of the regularization")
+plt.axis("tight")
 
 plt.show()
diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py
index b16212cbd3718..14fac0b5bdc92 100644
--- a/examples/linear_model/plot_ridge_path.py
+++ b/examples/linear_model/plot_ridge_path.py
@@ -36,7 +36,7 @@
 from sklearn import linear_model
 
 # X is the 10x10 Hilbert matrix
-X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
+X = 1.0 / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
 y = np.ones(10)
 
 # #############################################################################
@@ -57,10 +57,10 @@
 ax = plt.gca()
 
 ax.plot(alphas, coefs)
-ax.set_xscale('log')
+ax.set_xscale("log")
 ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
-plt.xlabel('alpha')
-plt.ylabel('weights')
-plt.title('Ridge coefficients as a function of the regularization')
-plt.axis('tight')
+plt.xlabel("alpha")
+plt.ylabel("weights")
+plt.title("Ridge coefficients as a function of the regularization")
+plt.axis("tight")
 plt.show()
diff --git a/examples/linear_model/plot_robust_fit.py b/examples/linear_model/plot_robust_fit.py
index 88fc05a695839..c9fe49fc0d416 100644
--- a/examples/linear_model/plot_robust_fit.py
+++ b/examples/linear_model/plot_robust_fit.py
@@ -34,7 +34,11 @@
 import numpy as np
 
 from sklearn.linear_model import (
-    LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor)
+    LinearRegression,
+    TheilSenRegressor,
+    RANSACRegressor,
+    HuberRegressor,
+)
 from sklearn.metrics import mean_squared_error
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import make_pipeline
@@ -62,36 +66,50 @@
 X_errors_large = X.copy()
 X_errors_large[::3] = 10
 
-estimators = [('OLS', LinearRegression()),
-              ('Theil-Sen', TheilSenRegressor(random_state=42)),
-              ('RANSAC', RANSACRegressor(random_state=42)),
-              ('HuberRegressor', HuberRegressor())]
-colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold',
-          'RANSAC': 'lightgreen', 'HuberRegressor': 'black'}
-linestyle = {'OLS': '-', 'Theil-Sen': '-.', 'RANSAC': '--', 'HuberRegressor': '--'}
+estimators = [
+    ("OLS", LinearRegression()),
+    ("Theil-Sen", TheilSenRegressor(random_state=42)),
+    ("RANSAC", RANSACRegressor(random_state=42)),
+    ("HuberRegressor", HuberRegressor()),
+]
+colors = {
+    "OLS": "turquoise",
+    "Theil-Sen": "gold",
+    "RANSAC": "lightgreen",
+    "HuberRegressor": "black",
+}
+linestyle = {"OLS": "-", "Theil-Sen": "-.", "RANSAC": "--", "HuberRegressor": "--"}
 lw = 3
 
 x_plot = np.linspace(X.min(), X.max())
 for title, this_X, this_y in [
-        ('Modeling Errors Only', X, y),
-        ('Corrupt X, Small Deviants', X_errors, y),
-        ('Corrupt y, Small Deviants', X, y_errors),
-        ('Corrupt X, Large Deviants', X_errors_large, y),
-        ('Corrupt y, Large Deviants', X, y_errors_large)]:
+    ("Modeling Errors Only", X, y),
+    ("Corrupt X, Small Deviants", X_errors, y),
+    ("Corrupt y, Small Deviants", X, y_errors),
+    ("Corrupt X, Large Deviants", X_errors_large, y),
+    ("Corrupt y, Large Deviants", X, y_errors_large),
+]:
     plt.figure(figsize=(5, 4))
-    plt.plot(this_X[:, 0], this_y, 'b+')
+    plt.plot(this_X[:, 0], this_y, "b+")
 
     for name, estimator in estimators:
         model = make_pipeline(PolynomialFeatures(3), estimator)
         model.fit(this_X, this_y)
         mse = mean_squared_error(model.predict(X_test), y_test)
         y_plot = model.predict(x_plot[:, np.newaxis])
-        plt.plot(x_plot, y_plot, color=colors[name], linestyle=linestyle[name],
-                 linewidth=lw, label='%s: error = %.3f' % (name, mse))
-
-    legend_title = 'Error of Mean\nAbsolute Deviation\nto Non-corrupt Data'
-    legend = plt.legend(loc='upper right', frameon=False, title=legend_title,
-                        prop=dict(size='x-small'))
+        plt.plot(
+            x_plot,
+            y_plot,
+            color=colors[name],
+            linestyle=linestyle[name],
+            linewidth=lw,
+            label="%s: error = %.3f" % (name, mse),
+        )
+
+    legend_title = "Error of Mean\nAbsolute Deviation\nto Non-corrupt Data"
+    legend = plt.legend(
+        loc="upper right", frameon=False, title=legend_title, prop=dict(size="x-small")
+    )
     plt.xlim(-4, 10.2)
     plt.ylim(-2, 10.2)
     plt.title(title)
diff --git a/examples/linear_model/plot_sgd_comparison.py b/examples/linear_model/plot_sgd_comparison.py
index 346627c933d85..c75775cbe84f0 100644
--- a/examples/linear_model/plot_sgd_comparison.py
+++ b/examples/linear_model/plot_sgd_comparison.py
@@ -27,14 +27,18 @@
     ("SGD", SGDClassifier(max_iter=100)),
     ("ASGD", SGDClassifier(average=True)),
     ("Perceptron", Perceptron()),
-    ("Passive-Aggressive I", PassiveAggressiveClassifier(loss='hinge',
-                                                         C=1.0, tol=1e-4)),
-    ("Passive-Aggressive II", PassiveAggressiveClassifier(loss='squared_hinge',
-                                                          C=1.0, tol=1e-4)),
-    ("SAG", LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0]))
+    (
+        "Passive-Aggressive I",
+        PassiveAggressiveClassifier(loss="hinge", C=1.0, tol=1e-4),
+    ),
+    (
+        "Passive-Aggressive II",
+        PassiveAggressiveClassifier(loss="squared_hinge", C=1.0, tol=1e-4),
+    ),
+    ("SAG", LogisticRegression(solver="sag", tol=1e-1, C=1.0e4 / X.shape[0])),
 ]
 
-xx = 1. - np.array(heldout)
+xx = 1.0 - np.array(heldout)
 
 for name, clf in classifiers:
     print("training %s" % name)
@@ -43,8 +47,9 @@
     for i in heldout:
         yy_ = []
         for r in range(rounds):
-            X_train, X_test, y_train, y_test = \
-                train_test_split(X, y, test_size=i, random_state=rng)
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=i, random_state=rng
+            )
             clf.fit(X_train, y_train)
             y_pred = clf.predict(X_test)
             yy_.append(1 - np.mean(y_pred == y_test))
diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py
index bd7461dfb5f92..baa63f770bb6c 100644
--- a/examples/linear_model/plot_sgd_early_stopping.py
+++ b/examples/linear_model/plot_sgd_early_stopping.py
@@ -55,10 +55,10 @@
 print(__doc__)
 
 
-def load_mnist(n_samples=None, class_0='0', class_1='8'):
+def load_mnist(n_samples=None, class_0="0", class_1="8"):
     """Load MNIST, select two classes, shuffle and return only n_samples."""
     # Load data from http://openml.org/d/554
-    mnist = fetch_openml('mnist_784', version=1)
+    mnist = fetch_openml("mnist_784", version=1)
 
     # take only two classes for binary classification
     mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)
@@ -88,55 +88,58 @@ def fit_and_score(estimator, max_iter, X_train, X_test, y_train, y_test):
 
 # Define the estimators to compare
 estimator_dict = {
-    'No stopping criterion':
-    linear_model.SGDClassifier(n_iter_no_change=3),
-    'Training loss':
-    linear_model.SGDClassifier(early_stopping=False, n_iter_no_change=3,
-                               tol=0.1),
-    'Validation score':
-    linear_model.SGDClassifier(early_stopping=True, n_iter_no_change=3,
-                               tol=0.0001, validation_fraction=0.2)
+    "No stopping criterion": linear_model.SGDClassifier(n_iter_no_change=3),
+    "Training loss": linear_model.SGDClassifier(
+        early_stopping=False, n_iter_no_change=3, tol=0.1
+    ),
+    "Validation score": linear_model.SGDClassifier(
+        early_stopping=True, n_iter_no_change=3, tol=0.0001, validation_fraction=0.2
+    ),
 }
 
 # Load the dataset
 X, y = load_mnist(n_samples=10000)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
-                                                    random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
 
 results = []
 for estimator_name, estimator in estimator_dict.items():
-    print(estimator_name + ': ', end='')
+    print(estimator_name + ": ", end="")
     for max_iter in range(1, 50):
-        print('.', end='')
+        print(".", end="")
         sys.stdout.flush()
 
         fit_time, n_iter, train_score, test_score = fit_and_score(
-            estimator, max_iter, X_train, X_test, y_train, y_test)
+            estimator, max_iter, X_train, X_test, y_train, y_test
+        )
 
-        results.append((estimator_name, max_iter, fit_time, n_iter,
-                        train_score, test_score))
-    print('')
+        results.append(
+            (estimator_name, max_iter, fit_time, n_iter, train_score, test_score)
+        )
+    print("")
 
 # Transform the results in a pandas dataframe for easy plotting
 columns = [
-    'Stopping criterion', 'max_iter', 'Fit time (sec)', 'n_iter_',
-    'Train score', 'Test score'
+    "Stopping criterion",
+    "max_iter",
+    "Fit time (sec)",
+    "n_iter_",
+    "Train score",
+    "Test score",
 ]
 results_df = pd.DataFrame(results, columns=columns)
 
 # Define what to plot (x_axis, y_axis)
-lines = 'Stopping criterion'
+lines = "Stopping criterion"
 plot_list = [
-    ('max_iter', 'Train score'),
-    ('max_iter', 'Test score'),
-    ('max_iter', 'n_iter_'),
-    ('max_iter', 'Fit time (sec)'),
+    ("max_iter", "Train score"),
+    ("max_iter", "Test score"),
+    ("max_iter", "n_iter_"),
+    ("max_iter", "Fit time (sec)"),
 ]
 
 nrows = 2
-ncols = int(np.ceil(len(plot_list) / 2.))
-fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6 * ncols,
-                                                            4 * nrows))
+ncols = int(np.ceil(len(plot_list) / 2.0))
+fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6 * ncols, 4 * nrows))
 axes[0, 0].get_shared_y_axes().join(axes[0, 0], axes[0, 1])
 
 for ax, (x_axis, y_axis) in zip(axes.ravel(), plot_list):
diff --git a/examples/linear_model/plot_sgd_iris.py b/examples/linear_model/plot_sgd_iris.py
index 0dddf7475728d..367ec6a1f69af 100644
--- a/examples/linear_model/plot_sgd_iris.py
+++ b/examples/linear_model/plot_sgd_iris.py
@@ -36,15 +36,14 @@
 std = X.std(axis=0)
 X = (X - mean) / std
 
-h = .02  # step size in the mesh
+h = 0.02  # step size in the mesh
 
 clf = SGDClassifier(alpha=0.001, max_iter=100).fit(X, y)
 
 # create a mesh to plot in
 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                     np.arange(y_min, y_max, h))
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
 # Plot the decision boundary. For that, we will assign a color to each
 # point in the mesh [x_min, x_max]x[y_min, y_max].
@@ -52,15 +51,22 @@
 # Put the result into a color plot
 Z = Z.reshape(xx.shape)
 cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
-plt.axis('tight')
+plt.axis("tight")
 
 # Plot also the training points
 for i, color in zip(clf.classes_, colors):
     idx = np.where(y == i)
-    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
-                cmap=plt.cm.Paired, edgecolor='black', s=20)
+    plt.scatter(
+        X[idx, 0],
+        X[idx, 1],
+        c=color,
+        label=iris.target_names[i],
+        cmap=plt.cm.Paired,
+        edgecolor="black",
+        s=20,
+    )
 plt.title("Decision surface of multi-class SGD")
-plt.axis('tight')
+plt.axis("tight")
 
 # Plot the three one-against-all classifiers
 xmin, xmax = plt.xlim()
@@ -73,8 +79,7 @@ def plot_hyperplane(c, color):
     def line(x0):
         return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
 
-    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
-             ls="--", color=color)
+    plt.plot([xmin, xmax], [line(xmin), line(xmax)], ls="--", color=color)
 
 
 for i, color in zip(clf.classes_, colors):
diff --git a/examples/linear_model/plot_sgd_loss_functions.py b/examples/linear_model/plot_sgd_loss_functions.py
index 4a7ad9ce9f0be..0cc4378ac9286 100644
--- a/examples/linear_model/plot_sgd_loss_functions.py
+++ b/examples/linear_model/plot_sgd_loss_functions.py
@@ -16,25 +16,32 @@ def modified_huber_loss(y_true, y_pred):
     z = y_pred * y_true
     loss = -4 * z
     loss[z >= -1] = (1 - z[z >= -1]) ** 2
-    loss[z >= 1.] = 0
+    loss[z >= 1.0] = 0
     return loss
 
 
 xmin, xmax = -4, 4
 xx = np.linspace(xmin, xmax, 100)
 lw = 2
-plt.plot([xmin, 0, 0, xmax], [1, 1, 0, 0], color='gold', lw=lw,
-         label="Zero-one loss")
-plt.plot(xx, np.where(xx < 1, 1 - xx, 0), color='teal', lw=lw,
-         label="Hinge loss")
-plt.plot(xx, -np.minimum(xx, 0), color='yellowgreen', lw=lw,
-         label="Perceptron loss")
-plt.plot(xx, np.log2(1 + np.exp(-xx)), color='cornflowerblue', lw=lw,
-         label="Log loss")
-plt.plot(xx, np.where(xx < 1, 1 - xx, 0) ** 2, color='orange', lw=lw,
-         label="Squared hinge loss")
-plt.plot(xx, modified_huber_loss(xx, 1), color='darkorchid', lw=lw,
-         linestyle='--', label="Modified Huber loss")
+plt.plot([xmin, 0, 0, xmax], [1, 1, 0, 0], color="gold", lw=lw, label="Zero-one loss")
+plt.plot(xx, np.where(xx < 1, 1 - xx, 0), color="teal", lw=lw, label="Hinge loss")
+plt.plot(xx, -np.minimum(xx, 0), color="yellowgreen", lw=lw, label="Perceptron loss")
+plt.plot(xx, np.log2(1 + np.exp(-xx)), color="cornflowerblue", lw=lw, label="Log loss")
+plt.plot(
+    xx,
+    np.where(xx < 1, 1 - xx, 0) ** 2,
+    color="orange",
+    lw=lw,
+    label="Squared hinge loss",
+)
+plt.plot(
+    xx,
+    modified_huber_loss(xx, 1),
+    color="darkorchid",
+    lw=lw,
+    linestyle="--",
+    label="Modified Huber loss",
+)
 plt.ylim((0, 8))
 plt.legend(loc="upper right")
 plt.xlabel(r"Decision function $f(x)$")
diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py
index 0307fb0e8ed94..795be3a15c4dc 100644
--- a/examples/linear_model/plot_sgd_penalties.py
+++ b/examples/linear_model/plot_sgd_penalties.py
@@ -30,22 +30,26 @@
 plt.figure(figsize=(10, 10), dpi=100)
 ax = plt.gca()
 
-elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1],
-                                  colors=elastic_net_color)
+elastic_net_contour = plt.contour(
+    xx, yy, elastic_net, levels=[1], colors=elastic_net_color
+)
 l2_contour = plt.contour(xx, yy, l2, levels=[1], colors=l2_color)
 l1_contour = plt.contour(xx, yy, l1, levels=[1], colors=l1_color)
 ax.set_aspect("equal")
-ax.spines['left'].set_position('center')
-ax.spines['right'].set_color('none')
-ax.spines['bottom'].set_position('center')
-ax.spines['top'].set_color('none')
-
-plt.clabel(elastic_net_contour, inline=1, fontsize=18,
-           fmt={1.0: 'elastic-net'}, manual=[(-1, -1)])
-plt.clabel(l2_contour, inline=1, fontsize=18,
-           fmt={1.0: 'L2'}, manual=[(-1, -1)])
-plt.clabel(l1_contour, inline=1, fontsize=18,
-           fmt={1.0: 'L1'}, manual=[(-1, -1)])
+ax.spines["left"].set_position("center")
+ax.spines["right"].set_color("none")
+ax.spines["bottom"].set_position("center")
+ax.spines["top"].set_color("none")
+
+plt.clabel(
+    elastic_net_contour,
+    inline=1,
+    fontsize=18,
+    fmt={1.0: "elastic-net"},
+    manual=[(-1, -1)],
+)
+plt.clabel(l2_contour, inline=1, fontsize=18, fmt={1.0: "L2"}, manual=[(-1, -1)])
+plt.clabel(l1_contour, inline=1, fontsize=18, fmt={1.0: "L1"}, manual=[(-1, -1)])
 
 plt.tight_layout()
 plt.show()
diff --git a/examples/linear_model/plot_sgd_separating_hyperplane.py b/examples/linear_model/plot_sgd_separating_hyperplane.py
index e7263e4ecd347..b485c32fbd4d5 100644
--- a/examples/linear_model/plot_sgd_separating_hyperplane.py
+++ b/examples/linear_model/plot_sgd_separating_hyperplane.py
@@ -34,11 +34,10 @@
     p = clf.decision_function([[x1, x2]])
     Z[i, j] = p[0]
 levels = [-1.0, 0.0, 1.0]
-linestyles = ['dashed', 'solid', 'dashed']
-colors = 'k'
+linestyles = ["dashed", "solid", "dashed"]
+colors = "k"
 plt.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
-plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired,
-            edgecolor='black', s=20)
+plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolor="black", s=20)
 
-plt.axis('tight')
+plt.axis("tight")
 plt.show()
diff --git a/examples/linear_model/plot_sgd_weighted_samples.py b/examples/linear_model/plot_sgd_weighted_samples.py
index 3617d81b0a063..64156fe7096c1 100644
--- a/examples/linear_model/plot_sgd_weighted_samples.py
+++ b/examples/linear_model/plot_sgd_weighted_samples.py
@@ -23,25 +23,35 @@
 # plot the weighted data points
 xx, yy = np.meshgrid(np.linspace(-4, 5, 500), np.linspace(-4, 5, 500))
 plt.figure()
-plt.scatter(X[:, 0], X[:, 1], c=y, s=sample_weight, alpha=0.9,
-            cmap=plt.cm.bone, edgecolor='black')
+plt.scatter(
+    X[:, 0],
+    X[:, 1],
+    c=y,
+    s=sample_weight,
+    alpha=0.9,
+    cmap=plt.cm.bone,
+    edgecolor="black",
+)
 
 # fit the unweighted model
 clf = linear_model.SGDClassifier(alpha=0.01, max_iter=100)
 clf.fit(X, y)
 Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
 Z = Z.reshape(xx.shape)
-no_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=['solid'])
+no_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=["solid"])
 
 # fit the weighted model
 clf = linear_model.SGDClassifier(alpha=0.01, max_iter=100)
 clf.fit(X, y, sample_weight=sample_weight)
 Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
 Z = Z.reshape(xx.shape)
-samples_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=['dashed'])
+samples_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=["dashed"])
 
-plt.legend([no_weights.collections[0], samples_weights.collections[0]],
-           ["no weights", "with weights"], loc="lower left")
+plt.legend(
+    [no_weights.collections[0], samples_weights.collections[0]],
+    ["no weights", "with weights"],
+    loc="lower left",
+)
 
 plt.xticks(())
 plt.yticks(())
diff --git a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py
index e70694cdb1c1b..2252ad1fc98e9 100644
--- a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py
+++ b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py
@@ -27,10 +27,9 @@
 from sklearn.kernel_approximation import Nystroem
 from sklearn.pipeline import make_pipeline
 
-font = {'weight': 'normal',
-        'size': 15}
+font = {"weight": "normal", "size": 15}
 
-matplotlib.rc('font', **font)
+matplotlib.rc("font", **font)
 
 random_state = 42
 rng = np.random.RandomState(random_state)
@@ -48,10 +47,10 @@
 
 # OCSVM hyperparameters
 nu = 0.05
-gamma = 2.
+gamma = 2.0
 
 # Fit the One-Class SVM
-clf = OneClassSVM(gamma=gamma, kernel='rbf', nu=nu)
+clf = OneClassSVM(gamma=gamma, kernel="rbf", nu=nu)
 clf.fit(X_train)
 y_pred_train = clf.predict(X_train)
 y_pred_test = clf.predict(X_test)
@@ -66,8 +65,9 @@
 
 # Fit the One-Class SVM using a kernel approximation and SGD
 transform = Nystroem(gamma=gamma, random_state=random_state)
-clf_sgd = SGDOneClassSVM(nu=nu, shuffle=True, fit_intercept=True,
-                         random_state=random_state, tol=1e-4)
+clf_sgd = SGDOneClassSVM(
+    nu=nu, shuffle=True, fit_intercept=True, random_state=random_state, tol=1e-4
+)
 pipe_sgd = make_pipeline(transform, clf_sgd)
 pipe_sgd.fit(X_train)
 y_pred_train_sgd = pipe_sgd.predict(X_train)
@@ -82,54 +82,73 @@
 
 # plot the level sets of the decision function
 plt.figure(figsize=(9, 6))
-plt.title('One Class SVM')
+plt.title("One Class SVM")
 plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
-a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
-plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
+a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred")
+plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred")
 
 s = 20
-b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
-b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
-                 edgecolors='k')
-c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
-                edgecolors='k')
-plt.axis('tight')
+b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k")
+b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k")
+c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k")
+plt.axis("tight")
 plt.xlim((-4.5, 4.5))
 plt.ylim((-4.5, 4.5))
-plt.legend([a.collections[0], b1, b2, c],
-           ["learned frontier", "training observations",
-            "new regular observations", "new abnormal observations"],
-           loc="upper left")
+plt.legend(
+    [a.collections[0], b1, b2, c],
+    [
+        "learned frontier",
+        "training observations",
+        "new regular observations",
+        "new abnormal observations",
+    ],
+    loc="upper left",
+)
 plt.xlabel(
-    "error train: %d/%d; errors novel regular: %d/%d; "
-    "errors novel abnormal: %d/%d"
-    % (n_error_train, X_train.shape[0], n_error_test, X_test.shape[0],
-       n_error_outliers, X_outliers.shape[0]))
+    "error train: %d/%d; errors novel regular: %d/%d; errors novel abnormal: %d/%d"
+    % (
+        n_error_train,
+        X_train.shape[0],
+        n_error_test,
+        X_test.shape[0],
+        n_error_outliers,
+        X_outliers.shape[0],
+    )
+)
 plt.show()
 
 plt.figure(figsize=(9, 6))
-plt.title('Online One-Class SVM')
-plt.contourf(xx, yy, Z_sgd, levels=np.linspace(Z_sgd.min(), 0, 7),
-             cmap=plt.cm.PuBu)
-a = plt.contour(xx, yy, Z_sgd, levels=[0], linewidths=2, colors='darkred')
-plt.contourf(xx, yy, Z_sgd, levels=[0, Z_sgd.max()], colors='palevioletred')
+plt.title("Online One-Class SVM")
+plt.contourf(xx, yy, Z_sgd, levels=np.linspace(Z_sgd.min(), 0, 7), cmap=plt.cm.PuBu)
+a = plt.contour(xx, yy, Z_sgd, levels=[0], linewidths=2, colors="darkred")
+plt.contourf(xx, yy, Z_sgd, levels=[0, Z_sgd.max()], colors="palevioletred")
 
 s = 20
-b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
-b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
-                 edgecolors='k')
-c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
-                edgecolors='k')
-plt.axis('tight')
+b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k")
+b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k")
+c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k")
+plt.axis("tight")
 plt.xlim((-4.5, 4.5))
 plt.ylim((-4.5, 4.5))
-plt.legend([a.collections[0], b1, b2, c],
-           ["learned frontier", "training observations",
-            "new regular observations", "new abnormal observations"],
-           loc="upper left")
+plt.legend(
+    [a.collections[0], b1, b2, c],
+    [
+        "learned frontier",
+        "training observations",
+        "new regular observations",
+        "new abnormal observations",
+    ],
+    loc="upper left",
+)
 plt.xlabel(
-    "error train: %d/%d; errors novel regular: %d/%d; "
-    "errors novel abnormal: %d/%d"
-    % (n_error_train_sgd, X_train.shape[0], n_error_test_sgd, X_test.shape[0],
-       n_error_outliers_sgd, X_outliers.shape[0]))
+    "error train: %d/%d; errors novel regular: %d/%d; errors novel abnormal: %d/%d"
+    % (
+        n_error_train_sgd,
+        X_train.shape[0],
+        n_error_test_sgd,
+        X_test.shape[0],
+        n_error_outliers_sgd,
+        X_outliers.shape[0],
+    )
+)
 plt.show()
diff --git a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py
index bab97be5acd8b..71de01bbf34a1 100644
--- a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py
+++ b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py
@@ -32,32 +32,34 @@
 print(__doc__)
 # Author: Arthur Mensch
 
-warnings.filterwarnings("ignore", category=ConvergenceWarning,
-                        module="sklearn")
+warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
 t0 = timeit.default_timer()
 
 # We use SAGA solver
-solver = 'saga'
+solver = "saga"
 
 # Turn down for faster run time
 n_samples = 10000
 
-X, y = fetch_20newsgroups_vectorized(subset='all', return_X_y=True)
+X, y = fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
 X = X[:n_samples]
 y = y[:n_samples]
 
-X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                    random_state=42,
-                                                    stratify=y,
-                                                    test_size=0.1)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, random_state=42, stratify=y, test_size=0.1
+)
 train_samples, n_features = X_train.shape
 n_classes = np.unique(y).shape[0]
 
-print('Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i'
-      % (train_samples, n_features, n_classes))
+print(
+    "Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i"
+    % (train_samples, n_features, n_classes)
+)
 
-models = {'ovr': {'name': 'One versus Rest', 'iters': [1, 2, 4]},
-          'multinomial': {'name': 'Multinomial', 'iters': [1, 3, 7]}}
+models = {
+    "ovr": {"name": "One versus Rest", "iters": [1, 2, 4]},
+    "multinomial": {"name": "Multinomial", "iters": [1, 3, 7]},
+}
 
 for model in models:
     # Add initial chance-level values for plotting purpose
@@ -68,15 +70,18 @@
     model_params = models[model]
 
     # Small number of epochs for fast runtime
-    for this_max_iter in model_params['iters']:
-        print('[model=%s, solver=%s] Number of epochs: %s' %
-              (model_params['name'], solver, this_max_iter))
-        lr = LogisticRegression(solver=solver,
-                                multi_class=model,
-                                penalty='l1',
-                                max_iter=this_max_iter,
-                                random_state=42,
-                                )
+    for this_max_iter in model_params["iters"]:
+        print(
+            "[model=%s, solver=%s] Number of epochs: %s"
+            % (model_params["name"], solver, this_max_iter)
+        )
+        lr = LogisticRegression(
+            solver=solver,
+            multi_class=model,
+            penalty="l1",
+            max_iter=this_max_iter,
+            random_state=42,
+        )
         t1 = timeit.default_timer()
         lr.fit(X_train, y_train)
         train_time = timeit.default_timer() - t1
@@ -87,31 +92,33 @@
         accuracies.append(accuracy)
         densities.append(density)
         times.append(train_time)
-    models[model]['times'] = times
-    models[model]['densities'] = densities
-    models[model]['accuracies'] = accuracies
-    print('Test accuracy for model %s: %.4f' % (model, accuracies[-1]))
-    print('%% non-zero coefficients for model %s, '
-          'per class:\n %s' % (model, densities[-1]))
-    print('Run time (%i epochs) for model %s:'
-          '%.2f' % (model_params['iters'][-1], model, times[-1]))
+    models[model]["times"] = times
+    models[model]["densities"] = densities
+    models[model]["accuracies"] = accuracies
+    print("Test accuracy for model %s: %.4f" % (model, accuracies[-1]))
+    print(
+        "%% non-zero coefficients for model %s, per class:\n %s"
+        % (model, densities[-1])
+    )
+    print(
+        "Run time (%i epochs) for model %s:%.2f"
+        % (model_params["iters"][-1], model, times[-1])
+    )
 
 fig = plt.figure()
 ax = fig.add_subplot(111)
 
 for model in models:
-    name = models[model]['name']
-    times = models[model]['times']
-    accuracies = models[model]['accuracies']
-    ax.plot(times, accuracies, marker='o',
-            label='Model: %s' % name)
-    ax.set_xlabel('Train time (s)')
-    ax.set_ylabel('Test accuracy')
+    name = models[model]["name"]
+    times = models[model]["times"]
+    accuracies = models[model]["accuracies"]
+    ax.plot(times, accuracies, marker="o", label="Model: %s" % name)
+    ax.set_xlabel("Train time (s)")
+    ax.set_ylabel("Test accuracy")
 ax.legend()
-fig.suptitle('Multinomial vs One-vs-Rest Logistic L1\n'
-             'Dataset %s' % '20newsgroups')
+fig.suptitle("Multinomial vs One-vs-Rest Logistic L1\nDataset %s" % "20newsgroups")
 fig.tight_layout()
 fig.subplots_adjust(top=0.85)
 run_time = timeit.default_timer() - t0
-print('Example run in %.3f s' % run_time)
+print("Example run in %.3f s" % run_time)
 plt.show()
diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py
index 968d597a5cac7..27d428881216f 100644
--- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py
+++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py
@@ -36,7 +36,7 @@
 train_samples = 5000
 
 # Load data from https://www.openml.org/d/554
-X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
+X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
 
 random_state = check_random_state(0)
 permutation = random_state.permutation(X.shape[0])
@@ -45,16 +45,15 @@
 X = X.reshape((X.shape[0], -1))
 
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, train_size=train_samples, test_size=10000)
+    X, y, train_size=train_samples, test_size=10000
+)
 
 scaler = StandardScaler()
 X_train = scaler.fit_transform(X_train)
 X_test = scaler.transform(X_test)
 
 # Turn up tolerance for faster convergence
-clf = LogisticRegression(
-    C=50. / train_samples, penalty='l1', solver='saga', tol=0.1
-)
+clf = LogisticRegression(C=50.0 / train_samples, penalty="l1", solver="saga", tol=0.1)
 clf.fit(X_train, y_train)
 sparsity = np.mean(clf.coef_ == 0) * 100
 score = clf.score(X_test, y_test)
@@ -67,13 +66,18 @@
 scale = np.abs(coef).max()
 for i in range(10):
     l1_plot = plt.subplot(2, 5, i + 1)
-    l1_plot.imshow(coef[i].reshape(28, 28), interpolation='nearest',
-                   cmap=plt.cm.RdBu, vmin=-scale, vmax=scale)
+    l1_plot.imshow(
+        coef[i].reshape(28, 28),
+        interpolation="nearest",
+        cmap=plt.cm.RdBu,
+        vmin=-scale,
+        vmax=scale,
+    )
     l1_plot.set_xticks(())
     l1_plot.set_yticks(())
-    l1_plot.set_xlabel('Class %i' % i)
-plt.suptitle('Classification vector for...')
+    l1_plot.set_xlabel("Class %i" % i)
+plt.suptitle("Classification vector for...")
 
 run_time = time.time() - t0
-print('Example run in %.3f s' % run_time)
+print("Example run in %.3f s" % run_time)
 plt.show()
diff --git a/examples/linear_model/plot_theilsen.py b/examples/linear_model/plot_theilsen.py
index c80b4a409937b..54196188d864a 100644
--- a/examples/linear_model/plot_theilsen.py
+++ b/examples/linear_model/plot_theilsen.py
@@ -45,10 +45,12 @@
 
 print(__doc__)
 
-estimators = [('OLS', LinearRegression()),
-              ('Theil-Sen', TheilSenRegressor(random_state=42)),
-              ('RANSAC', RANSACRegressor(random_state=42)), ]
-colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen'}
+estimators = [
+    ("OLS", LinearRegression()),
+    ("Theil-Sen", TheilSenRegressor(random_state=42)),
+    ("RANSAC", RANSACRegressor(random_state=42)),
+]
+colors = {"OLS": "turquoise", "Theil-Sen": "gold", "RANSAC": "lightgreen"}
 lw = 2
 
 # #############################################################################
@@ -58,26 +60,31 @@
 n_samples = 200
 # Linear model y = 3*x + N(2, 0.1**2)
 x = np.random.randn(n_samples)
-w = 3.
-c = 2.
+w = 3.0
+c = 2.0
 noise = 0.1 * np.random.randn(n_samples)
 y = w * x + c + noise
 # 10% outliers
 y[-20:] += -20 * x[-20:]
 X = x[:, np.newaxis]
 
-plt.scatter(x, y, color='indigo', marker='x', s=40)
+plt.scatter(x, y, color="indigo", marker="x", s=40)
 line_x = np.array([-3, 3])
 for name, estimator in estimators:
     t0 = time.time()
     estimator.fit(X, y)
     elapsed_time = time.time() - t0
     y_pred = estimator.predict(line_x.reshape(2, 1))
-    plt.plot(line_x, y_pred, color=colors[name], linewidth=lw,
-             label='%s (fit time: %.2fs)' % (name, elapsed_time))
-
-plt.axis('tight')
-plt.legend(loc='upper left')
+    plt.plot(
+        line_x,
+        y_pred,
+        color=colors[name],
+        linewidth=lw,
+        label="%s (fit time: %.2fs)" % (name, elapsed_time),
+    )
+
+plt.axis("tight")
+plt.legend(loc="upper left")
 plt.title("Corrupt y")
 
 # #############################################################################
@@ -94,7 +101,7 @@
 X = x[:, np.newaxis]
 
 plt.figure()
-plt.scatter(x, y, color='indigo', marker='x', s=40)
+plt.scatter(x, y, color="indigo", marker="x", s=40)
 
 line_x = np.array([-3, 10])
 for name, estimator in estimators:
@@ -102,10 +109,15 @@
     estimator.fit(X, y)
     elapsed_time = time.time() - t0
     y_pred = estimator.predict(line_x.reshape(2, 1))
-    plt.plot(line_x, y_pred, color=colors[name], linewidth=lw,
-             label='%s (fit time: %.2fs)' % (name, elapsed_time))
-
-plt.axis('tight')
-plt.legend(loc='upper left')
+    plt.plot(
+        line_x,
+        y_pred,
+        color=colors[name],
+        linewidth=lw,
+        label="%s (fit time: %.2fs)" % (name, elapsed_time),
+    )
+
+plt.axis("tight")
+plt.legend(loc="upper left")
 plt.title("Corrupt x")
 plt.show()
diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
index 8edf97d0738a9..1bcf26742d2f1 100644
--- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py
+++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -72,15 +72,15 @@ def load_mtpl2(n_samples=100000):
       678013 samples.
     """
     # freMTPL2freq dataset from https://www.openml.org/d/41214
-    df_freq = fetch_openml(data_id=41214, as_frame=True)['data']
-    df_freq['IDpol'] = df_freq['IDpol'].astype(int)
-    df_freq.set_index('IDpol', inplace=True)
+    df_freq = fetch_openml(data_id=41214, as_frame=True)["data"]
+    df_freq["IDpol"] = df_freq["IDpol"].astype(int)
+    df_freq.set_index("IDpol", inplace=True)
 
     # freMTPL2sev dataset from https://www.openml.org/d/41215
-    df_sev = fetch_openml(data_id=41215, as_frame=True)['data']
+    df_sev = fetch_openml(data_id=41215, as_frame=True)["data"]
 
     # sum ClaimAmount over identical IDs
-    df_sev = df_sev.groupby('IDpol').sum()
+    df_sev = df_sev.groupby("IDpol").sum()
 
     df = df_freq.join(df_sev, how="left")
     df["ClaimAmount"].fillna(0, inplace=True)
@@ -91,8 +91,17 @@ def load_mtpl2(n_samples=100000):
     return df.iloc[:n_samples]
 
 
-def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None,
-                  title=None, ax=None, fill_legend=False):
+def plot_obs_pred(
+    df,
+    feature,
+    weight,
+    observed,
+    predicted,
+    y_label=None,
+    title=None,
+    ax=None,
+    fill_legend=False,
+):
     """Plot observed and predicted - aggregated per feature level.
 
     Parameters
@@ -139,21 +148,30 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None,
 
 
 def score_estimator(
-    estimator, X_train, X_test, df_train, df_test, target, weights,
+    estimator,
+    X_train,
+    X_test,
+    df_train,
+    df_test,
+    target,
+    weights,
     tweedie_powers=None,
 ):
     """Evaluate an estimator on train and test sets with different metrics"""
 
     metrics = [
-        ("D² explained", None),   # Use default scorer if it exists
+        ("D² explained", None),  # Use default scorer if it exists
         ("mean abs. error", mean_absolute_error),
         ("mean squared error", mean_squared_error),
     ]
     if tweedie_powers:
-        metrics += [(
-            "mean Tweedie dev p={:.4f}".format(power),
-            partial(mean_tweedie_deviance, power=power)
-        ) for power in tweedie_powers]
+        metrics += [
+            (
+                "mean Tweedie dev p={:.4f}".format(power),
+                partial(mean_tweedie_deviance, power=power),
+            )
+            for power in tweedie_powers
+        ]
 
     res = []
     for subset_label, X, df in [
@@ -177,16 +195,14 @@ def score_estimator(
             else:
                 score = metric(y, y_pred, sample_weight=_weights)
 
-            res.append(
-                {"subset": subset_label, "metric": score_label, "score": score}
-            )
+            res.append({"subset": subset_label, "metric": score_label, "score": score})
 
     res = (
         pd.DataFrame(res)
         .set_index(["metric", "subset"])
         .score.unstack(-1)
         .round(4)
-        .loc[:, ['train', 'test']]
+        .loc[:, ["train", "test"]]
     )
     return res
 
@@ -213,20 +229,19 @@ def score_estimator(
 df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)
 
 log_scale_transformer = make_pipeline(
-    FunctionTransformer(func=np.log),
-    StandardScaler()
+    FunctionTransformer(func=np.log), StandardScaler()
 )
 
 column_trans = ColumnTransformer(
     [
-        ("binned_numeric", KBinsDiscretizer(n_bins=10),
-            ["VehAge", "DrivAge"]),
-        ("onehot_categorical", OneHotEncoder(),
-            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
-        ("passthrough_numeric", "passthrough",
-            ["BonusMalus"]),
-        ("log_scaled_numeric", log_scale_transformer,
-            ["Density"]),
+        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
+        (
+            "onehot_categorical",
+            OneHotEncoder(),
+            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
+        ),
+        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
+        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
     ],
     remainder="drop",
 )
@@ -263,8 +278,7 @@ def score_estimator(
 # on the training set via a quasi-Newton solver: l-BFGS. Some of the features
 # are collinear, we use a weak penalization to avoid numerical issues.
 glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
-glm_freq.fit(X_train, df_train["Frequency"],
-             sample_weight=df_train["Exposure"])
+glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"])
 
 scores = score_estimator(
     glm_freq,
@@ -306,7 +320,7 @@ def score_estimator(
     y_label="Claim Frequency",
     title="test data",
     ax=ax[0, 1],
-    fill_legend=True
+    fill_legend=True,
 )
 
 plot_obs_pred(
@@ -318,7 +332,7 @@ def score_estimator(
     y_label="Claim Frequency",
     title="test data",
     ax=ax[1, 0],
-    fill_legend=True
+    fill_legend=True,
 )
 
 plot_obs_pred(
@@ -330,7 +344,7 @@ def score_estimator(
     y_label="Claim Frequency",
     title="test data",
     ax=ax[1, 1],
-    fill_legend=True
+    fill_legend=True,
 )
 
 
@@ -356,7 +370,7 @@ def score_estimator(
 mask_train = df_train["ClaimAmount"] > 0
 mask_test = df_test["ClaimAmount"] > 0
 
-glm_sev = GammaRegressor(alpha=10., max_iter=10000)
+glm_sev = GammaRegressor(alpha=10.0, max_iter=10000)
 
 glm_sev.fit(
     X_train[mask_train.values],
@@ -385,12 +399,18 @@ def score_estimator(
 # such, it is conditional on having at least one claim, and cannot be used to
 # predict the average claim amount per policy in general.
 
-print("Mean AvgClaim Amount per policy:              %.2f "
-      % df_train["AvgClaimAmount"].mean())
-print("Mean AvgClaim Amount | NbClaim > 0:           %.2f"
-      % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean())
-print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f"
-      % glm_sev.predict(X_train).mean())
+print(
+    "Mean AvgClaim Amount per policy:              %.2f "
+    % df_train["AvgClaimAmount"].mean()
+)
+print(
+    "Mean AvgClaim Amount | NbClaim > 0:           %.2f"
+    % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()
+)
+print(
+    "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f"
+    % glm_sev.predict(X_train).mean()
+)
 
 
 # %%
@@ -419,7 +439,7 @@ def score_estimator(
     y_label="Average Claim Severity",
     title="test data",
     ax=ax[1],
-    fill_legend=True
+    fill_legend=True,
 )
 plt.tight_layout()
 
@@ -455,9 +475,10 @@ def score_estimator(
 # Ideally, we hope that one model will be consistently better than the other,
 # regardless of `power`.
 
-glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000)
-glm_pure_premium.fit(X_train, df_train["PurePremium"],
-                     sample_weight=df_train["Exposure"])
+glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000)
+glm_pure_premium.fit(
+    X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"]
+)
 
 tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]
 
@@ -480,15 +501,17 @@ def score_estimator(
     df_test,
     target="PurePremium",
     weights="Exposure",
-    tweedie_powers=tweedie_powers
+    tweedie_powers=tweedie_powers,
 )
 
-scores = pd.concat([scores_product_model, scores_glm_pure_premium],
-                   axis=1, sort=True,
-                   keys=('Product Model', 'TweedieRegressor'))
-print("Evaluation of the Product Model and the Tweedie Regressor "
-      "on target PurePremium")
-with pd.option_context('display.expand_frame_repr', False):
+scores = pd.concat(
+    [scores_product_model, scores_glm_pure_premium],
+    axis=1,
+    sort=True,
+    keys=("Product Model", "TweedieRegressor"),
+)
+print("Evaluation of the Product Model and the Tweedie Regressor on target PurePremium")
+with pd.option_context("display.expand_frame_repr", False):
     print(scores)
 
 # %%
@@ -515,8 +538,7 @@ def score_estimator(
                 exposure * glm_freq.predict(X) * glm_sev.predict(X)
             ),
             "predicted, tweedie, power=%.2f"
-            % glm_pure_premium.power: np.sum(
-                exposure * glm_pure_premium.predict(X)),
+            % glm_pure_premium.power: np.sum(exposure * glm_pure_premium.predict(X)),
         }
     )
 
@@ -567,30 +589,31 @@ def lorenz_curve(y_true, y_pred, exposure):
 y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)
 y_pred_total = glm_pure_premium.predict(X_test)
 
-for label, y_pred in [("Frequency * Severity model", y_pred_product),
-                      ("Compound Poisson Gamma", y_pred_total)]:
+for label, y_pred in [
+    ("Frequency * Severity model", y_pred_product),
+    ("Compound Poisson Gamma", y_pred_total),
+]:
     ordered_samples, cum_claims = lorenz_curve(
-        df_test["PurePremium"], y_pred, df_test["Exposure"])
+        df_test["PurePremium"], y_pred, df_test["Exposure"]
+    )
     gini = 1 - 2 * auc(ordered_samples, cum_claims)
     label += " (Gini index: {:.3f})".format(gini)
     ax.plot(ordered_samples, cum_claims, linestyle="-", label=label)
 
 # Oracle model: y_pred == y_test
 ordered_samples, cum_claims = lorenz_curve(
-    df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"])
+    df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]
+)
 gini = 1 - 2 * auc(ordered_samples, cum_claims)
 label = "Oracle (Gini index: {:.3f})".format(gini)
-ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray",
-        label=label)
+ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", label=label)
 
 # Random baseline
-ax.plot([0, 1], [0, 1], linestyle="--", color="black",
-        label="Random baseline")
+ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
 ax.set(
     title="Lorenz Curves",
-    xlabel=('Fraction of policyholders\n'
-            '(ordered by model from safest to riskiest)'),
-    ylabel='Fraction of total claim amount'
+    xlabel="Fraction of policyholders\n(ordered by model from safest to riskiest)",
+    ylabel="Fraction of total claim amount",
 )
 ax.legend(loc="upper left")
 plt.plot()
diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py
index c78ecc234186a..a8485f07bf150 100644
--- a/examples/manifold/plot_compare_methods.py
+++ b/examples/manifold/plot_compare_methods.py
@@ -43,31 +43,34 @@
 
 # Create figure
 fig = plt.figure(figsize=(15, 8))
-fig.suptitle("Manifold Learning with %i points, %i neighbors"
-             % (1000, n_neighbors), fontsize=14)
+fig.suptitle(
+    "Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14
+)
 
 # Add 3d scatter plot
-ax = fig.add_subplot(251, projection='3d')
+ax = fig.add_subplot(251, projection="3d")
 ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
 ax.view_init(4, -72)
 
 # Set-up manifold methods
-LLE = partial(manifold.LocallyLinearEmbedding,
-              n_neighbors=n_neighbors, n_components=n_components,
-              eigen_solver='auto')
+LLE = partial(
+    manifold.LocallyLinearEmbedding,
+    n_neighbors=n_neighbors,
+    n_components=n_components,
+    eigen_solver="auto",
+)
 
 methods = OrderedDict()
-methods['LLE'] = LLE(method='standard')
-methods['LTSA'] = LLE(method='ltsa')
-methods['Hessian LLE'] = LLE(method='hessian')
-methods['Modified LLE'] = LLE(method='modified')
-methods['Isomap'] = manifold.Isomap(n_neighbors=n_neighbors,
-                                    n_components=n_components)
-methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1)
-methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,
-                                           n_neighbors=n_neighbors)
-methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca',
-                                 random_state=0)
+methods["LLE"] = LLE(method="standard")
+methods["LTSA"] = LLE(method="ltsa")
+methods["Hessian LLE"] = LLE(method="hessian")
+methods["Modified LLE"] = LLE(method="modified")
+methods["Isomap"] = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
+methods["MDS"] = manifold.MDS(n_components, max_iter=100, n_init=1)
+methods["SE"] = manifold.SpectralEmbedding(
+    n_components=n_components, n_neighbors=n_neighbors
+)
+methods["t-SNE"] = manifold.TSNE(n_components=n_components, init="pca", random_state=0)
 
 # Plot results
 for i, (label, method) in enumerate(methods.items()):
@@ -80,6 +83,6 @@
     ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
     ax.xaxis.set_major_formatter(NullFormatter())
     ax.yaxis.set_major_formatter(NullFormatter())
-    ax.axis('tight')
+    ax.axis("tight")
 
 plt.show()
diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py
index fbc125fb8773f..41b0df181b344 100644
--- a/examples/manifold/plot_manifold_sphere.py
+++ b/examples/manifold/plot_manifold_sphere.py
@@ -55,32 +55,39 @@
 t = random_state.rand(n_samples) * np.pi
 
 # Sever the poles from the sphere.
-indices = ((t < (np.pi - (np.pi / 8))) & (t > ((np.pi / 8))))
+indices = (t < (np.pi - (np.pi / 8))) & (t > ((np.pi / 8)))
 colors = p[indices]
-x, y, z = np.sin(t[indices]) * np.cos(p[indices]), \
-    np.sin(t[indices]) * np.sin(p[indices]), \
-    np.cos(t[indices])
+x, y, z = (
+    np.sin(t[indices]) * np.cos(p[indices]),
+    np.sin(t[indices]) * np.sin(p[indices]),
+    np.cos(t[indices]),
+)
 
 # Plot our dataset.
 fig = plt.figure(figsize=(15, 8))
-plt.suptitle("Manifold Learning with %i points, %i neighbors"
-             % (1000, n_neighbors), fontsize=14)
+plt.suptitle(
+    "Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14
+)
 
-ax = fig.add_subplot(251, projection='3d')
+ax = fig.add_subplot(251, projection="3d")
 ax.scatter(x, y, z, c=p[indices], cmap=plt.cm.rainbow)
 ax.view_init(40, -10)
 
 sphere_data = np.array([x, y, z]).T
 
 # Perform Locally Linear Embedding Manifold learning
-methods = ['standard', 'ltsa', 'hessian', 'modified']
-labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']
+methods = ["standard", "ltsa", "hessian", "modified"]
+labels = ["LLE", "LTSA", "Hessian LLE", "Modified LLE"]
 
 for i, method in enumerate(methods):
     t0 = time()
-    trans_data = manifold.LocallyLinearEmbedding(
-        n_neighbors=n_neighbors, n_components=2,
-        method=method).fit_transform(sphere_data).T
+    trans_data = (
+        manifold.LocallyLinearEmbedding(
+            n_neighbors=n_neighbors, n_components=2, method=method
+        )
+        .fit_transform(sphere_data)
+        .T
+    )
     t1 = time()
     print("%s: %.2g sec" % (methods[i], t1 - t0))
 
@@ -89,21 +96,24 @@
     plt.title("%s (%.2g sec)" % (labels[i], t1 - t0))
     ax.xaxis.set_major_formatter(NullFormatter())
     ax.yaxis.set_major_formatter(NullFormatter())
-    plt.axis('tight')
+    plt.axis("tight")
 
 # Perform Isomap Manifold learning.
 t0 = time()
-trans_data = manifold.Isomap(n_neighbors=n_neighbors,
-                             n_components=2).fit_transform(sphere_data).T
+trans_data = (
+    manifold.Isomap(n_neighbors=n_neighbors, n_components=2)
+    .fit_transform(sphere_data)
+    .T
+)
 t1 = time()
-print("%s: %.2g sec" % ('ISO', t1 - t0))
+print("%s: %.2g sec" % ("ISO", t1 - t0))
 
 ax = fig.add_subplot(257)
 plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
-plt.title("%s (%.2g sec)" % ('Isomap', t1 - t0))
+plt.title("%s (%.2g sec)" % ("Isomap", t1 - t0))
 ax.xaxis.set_major_formatter(NullFormatter())
 ax.yaxis.set_major_formatter(NullFormatter())
-plt.axis('tight')
+plt.axis("tight")
 
 # Perform Multi-dimensional scaling.
 t0 = time()
@@ -117,12 +127,11 @@
 plt.title("MDS (%.2g sec)" % (t1 - t0))
 ax.xaxis.set_major_formatter(NullFormatter())
 ax.yaxis.set_major_formatter(NullFormatter())
-plt.axis('tight')
+plt.axis("tight")
 
 # Perform Spectral Embedding.
 t0 = time()
-se = manifold.SpectralEmbedding(n_components=2,
-                                n_neighbors=n_neighbors)
+se = manifold.SpectralEmbedding(n_components=2, n_neighbors=n_neighbors)
 trans_data = se.fit_transform(sphere_data).T
 t1 = time()
 print("Spectral Embedding: %.2g sec" % (t1 - t0))
@@ -132,11 +141,11 @@
 plt.title("Spectral Embedding (%.2g sec)" % (t1 - t0))
 ax.xaxis.set_major_formatter(NullFormatter())
 ax.yaxis.set_major_formatter(NullFormatter())
-plt.axis('tight')
+plt.axis("tight")
 
 # Perform t-distributed stochastic neighbor embedding.
 t0 = time()
-tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
+tsne = manifold.TSNE(n_components=2, init="pca", random_state=0)
 trans_data = tsne.fit_transform(sphere_data).T
 t1 = time()
 print("t-SNE: %.2g sec" % (t1 - t0))
@@ -146,6 +155,6 @@
 plt.title("t-SNE (%.2g sec)" % (t1 - t0))
 ax.xaxis.set_major_formatter(NullFormatter())
 ax.yaxis.set_major_formatter(NullFormatter())
-plt.axis('tight')
+plt.axis("tight")
 
 plt.show()
diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py
index aa2218a399a34..1605ae5b20164 100644
--- a/examples/manifold/plot_mds.py
+++ b/examples/manifold/plot_mds.py
@@ -38,13 +38,26 @@
 noise[np.arange(noise.shape[0]), np.arange(noise.shape[0])] = 0
 similarities += noise
 
-mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
-                   dissimilarity="precomputed", n_jobs=1)
+mds = manifold.MDS(
+    n_components=2,
+    max_iter=3000,
+    eps=1e-9,
+    random_state=seed,
+    dissimilarity="precomputed",
+    n_jobs=1,
+)
 pos = mds.fit(similarities).embedding_
 
-nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
-                    dissimilarity="precomputed", random_state=seed, n_jobs=1,
-                    n_init=1)
+nmds = manifold.MDS(
+    n_components=2,
+    metric=False,
+    max_iter=3000,
+    eps=1e-12,
+    dissimilarity="precomputed",
+    random_state=seed,
+    n_jobs=1,
+    n_init=1,
+)
 npos = nmds.fit_transform(similarities, init=pos)
 
 # Rescale the data
@@ -60,14 +73,13 @@
 npos = clf.fit_transform(npos)
 
 fig = plt.figure(1)
-ax = plt.axes([0., 0., 1., 1.])
+ax = plt.axes([0.0, 0.0, 1.0, 1.0])
 
 s = 100
-plt.scatter(X_true[:, 0], X_true[:, 1], color='navy', s=s, lw=0,
-            label='True Position')
-plt.scatter(pos[:, 0], pos[:, 1], color='turquoise', s=s, lw=0, label='MDS')
-plt.scatter(npos[:, 0], npos[:, 1], color='darkorange', s=s, lw=0, label='NMDS')
-plt.legend(scatterpoints=1, loc='best', shadow=False)
+plt.scatter(X_true[:, 0], X_true[:, 1], color="navy", s=s, lw=0, label="True Position")
+plt.scatter(pos[:, 0], pos[:, 1], color="turquoise", s=s, lw=0, label="MDS")
+plt.scatter(npos[:, 0], npos[:, 1], color="darkorange", s=s, lw=0, label="NMDS")
+plt.legend(scatterpoints=1, loc="best", shadow=False)
 
 similarities = similarities.max() / (similarities + EPSILON) * 100
 np.fill_diagonal(similarities, 0)
@@ -75,12 +87,13 @@
 start_idx, end_idx = np.where(pos)
 # a sequence of (*line0*, *line1*, *line2*), where::
 #            linen = (x0, y0), (x1, y1), ... (xm, ym)
-segments = [[X_true[i, :], X_true[j, :]]
-            for i in range(len(pos)) for j in range(len(pos))]
+segments = [
+    [X_true[i, :], X_true[j, :]] for i in range(len(pos)) for j in range(len(pos))
+]
 values = np.abs(similarities)
-lc = LineCollection(segments,
-                    zorder=0, cmap=plt.cm.Blues,
-                    norm=plt.Normalize(0, values.max()))
+lc = LineCollection(
+    segments, zorder=0, cmap=plt.cm.Blues, norm=plt.Normalize(0, values.max())
+)
 lc.set_array(similarities.flatten())
 lc.set_linewidths(np.full(len(segments), 0.5))
 ax.add_collection(lc)
diff --git a/examples/manifold/plot_swissroll.py b/examples/manifold/plot_swissroll.py
index d049658f8f775..3aa2088c22687 100644
--- a/examples/manifold/plot_swissroll.py
+++ b/examples/manifold/plot_swissroll.py
@@ -16,17 +16,18 @@
 
 # This import is needed to modify the way figure behaves
 from mpl_toolkits.mplot3d import Axes3D
+
 Axes3D
 
 # ----------------------------------------------------------------------
 # Locally linear embedding of the swiss roll
 
 from sklearn import manifold, datasets
+
 X, color = datasets.make_swiss_roll(n_samples=1500)
 
 print("Computing LLE embedding")
-X_r, err = manifold.locally_linear_embedding(X, n_neighbors=12,
-                                             n_components=2)
+X_r, err = manifold.locally_linear_embedding(X, n_neighbors=12, n_components=2)
 print("Done. Reconstruction error: %g" % err)
 
 # ----------------------------------------------------------------------
@@ -34,13 +35,13 @@
 
 fig = plt.figure()
 
-ax = fig.add_subplot(211, projection='3d')
+ax = fig.add_subplot(211, projection="3d")
 ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
 
 ax.set_title("Original data")
 ax = fig.add_subplot(212)
 ax.scatter(X_r[:, 0], X_r[:, 1], c=color, cmap=plt.cm.Spectral)
-plt.axis('tight')
+plt.axis("tight")
 plt.xticks([]), plt.yticks([])
-plt.title('Projected data')
+plt.title("Projected data")
 plt.show()
diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py
index dd7b4d1f21a09..04da2eb51acb1 100644
--- a/examples/manifold/plot_t_sne_perplexity.py
+++ b/examples/manifold/plot_t_sne_perplexity.py
@@ -40,7 +40,7 @@
 (fig, subplots) = plt.subplots(3, 5, figsize=(15, 8))
 perplexities = [5, 30, 50, 100]
 
-X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
+X, y = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
 
 red = y == 0
 green = y == 1
@@ -50,14 +50,15 @@
 ax.scatter(X[green, 0], X[green, 1], c="g")
 ax.xaxis.set_major_formatter(NullFormatter())
 ax.yaxis.set_major_formatter(NullFormatter())
-plt.axis('tight')
+plt.axis("tight")
 
 for i, perplexity in enumerate(perplexities):
     ax = subplots[0][i + 1]
 
     t0 = time()
-    tsne = manifold.TSNE(n_components=n_components, init='random',
-                         random_state=0, perplexity=perplexity)
+    tsne = manifold.TSNE(
+        n_components=n_components, init="random", random_state=0, perplexity=perplexity
+    )
     Y = tsne.fit_transform(X)
     t1 = time()
     print("circles, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))
@@ -66,7 +67,7 @@
     ax.scatter(Y[green, 0], Y[green, 1], c="g")
     ax.xaxis.set_major_formatter(NullFormatter())
     ax.yaxis.set_major_formatter(NullFormatter())
-    ax.axis('tight')
+    ax.axis("tight")
 
 # Another example using s-curve
 X, color = datasets.make_s_curve(n_samples, random_state=0)
@@ -80,8 +81,9 @@
     ax = subplots[1][i + 1]
 
     t0 = time()
-    tsne = manifold.TSNE(n_components=n_components, init='random',
-                         random_state=0, perplexity=perplexity)
+    tsne = manifold.TSNE(
+        n_components=n_components, init="random", random_state=0, perplexity=perplexity
+    )
     Y = tsne.fit_transform(X)
     t1 = time()
     print("S-curve, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))
@@ -90,16 +92,18 @@
     ax.scatter(Y[:, 0], Y[:, 1], c=color)
     ax.xaxis.set_major_formatter(NullFormatter())
     ax.yaxis.set_major_formatter(NullFormatter())
-    ax.axis('tight')
+    ax.axis("tight")
 
 
 # Another example using a 2D uniform grid
 x = np.linspace(0, 1, int(np.sqrt(n_samples)))
 xx, yy = np.meshgrid(x, x)
-X = np.hstack([
-    xx.ravel().reshape(-1, 1),
-    yy.ravel().reshape(-1, 1),
-])
+X = np.hstack(
+    [
+        xx.ravel().reshape(-1, 1),
+        yy.ravel().reshape(-1, 1),
+    ]
+)
 color = xx.ravel()
 ax = subplots[2][0]
 ax.scatter(X[:, 0], X[:, 1], c=color)
@@ -110,8 +114,9 @@
     ax = subplots[2][i + 1]
 
     t0 = time()
-    tsne = manifold.TSNE(n_components=n_components, init='random',
-                         random_state=0, perplexity=perplexity)
+    tsne = manifold.TSNE(
+        n_components=n_components, init="random", random_state=0, perplexity=perplexity
+    )
     Y = tsne.fit_transform(X)
     t1 = time()
     print("uniform grid, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))
@@ -120,7 +125,7 @@
     ax.scatter(Y[:, 0], Y[:, 1], c=color)
     ax.xaxis.set_major_formatter(NullFormatter())
     ax.yaxis.set_major_formatter(NullFormatter())
-    ax.axis('tight')
+    ax.axis("tight")
 
 
 plt.show()
diff --git a/examples/miscellaneous/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py
index c0c3a4f890923..924ab47fa81ca 100644
--- a/examples/miscellaneous/plot_anomaly_comparison.py
+++ b/examples/miscellaneous/plot_anomaly_comparison.py
@@ -82,7 +82,7 @@
 
 print(__doc__)
 
-matplotlib.rcParams['contour.negative_linestyle'] = 'solid'
+matplotlib.rcParams["contour.negative_linestyle"] = "solid"
 
 # Example settings
 n_samples = 300
@@ -95,46 +95,58 @@
 # to give similar results to the OneClassSVM
 anomaly_algorithms = [
     ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
-    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
-                                      gamma=0.1)),
-    ("One-Class SVM (SGD)", make_pipeline(
-        Nystroem(gamma=0.1, random_state=42, n_components=150),
-        SGDOneClassSVM(nu=outliers_fraction, shuffle=True,
-                       fit_intercept=True, random_state=42, tol=1e-6)
-    )),
-    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
-                                         random_state=42)),
-    ("Local Outlier Factor", LocalOutlierFactor(
-        n_neighbors=35, contamination=outliers_fraction))]
+    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
+    (
+        "One-Class SVM (SGD)",
+        make_pipeline(
+            Nystroem(gamma=0.1, random_state=42, n_components=150),
+            SGDOneClassSVM(
+                nu=outliers_fraction,
+                shuffle=True,
+                fit_intercept=True,
+                random_state=42,
+                tol=1e-6,
+            ),
+        ),
+    ),
+    (
+        "Isolation Forest",
+        IsolationForest(contamination=outliers_fraction, random_state=42),
+    ),
+    (
+        "Local Outlier Factor",
+        LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction),
+    ),
+]
 
 # Define datasets
 blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
 datasets = [
-    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
-               **blobs_params)[0],
-    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],
-               **blobs_params)[0],
-    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
-               **blobs_params)[0],
-    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
-          np.array([0.5, 0.25])),
-    14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)]
+    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0],
+    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params)[0],
+    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, 0.3], **blobs_params)[0],
+    4.0
+    * (
+        make_moons(n_samples=n_samples, noise=0.05, random_state=0)[0]
+        - np.array([0.5, 0.25])
+    ),
+    14.0 * (np.random.RandomState(42).rand(n_samples, 2) - 0.5),
+]
 
 # Compare given classifiers under given settings
-xx, yy = np.meshgrid(np.linspace(-7, 7, 150),
-                     np.linspace(-7, 7, 150))
+xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150))
 
 plt.figure(figsize=(len(anomaly_algorithms) * 2 + 4, 12.5))
-plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
-                    hspace=.01)
+plt.subplots_adjust(
+    left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01
+)
 
 plot_num = 1
 rng = np.random.RandomState(42)
 
 for i_dataset, X in enumerate(datasets):
     # Add outliers
-    X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))],
-                       axis=0)
+    X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], axis=0)
 
     for name, algorithm in anomaly_algorithms:
         t0 = time.time()
@@ -154,18 +166,23 @@
         if name != "Local Outlier Factor":  # LOF does not implement predict
             Z = algorithm.predict(np.c_[xx.ravel(), yy.ravel()])
             Z = Z.reshape(xx.shape)
-            plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
+            plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="black")
 
-        colors = np.array(['#377eb8', '#ff7f00'])
+        colors = np.array(["#377eb8", "#ff7f00"])
         plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred + 1) // 2])
 
         plt.xlim(-7, 7)
         plt.ylim(-7, 7)
         plt.xticks(())
         plt.yticks(())
-        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
-                 transform=plt.gca().transAxes, size=15,
-                 horizontalalignment='right')
+        plt.text(
+            0.99,
+            0.01,
+            ("%.2fs" % (t1 - t0)).lstrip("0"),
+            transform=plt.gca().transAxes,
+            size=15,
+            horizontalalignment="right",
+        )
         plot_num += 1
 
 plt.show()
diff --git a/examples/miscellaneous/plot_changed_only_pprint_parameter.py b/examples/miscellaneous/plot_changed_only_pprint_parameter.py
index a35471105b6c1..d27b17f3cc82d 100644
--- a/examples/miscellaneous/plot_changed_only_pprint_parameter.py
+++ b/examples/miscellaneous/plot_changed_only_pprint_parameter.py
@@ -15,8 +15,8 @@
 from sklearn import set_config
 
 
-lr = LogisticRegression(penalty='l1')
-print('Default representation:')
+lr = LogisticRegression(penalty="l1")
+print("Default representation:")
 print(lr)
 # LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
 #                    intercept_scaling=1, l1_ratio=None, max_iter=100,
@@ -25,6 +25,6 @@
 #                    warm_start=False)
 
 set_config(print_changed_only=True)
-print('\nWith changed_only option:')
+print("\nWith changed_only option:")
 print(lr)
 # LogisticRegression(penalty='l1')
diff --git a/examples/miscellaneous/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py
index a05f17fc9aba5..bb9d252fe830c 100644
--- a/examples/miscellaneous/plot_display_object_visualization.py
+++ b/examples/miscellaneous/plot_display_object_visualization.py
@@ -58,6 +58,7 @@
 # a decision function, we will use it to plot the roc curve:
 from sklearn.metrics import roc_curve
 from sklearn.metrics import RocCurveDisplay
+
 y_score = clf.decision_function(X_test)
 
 fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1])
@@ -71,8 +72,7 @@
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import PrecisionRecallDisplay
 
-prec, recall, _ = precision_recall_curve(y_test, y_score,
-                                         pos_label=clf.classes_[1])
+prec, recall, _ = precision_recall_curve(y_test, y_score, pos_label=clf.classes_[1])
 pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot()
 
 # %%
@@ -85,6 +85,7 @@
 
 # sphinx_gallery_thumbnail_number = 4
 import matplotlib.pyplot as plt
+
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
 
 roc_display.plot(ax=ax1)
diff --git a/examples/miscellaneous/plot_isotonic_regression.py b/examples/miscellaneous/plot_isotonic_regression.py
index 8848b1151e1fa..f75bc1144e881 100644
--- a/examples/miscellaneous/plot_isotonic_regression.py
+++ b/examples/miscellaneous/plot_isotonic_regression.py
@@ -35,7 +35,7 @@
 n = 100
 x = np.arange(n)
 rs = check_random_state(0)
-y = rs.randint(-50, 50, size=(n,)) + 50. * np.log1p(np.arange(n))
+y = rs.randint(-50, 50, size=(n,)) + 50.0 * np.log1p(np.arange(n))
 
 # %%
 # Fit IsotonicRegression and LinearRegression models:
@@ -56,16 +56,16 @@
 
 fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 6))
 
-ax0.plot(x, y, 'C0.', markersize=12)
-ax0.plot(x, y_, 'C1.-', markersize=12)
-ax0.plot(x, lr.predict(x[:, np.newaxis]), 'C2-')
+ax0.plot(x, y, "C0.", markersize=12)
+ax0.plot(x, y_, "C1.-", markersize=12)
+ax0.plot(x, lr.predict(x[:, np.newaxis]), "C2-")
 ax0.add_collection(lc)
-ax0.legend(('Training data', 'Isotonic fit', 'Linear fit'), loc='lower right')
-ax0.set_title('Isotonic regression fit on noisy data (n=%d)' % n)
+ax0.legend(("Training data", "Isotonic fit", "Linear fit"), loc="lower right")
+ax0.set_title("Isotonic regression fit on noisy data (n=%d)" % n)
 
 x_test = np.linspace(-10, 110, 1000)
-ax1.plot(x_test, ir.predict(x_test), 'C1-')
-ax1.plot(ir.X_thresholds_, ir.y_thresholds_, 'C1.', markersize=12)
+ax1.plot(x_test, ir.predict(x_test), "C1-")
+ax1.plot(ir.X_thresholds_, ir.y_thresholds_, "C1.", markersize=12)
 ax1.set_title("Prediction function (%d thresholds)" % len(ir.X_thresholds_))
 
 plt.show()
diff --git a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py
index 433f5a7f05d37..64815751efa36 100644
--- a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py
+++ b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py
@@ -27,10 +27,10 @@
 from sklearn.utils.fixes import parse_version
 
 # `normed` is being deprecated in favor of `density` in histograms
-if parse_version(matplotlib.__version__) >= parse_version('2.1'):
-    density_param = {'density': True}
+if parse_version(matplotlib.__version__) >= parse_version("2.1"):
+    density_param = {"density": True}
 else:
-    density_param = {'normed': True}
+    density_param = {"normed": True}
 
 # %%
 # Theoretical bounds
@@ -119,7 +119,7 @@
 # digits dataset, pass the ``--use-digits-dataset`` command line argument to
 # this script.
 
-if '--use-digits-dataset' in sys.argv:
+if "--use-digits-dataset" in sys.argv:
     data = load_digits().data[:500]
 else:
     data = fetch_20newsgroups_vectorized().data[:500]
@@ -133,8 +133,10 @@
 # - 1D histogram of the ratio of those distances (projected / original).
 
 n_samples, n_features = data.shape
-print("Embedding %d samples with dim %d using various random projections"
-      % (n_samples, n_features))
+print(
+    "Embedding %d samples with dim %d using various random projections"
+    % (n_samples, n_features)
+)
 
 n_components_range = np.array([300, 1000, 10000])
 dists = euclidean_distances(data, squared=True).ravel()
@@ -147,38 +149,41 @@
     t0 = time()
     rp = SparseRandomProjection(n_components=n_components)
     projected_data = rp.fit_transform(data)
-    print("Projected %d samples from %d to %d in %0.3fs"
-          % (n_samples, n_features, n_components, time() - t0))
-    if hasattr(rp, 'components_'):
+    print(
+        "Projected %d samples from %d to %d in %0.3fs"
+        % (n_samples, n_features, n_components, time() - t0)
+    )
+    if hasattr(rp, "components_"):
         n_bytes = rp.components_.data.nbytes
         n_bytes += rp.components_.indices.nbytes
         print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))
 
-    projected_dists = euclidean_distances(
-        projected_data, squared=True).ravel()[nonzero]
+    projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero]
 
     plt.figure()
     min_dist = min(projected_dists.min(), dists.min())
     max_dist = max(projected_dists.max(), dists.max())
-    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu,
-               extent=[min_dist, max_dist, min_dist, max_dist])
+    plt.hexbin(
+        dists,
+        projected_dists,
+        gridsize=100,
+        cmap=plt.cm.PuBu,
+        extent=[min_dist, max_dist, min_dist, max_dist],
+    )
     plt.xlabel("Pairwise squared distances in original space")
     plt.ylabel("Pairwise squared distances in projected space")
-    plt.title("Pairwise distances distribution for n_components=%d" %
-              n_components)
+    plt.title("Pairwise distances distribution for n_components=%d" % n_components)
     cb = plt.colorbar()
-    cb.set_label('Sample pairs counts')
+    cb.set_label("Sample pairs counts")
 
     rates = projected_dists / dists
-    print("Mean distances rate: %0.2f (%0.2f)"
-          % (np.mean(rates), np.std(rates)))
+    print("Mean distances rate: %0.2f (%0.2f)" % (np.mean(rates), np.std(rates)))
 
     plt.figure()
-    plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)
+    plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", **density_param)
     plt.xlabel("Squared distances rate: projected / original")
     plt.ylabel("Distribution of samples pairs")
-    plt.title("Histogram of pairwise distance rates for n_components=%d" %
-              n_components)
+    plt.title("Histogram of pairwise distance rates for n_components=%d" % n_components)
 
     # TODO: compute the expected value of eps and add them to the previous plot
     # as vertical lines / region
diff --git a/examples/miscellaneous/plot_kernel_approximation.py b/examples/miscellaneous/plot_kernel_approximation.py
index 80b4582a45670..ffd50e9dca06e 100644
--- a/examples/miscellaneous/plot_kernel_approximation.py
+++ b/examples/miscellaneous/plot_kernel_approximation.py
@@ -47,8 +47,7 @@
 
 # Import datasets, classifiers and performance metrics
 from sklearn import datasets, svm, pipeline
-from sklearn.kernel_approximation import (RBFSampler,
-                                          Nystroem)
+from sklearn.kernel_approximation import RBFSampler, Nystroem
 from sklearn.decomposition import PCA
 
 # The digits dataset
@@ -61,32 +60,32 @@
 # To apply an classifier on this data, we need to flatten the image, to
 # turn the data in a (samples, feature) matrix:
 n_samples = len(digits.data)
-data = digits.data / 16.
+data = digits.data / 16.0
 data -= data.mean(axis=0)
 
 # We learn the digits on the first half of the digits
-data_train, targets_train = (data[:n_samples // 2],
-                             digits.target[:n_samples // 2])
+data_train, targets_train = (data[: n_samples // 2], digits.target[: n_samples // 2])
 
 
 # Now predict the value of the digit on the second half:
-data_test, targets_test = (data[n_samples // 2:],
-                           digits.target[n_samples // 2:])
+data_test, targets_test = (data[n_samples // 2 :], digits.target[n_samples // 2 :])
 # data_test = scaler.transform(data_test)
 
 # Create a classifier: a support vector classifier
-kernel_svm = svm.SVC(gamma=.2)
+kernel_svm = svm.SVC(gamma=0.2)
 linear_svm = svm.LinearSVC()
 
 # create pipeline from kernel approximation
 # and linear svm
-feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
-feature_map_nystroem = Nystroem(gamma=.2, random_state=1)
-fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier),
-                                        ("svm", svm.LinearSVC())])
+feature_map_fourier = RBFSampler(gamma=0.2, random_state=1)
+feature_map_nystroem = Nystroem(gamma=0.2, random_state=1)
+fourier_approx_svm = pipeline.Pipeline(
+    [("feature_map", feature_map_fourier), ("svm", svm.LinearSVC())]
+)
 
-nystroem_approx_svm = pipeline.Pipeline([("feature_map", feature_map_nystroem),
-                                        ("svm", svm.LinearSVC())])
+nystroem_approx_svm = pipeline.Pipeline(
+    [("feature_map", feature_map_nystroem), ("svm", svm.LinearSVC())]
+)
 
 # fit and predict using linear and kernel svm:
 
@@ -129,23 +128,35 @@
 timescale = plt.subplot(122)
 
 accuracy.plot(sample_sizes, nystroem_scores, label="Nystroem approx. kernel")
-timescale.plot(sample_sizes, nystroem_times, '--',
-               label='Nystroem approx. kernel')
+timescale.plot(sample_sizes, nystroem_times, "--", label="Nystroem approx. kernel")
 
 accuracy.plot(sample_sizes, fourier_scores, label="Fourier approx. kernel")
-timescale.plot(sample_sizes, fourier_times, '--',
-               label='Fourier approx. kernel')
+timescale.plot(sample_sizes, fourier_times, "--", label="Fourier approx. kernel")
 
 # horizontal lines for exact rbf and linear kernels:
-accuracy.plot([sample_sizes[0], sample_sizes[-1]],
-              [linear_svm_score, linear_svm_score], label="linear svm")
-timescale.plot([sample_sizes[0], sample_sizes[-1]],
-               [linear_svm_time, linear_svm_time], '--', label='linear svm')
-
-accuracy.plot([sample_sizes[0], sample_sizes[-1]],
-              [kernel_svm_score, kernel_svm_score], label="rbf svm")
-timescale.plot([sample_sizes[0], sample_sizes[-1]],
-               [kernel_svm_time, kernel_svm_time], '--', label='rbf svm')
+accuracy.plot(
+    [sample_sizes[0], sample_sizes[-1]],
+    [linear_svm_score, linear_svm_score],
+    label="linear svm",
+)
+timescale.plot(
+    [sample_sizes[0], sample_sizes[-1]],
+    [linear_svm_time, linear_svm_time],
+    "--",
+    label="linear svm",
+)
+
+accuracy.plot(
+    [sample_sizes[0], sample_sizes[-1]],
+    [kernel_svm_score, kernel_svm_score],
+    label="rbf svm",
+)
+timescale.plot(
+    [sample_sizes[0], sample_sizes[-1]],
+    [kernel_svm_time, kernel_svm_time],
+    "--",
+    label="rbf svm",
+)
 
 # vertical line for dataset dimensionality = 64
 accuracy.plot([64, 64], [0.7, 1], label="n_features")
@@ -159,8 +170,8 @@
 timescale.set_xlabel("Sampling steps = transformed feature dimension")
 accuracy.set_ylabel("Classification accuracy")
 timescale.set_ylabel("Training time in seconds")
-accuracy.legend(loc='best')
-timescale.legend(loc='best')
+accuracy.legend(loc="best")
+timescale.legend(loc="best")
 plt.tight_layout()
 plt.show()
 
@@ -197,17 +208,16 @@
 flat_grid = grid.reshape(-1, data.shape[1])
 
 # title for the plots
-titles = ['SVC with rbf kernel',
-          'SVC (linear kernel)\n with Fourier rbf feature map\n'
-          'n_components=100',
-          'SVC (linear kernel)\n with Nystroem rbf feature map\n'
-          'n_components=100']
+titles = [
+    "SVC with rbf kernel",
+    "SVC (linear kernel)\n with Fourier rbf feature map\nn_components=100",
+    "SVC (linear kernel)\n with Nystroem rbf feature map\nn_components=100",
+]
 
 plt.figure(figsize=(18, 7.5))
-plt.rcParams.update({'font.size': 14})
+plt.rcParams.update({"font.size": 14})
 # predict and plot
-for i, clf in enumerate((kernel_svm, nystroem_approx_svm,
-                         fourier_approx_svm)):
+for i, clf in enumerate((kernel_svm, nystroem_approx_svm, fourier_approx_svm)):
     # Plot the decision boundary. For that, we will assign a color to each
     # point in the mesh [x_min, x_max]x[y_min, y_max].
     plt.subplot(1, 3, i + 1)
@@ -216,11 +226,12 @@
     # Put the result into a color plot
     Z = Z.reshape(grid.shape[:-1])
     plt.contourf(multiples, multiples, Z, cmap=plt.cm.Paired)
-    plt.axis('off')
+    plt.axis("off")
 
     # Plot also the training points
-    plt.scatter(X[:, 0], X[:, 1], c=targets_train, cmap=plt.cm.Paired,
-                edgecolors=(0, 0, 0))
+    plt.scatter(
+        X[:, 0], X[:, 1], c=targets_train, cmap=plt.cm.Paired, edgecolors=(0, 0, 0)
+    )
 
     plt.title(titles[i])
 plt.tight_layout()
diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py
index 1eb84d8fdac81..eaff3d91fd82d 100644
--- a/examples/miscellaneous/plot_kernel_ridge_regression.py
+++ b/examples/miscellaneous/plot_kernel_ridge_regression.py
@@ -60,25 +60,25 @@
 # #############################################################################
 # Fit regression model
 train_size = 100
-svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1),
-                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
-                               "gamma": np.logspace(-2, 2, 5)})
+svr = GridSearchCV(
+    SVR(kernel="rbf", gamma=0.1),
+    param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)},
+)
 
-kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1),
-                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
-                              "gamma": np.logspace(-2, 2, 5)})
+kr = GridSearchCV(
+    KernelRidge(kernel="rbf", gamma=0.1),
+    param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)},
+)
 
 t0 = time.time()
 svr.fit(X[:train_size], y[:train_size])
 svr_fit = time.time() - t0
-print("SVR complexity and bandwidth selected and model fitted in %.3f s"
-      % svr_fit)
+print("SVR complexity and bandwidth selected and model fitted in %.3f s" % svr_fit)
 
 t0 = time.time()
 kr.fit(X[:train_size], y[:train_size])
 kr_fit = time.time() - t0
-print("KRR complexity and bandwidth selected and model fitted in %.3f s"
-      % kr_fit)
+print("KRR complexity and bandwidth selected and model fitted in %.3f s" % kr_fit)
 
 sv_ratio = svr.best_estimator_.support_.shape[0] / train_size
 print("Support vector ratio: %.3f" % sv_ratio)
@@ -86,30 +86,39 @@
 t0 = time.time()
 y_svr = svr.predict(X_plot)
 svr_predict = time.time() - t0
-print("SVR prediction for %d inputs in %.3f s"
-      % (X_plot.shape[0], svr_predict))
+print("SVR prediction for %d inputs in %.3f s" % (X_plot.shape[0], svr_predict))
 
 t0 = time.time()
 y_kr = kr.predict(X_plot)
 kr_predict = time.time() - t0
-print("KRR prediction for %d inputs in %.3f s"
-      % (X_plot.shape[0], kr_predict))
+print("KRR prediction for %d inputs in %.3f s" % (X_plot.shape[0], kr_predict))
 
 
 # #############################################################################
 # Look at the results
 sv_ind = svr.best_estimator_.support_
-plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors',
-            zorder=2, edgecolors=(0, 0, 0))
-plt.scatter(X[:100], y[:100], c='k', label='data', zorder=1,
-            edgecolors=(0, 0, 0))
-plt.plot(X_plot, y_svr, c='r',
-         label='SVR (fit: %.3fs, predict: %.3fs)' % (svr_fit, svr_predict))
-plt.plot(X_plot, y_kr, c='g',
-         label='KRR (fit: %.3fs, predict: %.3fs)' % (kr_fit, kr_predict))
-plt.xlabel('data')
-plt.ylabel('target')
-plt.title('SVR versus Kernel Ridge')
+plt.scatter(
+    X[sv_ind],
+    y[sv_ind],
+    c="r",
+    s=50,
+    label="SVR support vectors",
+    zorder=2,
+    edgecolors=(0, 0, 0),
+)
+plt.scatter(X[:100], y[:100], c="k", label="data", zorder=1, edgecolors=(0, 0, 0))
+plt.plot(
+    X_plot,
+    y_svr,
+    c="r",
+    label="SVR (fit: %.3fs, predict: %.3fs)" % (svr_fit, svr_predict),
+)
+plt.plot(
+    X_plot, y_kr, c="g", label="KRR (fit: %.3fs, predict: %.3fs)" % (kr_fit, kr_predict)
+)
+plt.xlabel("data")
+plt.ylabel("target")
+plt.title("SVR versus Kernel Ridge")
 plt.legend()
 
 # Visualize training and prediction time
@@ -120,9 +129,10 @@
 y = np.sin(X).ravel()
 y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))
 sizes = np.logspace(1, 4, 7).astype(int)
-for name, estimator in {"KRR": KernelRidge(kernel='rbf', alpha=0.1,
-                                           gamma=10),
-                        "SVR": SVR(kernel='rbf', C=1e1, gamma=10)}.items():
+for name, estimator in {
+    "KRR": KernelRidge(kernel="rbf", alpha=0.1, gamma=10),
+    "SVR": SVR(kernel="rbf", C=1e1, gamma=10),
+}.items():
     train_time = []
     test_time = []
     for train_test_size in sizes:
@@ -134,37 +144,55 @@
         estimator.predict(X_plot[:1000])
         test_time.append(time.time() - t0)
 
-    plt.plot(sizes, train_time, 'o-', color="r" if name == "SVR" else "g",
-             label="%s (train)" % name)
-    plt.plot(sizes, test_time, 'o--', color="r" if name == "SVR" else "g",
-             label="%s (test)" % name)
+    plt.plot(
+        sizes,
+        train_time,
+        "o-",
+        color="r" if name == "SVR" else "g",
+        label="%s (train)" % name,
+    )
+    plt.plot(
+        sizes,
+        test_time,
+        "o--",
+        color="r" if name == "SVR" else "g",
+        label="%s (test)" % name,
+    )
 
 plt.xscale("log")
 plt.yscale("log")
 plt.xlabel("Train size")
 plt.ylabel("Time (seconds)")
-plt.title('Execution Time')
+plt.title("Execution Time")
 plt.legend(loc="best")
 
 # Visualize learning curves
 plt.figure()
 
-svr = SVR(kernel='rbf', C=1e1, gamma=0.1)
-kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1)
-train_sizes, train_scores_svr, test_scores_svr = \
-    learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
-                   scoring="neg_mean_squared_error", cv=10)
-train_sizes_abs, train_scores_kr, test_scores_kr = \
-    learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
-                   scoring="neg_mean_squared_error", cv=10)
-
-plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r",
-         label="SVR")
-plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g",
-         label="KRR")
+svr = SVR(kernel="rbf", C=1e1, gamma=0.1)
+kr = KernelRidge(kernel="rbf", alpha=0.1, gamma=0.1)
+train_sizes, train_scores_svr, test_scores_svr = learning_curve(
+    svr,
+    X[:100],
+    y[:100],
+    train_sizes=np.linspace(0.1, 1, 10),
+    scoring="neg_mean_squared_error",
+    cv=10,
+)
+train_sizes_abs, train_scores_kr, test_scores_kr = learning_curve(
+    kr,
+    X[:100],
+    y[:100],
+    train_sizes=np.linspace(0.1, 1, 10),
+    scoring="neg_mean_squared_error",
+    cv=10,
+)
+
+plt.plot(train_sizes, -test_scores_svr.mean(1), "o-", color="r", label="SVR")
+plt.plot(train_sizes, -test_scores_kr.mean(1), "o-", color="g", label="KRR")
 plt.xlabel("Train size")
 plt.ylabel("Mean Squared Error")
-plt.title('Learning curves')
+plt.title("Learning curves")
 plt.legend(loc="best")
 
 plt.show()
diff --git a/examples/miscellaneous/plot_multilabel.py b/examples/miscellaneous/plot_multilabel.py
index 828ca17fafa23..4958084b9dbd7 100644
--- a/examples/miscellaneous/plot_multilabel.py
+++ b/examples/miscellaneous/plot_multilabel.py
@@ -64,7 +64,7 @@ def plot_subfigure(X, Y, subplot, title, transform):
     min_y = np.min(X[:, 1])
     max_y = np.max(X[:, 1])
 
-    classif = OneVsRestClassifier(SVC(kernel='linear'))
+    classif = OneVsRestClassifier(SVC(kernel="linear"))
     classif.fit(X, Y)
 
     plt.subplot(2, 2, subplot)
@@ -72,42 +72,58 @@ def plot_subfigure(X, Y, subplot, title, transform):
 
     zero_class = np.where(Y[:, 0])
     one_class = np.where(Y[:, 1])
-    plt.scatter(X[:, 0], X[:, 1], s=40, c='gray', edgecolors=(0, 0, 0))
-    plt.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b',
-                facecolors='none', linewidths=2, label='Class 1')
-    plt.scatter(X[one_class, 0], X[one_class, 1], s=80, edgecolors='orange',
-                facecolors='none', linewidths=2, label='Class 2')
-
-    plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--',
-                    'Boundary\nfor class 1')
-    plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.',
-                    'Boundary\nfor class 2')
+    plt.scatter(X[:, 0], X[:, 1], s=40, c="gray", edgecolors=(0, 0, 0))
+    plt.scatter(
+        X[zero_class, 0],
+        X[zero_class, 1],
+        s=160,
+        edgecolors="b",
+        facecolors="none",
+        linewidths=2,
+        label="Class 1",
+    )
+    plt.scatter(
+        X[one_class, 0],
+        X[one_class, 1],
+        s=80,
+        edgecolors="orange",
+        facecolors="none",
+        linewidths=2,
+        label="Class 2",
+    )
+
+    plot_hyperplane(
+        classif.estimators_[0], min_x, max_x, "k--", "Boundary\nfor class 1"
+    )
+    plot_hyperplane(
+        classif.estimators_[1], min_x, max_x, "k-.", "Boundary\nfor class 2"
+    )
     plt.xticks(())
     plt.yticks(())
 
-    plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x)
-    plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y)
+    plt.xlim(min_x - 0.5 * max_x, max_x + 0.5 * max_x)
+    plt.ylim(min_y - 0.5 * max_y, max_y + 0.5 * max_y)
     if subplot == 2:
-        plt.xlabel('First principal component')
-        plt.ylabel('Second principal component')
+        plt.xlabel("First principal component")
+        plt.ylabel("Second principal component")
         plt.legend(loc="upper left")
 
 
 plt.figure(figsize=(8, 6))
 
-X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
-                                      allow_unlabeled=True,
-                                      random_state=1)
+X, Y = make_multilabel_classification(
+    n_classes=2, n_labels=1, allow_unlabeled=True, random_state=1
+)
 
 plot_subfigure(X, Y, 1, "With unlabeled samples + CCA", "cca")
 plot_subfigure(X, Y, 2, "With unlabeled samples + PCA", "pca")
 
-X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
-                                      allow_unlabeled=False,
-                                      random_state=1)
+X, Y = make_multilabel_classification(
+    n_classes=2, n_labels=1, allow_unlabeled=False, random_state=1
+)
 
 plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca")
 plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca")
 
-plt.subplots_adjust(.04, .02, .97, .94, .09, .2)
+plt.subplots_adjust(0.04, 0.02, 0.97, 0.94, 0.09, 0.2)
 plt.show()
diff --git a/examples/miscellaneous/plot_multioutput_face_completion.py b/examples/miscellaneous/plot_multioutput_face_completion.py
index 62fd20d24645f..63b3bea4175ba 100644
--- a/examples/miscellaneous/plot_multioutput_face_completion.py
+++ b/examples/miscellaneous/plot_multioutput_face_completion.py
@@ -33,21 +33,22 @@
 # Test on a subset of people
 n_faces = 5
 rng = check_random_state(4)
-face_ids = rng.randint(test.shape[0], size=(n_faces, ))
+face_ids = rng.randint(test.shape[0], size=(n_faces,))
 test = test[face_ids, :]
 
 n_pixels = data.shape[1]
 # Upper half of the faces
-X_train = train[:, :(n_pixels + 1) // 2]
+X_train = train[:, : (n_pixels + 1) // 2]
 # Lower half of the faces
-y_train = train[:, n_pixels // 2:]
-X_test = test[:, :(n_pixels + 1) // 2]
-y_test = test[:, n_pixels // 2:]
+y_train = train[:, n_pixels // 2 :]
+X_test = test[:, : (n_pixels + 1) // 2]
+y_test = test[:, n_pixels // 2 :]
 
 # Fit estimators
 ESTIMATORS = {
-    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
-                                       random_state=0),
+    "Extra trees": ExtraTreesRegressor(
+        n_estimators=10, max_features=32, random_state=0
+    ),
     "K-nn": KNeighborsRegressor(),
     "Linear regression": LinearRegression(),
     "Ridge": RidgeCV(),
@@ -62,7 +63,7 @@
 image_shape = (64, 64)
 
 n_cols = 1 + len(ESTIMATORS)
-plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
+plt.figure(figsize=(2.0 * n_cols, 2.26 * n_faces))
 plt.suptitle("Face completion with multi-output estimators", size=16)
 
 for i in range(n_faces):
@@ -71,13 +72,12 @@
     if i:
         sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)
     else:
-        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,
-                          title="true faces")
+        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1, title="true faces")
 
     sub.axis("off")
-    sub.imshow(true_face.reshape(image_shape),
-               cmap=plt.cm.gray,
-               interpolation="nearest")
+    sub.imshow(
+        true_face.reshape(image_shape), cmap=plt.cm.gray, interpolation="nearest"
+    )
 
     for j, est in enumerate(sorted(ESTIMATORS)):
         completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
@@ -86,12 +86,13 @@
             sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)
 
         else:
-            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,
-                              title=est)
+            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j, title=est)
 
         sub.axis("off")
-        sub.imshow(completed_face.reshape(image_shape),
-                   cmap=plt.cm.gray,
-                   interpolation="nearest")
+        sub.imshow(
+            completed_face.reshape(image_shape),
+            cmap=plt.cm.gray,
+            interpolation="nearest",
+        )
 
 plt.show()
diff --git a/examples/miscellaneous/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
index 342ba14a338b1..5f6e61a89c4fe 100644
--- a/examples/miscellaneous/plot_partial_dependence_visualization_api.py
+++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
@@ -37,9 +37,10 @@
 y = diabetes.target
 
 tree = DecisionTreeRegressor()
-mlp = make_pipeline(StandardScaler(),
-                    MLPRegressor(hidden_layer_sizes=(100, 100),
-                                 tol=1e-2, max_iter=500, random_state=0))
+mlp = make_pipeline(
+    StandardScaler(),
+    MLPRegressor(hidden_layer_sizes=(100, 100), tol=1e-2, max_iter=500, random_state=0),
+)
 tree.fit(X, y)
 mlp.fit(X, y)
 
@@ -63,8 +64,9 @@
 # color of the curve.
 fig, ax = plt.subplots(figsize=(12, 6))
 ax.set_title("Multi-layer Perceptron")
-mlp_disp = PartialDependenceDisplay.from_estimator(mlp, X, ["age", "bmi"], ax=ax,
-                                                   line_kw={"color": "red"})
+mlp_disp = PartialDependenceDisplay.from_estimator(
+    mlp, X, ["age", "bmi"], ax=ax, line_kw={"color": "red"}
+)
 
 # %%
 # Plotting partial dependence of the two models together
@@ -101,8 +103,9 @@
 # sphinx_gallery_thumbnail_number = 4
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
 tree_disp.plot(ax=[ax1, ax2], line_kw={"label": "Decision Tree"})
-mlp_disp.plot(ax=[ax1, ax2], line_kw={"label": "Multi-layer Perceptron",
-                                      "color": "red"})
+mlp_disp.plot(
+    ax=[ax1, ax2], line_kw={"label": "Multi-layer Perceptron", "color": "red"}
+)
 ax1.legend()
 ax2.legend()
 
@@ -115,8 +118,9 @@
 # `plot` will only show the y label and y ticks on the left most plot.
 
 tree_disp.plot(line_kw={"label": "Decision Tree"})
-mlp_disp.plot(line_kw={"label": "Multi-layer Perceptron", "color": "red"},
-              ax=tree_disp.axes_)
+mlp_disp.plot(
+    line_kw={"label": "Multi-layer Perceptron", "color": "red"}, ax=tree_disp.axes_
+)
 tree_disp.figure_.set_size_inches(10, 6)
 tree_disp.axes_[0, 0].legend()
 tree_disp.axes_[0, 1].legend()
@@ -131,4 +135,5 @@
 # plot function.
 tree_disp = PartialDependenceDisplay.from_estimator(tree, X, ["age"])
 mlp_disp = PartialDependenceDisplay.from_estimator(
-    mlp, X, ["age"], ax=tree_disp.axes_, line_kw={"color": "red"})
+    mlp, X, ["age"], ax=tree_disp.axes_, line_kw={"color": "red"}
+)
diff --git a/examples/mixture/plot_concentration_prior.py b/examples/mixture/plot_concentration_prior.py
index 6da379d118436..65830b6dc1182 100644
--- a/examples/mixture/plot_concentration_prior.py
+++ b/examples/mixture/plot_concentration_prior.py
@@ -49,89 +49,116 @@ def plot_ellipses(ax, weights, means, covars):
         angle = 180 * angle / np.pi
         # eigenvector normalization
         eig_vals = 2 * np.sqrt(2) * np.sqrt(eig_vals)
-        ell = mpl.patches.Ellipse(means[n], eig_vals[0], eig_vals[1],
-                                  180 + angle, edgecolor='black')
+        ell = mpl.patches.Ellipse(
+            means[n], eig_vals[0], eig_vals[1], 180 + angle, edgecolor="black"
+        )
         ell.set_clip_box(ax.bbox)
         ell.set_alpha(weights[n])
-        ell.set_facecolor('#56B4E9')
+        ell.set_facecolor("#56B4E9")
         ax.add_artist(ell)
 
 
 def plot_results(ax1, ax2, estimator, X, y, title, plot_title=False):
     ax1.set_title(title)
-    ax1.scatter(X[:, 0], X[:, 1], s=5, marker='o', color=colors[y], alpha=0.8)
-    ax1.set_xlim(-2., 2.)
-    ax1.set_ylim(-3., 3.)
+    ax1.scatter(X[:, 0], X[:, 1], s=5, marker="o", color=colors[y], alpha=0.8)
+    ax1.set_xlim(-2.0, 2.0)
+    ax1.set_ylim(-3.0, 3.0)
     ax1.set_xticks(())
     ax1.set_yticks(())
-    plot_ellipses(ax1, estimator.weights_, estimator.means_,
-                  estimator.covariances_)
+    plot_ellipses(ax1, estimator.weights_, estimator.means_, estimator.covariances_)
 
-    ax2.get_xaxis().set_tick_params(direction='out')
+    ax2.get_xaxis().set_tick_params(direction="out")
     ax2.yaxis.grid(True, alpha=0.7)
     for k, w in enumerate(estimator.weights_):
-        ax2.bar(k, w, width=0.9, color='#56B4E9', zorder=3,
-                align='center', edgecolor='black')
-        ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.),
-                 horizontalalignment='center')
-    ax2.set_xlim(-.6, 2 * n_components - .4)
-    ax2.set_ylim(0., 1.1)
-    ax2.tick_params(axis='y', which='both', left=False,
-                    right=False, labelleft=False)
-    ax2.tick_params(axis='x', which='both', top=False)
+        ax2.bar(
+            k,
+            w,
+            width=0.9,
+            color="#56B4E9",
+            zorder=3,
+            align="center",
+            edgecolor="black",
+        )
+        ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.0), horizontalalignment="center")
+    ax2.set_xlim(-0.6, 2 * n_components - 0.4)
+    ax2.set_ylim(0.0, 1.1)
+    ax2.tick_params(axis="y", which="both", left=False, right=False, labelleft=False)
+    ax2.tick_params(axis="x", which="both", top=False)
 
     if plot_title:
-        ax1.set_ylabel('Estimated Mixtures')
-        ax2.set_ylabel('Weight of each component')
+        ax1.set_ylabel("Estimated Mixtures")
+        ax2.set_ylabel("Weight of each component")
 
 
 # Parameters of the dataset
 random_state, n_components, n_features = 2, 3, 2
-colors = np.array(['#0072B2', '#F0E442', '#D55E00'])
+colors = np.array(["#0072B2", "#F0E442", "#D55E00"])
 
-covars = np.array([[[.7, .0], [.0, .1]],
-                   [[.5, .0], [.0, .1]],
-                   [[.5, .0], [.0, .1]]])
+covars = np.array(
+    [[[0.7, 0.0], [0.0, 0.1]], [[0.5, 0.0], [0.0, 0.1]], [[0.5, 0.0], [0.0, 0.1]]]
+)
 samples = np.array([200, 500, 200])
-means = np.array([[.0, -.70],
-                  [.0, .0],
-                  [.0, .70]])
+means = np.array([[0.0, -0.70], [0.0, 0.0], [0.0, 0.70]])
 
 # mean_precision_prior= 0.8 to minimize the influence of the prior
 estimators = [
-    ("Finite mixture with a Dirichlet distribution\nprior and "
-     r"$\gamma_0=$", BayesianGaussianMixture(
-         weight_concentration_prior_type="dirichlet_distribution",
-         n_components=2 * n_components, reg_covar=0, init_params='random',
-         max_iter=1500, mean_precision_prior=.8,
-         random_state=random_state), [0.001, 1, 1000]),
-    ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
-     BayesianGaussianMixture(
-         weight_concentration_prior_type="dirichlet_process",
-         n_components=2 * n_components, reg_covar=0, init_params='random',
-         max_iter=1500, mean_precision_prior=.8,
-         random_state=random_state), [1, 1000, 100000])]
+    (
+        "Finite mixture with a Dirichlet distribution\nprior and " r"$\gamma_0=$",
+        BayesianGaussianMixture(
+            weight_concentration_prior_type="dirichlet_distribution",
+            n_components=2 * n_components,
+            reg_covar=0,
+            init_params="random",
+            max_iter=1500,
+            mean_precision_prior=0.8,
+            random_state=random_state,
+        ),
+        [0.001, 1, 1000],
+    ),
+    (
+        "Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
+        BayesianGaussianMixture(
+            weight_concentration_prior_type="dirichlet_process",
+            n_components=2 * n_components,
+            reg_covar=0,
+            init_params="random",
+            max_iter=1500,
+            mean_precision_prior=0.8,
+            random_state=random_state,
+        ),
+        [1, 1000, 100000],
+    ),
+]
 
 # Generate data
 rng = np.random.RandomState(random_state)
-X = np.vstack([
-    rng.multivariate_normal(means[j], covars[j], samples[j])
-    for j in range(n_components)])
-y = np.concatenate([np.full(samples[j], j, dtype=int)
-                    for j in range(n_components)])
+X = np.vstack(
+    [
+        rng.multivariate_normal(means[j], covars[j], samples[j])
+        for j in range(n_components)
+    ]
+)
+y = np.concatenate([np.full(samples[j], j, dtype=int) for j in range(n_components)])
 
 # Plot results in two different figures
 for (title, estimator, concentrations_prior) in estimators:
     plt.figure(figsize=(4.7 * 3, 8))
-    plt.subplots_adjust(bottom=.04, top=0.90, hspace=.05, wspace=.05,
-                        left=.03, right=.99)
+    plt.subplots_adjust(
+        bottom=0.04, top=0.90, hspace=0.05, wspace=0.05, left=0.03, right=0.99
+    )
 
     gs = gridspec.GridSpec(3, len(concentrations_prior))
     for k, concentration in enumerate(concentrations_prior):
         estimator.weight_concentration_prior = concentration
         estimator.fit(X)
-        plot_results(plt.subplot(gs[0:2, k]), plt.subplot(gs[2, k]), estimator,
-                     X, y, r"%s$%.1e$" % (title, concentration),
-                     plot_title=k == 0)
+        plot_results(
+            plt.subplot(gs[0:2, k]),
+            plt.subplot(gs[2, k]),
+            estimator,
+            X,
+            y,
+            r"%s$%.1e$" % (title, concentration),
+            plot_title=k == 0,
+        )
 
 plt.show()
diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py
index 5f2f8596d4bbe..d3548eb6bed83 100644
--- a/examples/mixture/plot_gmm.py
+++ b/examples/mixture/plot_gmm.py
@@ -32,34 +32,32 @@
 
 from sklearn import mixture
 
-color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold',
-                              'darkorange'])
+color_iter = itertools.cycle(["navy", "c", "cornflowerblue", "gold", "darkorange"])
 
 
 def plot_results(X, Y_, means, covariances, index, title):
     splot = plt.subplot(2, 1, 1 + index)
-    for i, (mean, covar, color) in enumerate(zip(
-            means, covariances, color_iter)):
+    for i, (mean, covar, color) in enumerate(zip(means, covariances, color_iter)):
         v, w = linalg.eigh(covar)
-        v = 2. * np.sqrt(2.) * np.sqrt(v)
+        v = 2.0 * np.sqrt(2.0) * np.sqrt(v)
         u = w[0] / linalg.norm(w[0])
         # as the DP will not use every component it has access to
         # unless it needs it, we shouldn't plot the redundant
         # components.
         if not np.any(Y_ == i):
             continue
-        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
+        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], 0.8, color=color)
 
         # Plot an ellipse to show the Gaussian component
         angle = np.arctan(u[1] / u[0])
-        angle = 180. * angle / np.pi  # convert to degrees
-        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
+        angle = 180.0 * angle / np.pi  # convert to degrees
+        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180.0 + angle, color=color)
         ell.set_clip_box(splot.bbox)
         ell.set_alpha(0.5)
         splot.add_artist(ell)
 
-    plt.xlim(-9., 5.)
-    plt.ylim(-3., 6.)
+    plt.xlim(-9.0, 5.0)
+    plt.ylim(-3.0, 6.0)
     plt.xticks(())
     plt.yticks(())
     plt.title(title)
@@ -70,19 +68,25 @@ def plot_results(X, Y_, means, covariances, index, title):
 
 # Generate random sample, two components
 np.random.seed(0)
-C = np.array([[0., -0.1], [1.7, .4]])
-X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
-          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]
+C = np.array([[0.0, -0.1], [1.7, 0.4]])
+X = np.r_[
+    np.dot(np.random.randn(n_samples, 2), C),
+    0.7 * np.random.randn(n_samples, 2) + np.array([-6, 3]),
+]
 
 # Fit a Gaussian mixture with EM using five components
-gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X)
-plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0,
-             'Gaussian Mixture')
+gmm = mixture.GaussianMixture(n_components=5, covariance_type="full").fit(X)
+plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, "Gaussian Mixture")
 
 # Fit a Dirichlet process Gaussian mixture using five components
-dpgmm = mixture.BayesianGaussianMixture(n_components=5,
-                                        covariance_type='full').fit(X)
-plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
-             'Bayesian Gaussian Mixture with a Dirichlet process prior')
+dpgmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type="full").fit(X)
+plot_results(
+    X,
+    dpgmm.predict(X),
+    dpgmm.means_,
+    dpgmm.covariances_,
+    1,
+    "Bayesian Gaussian Mixture with a Dirichlet process prior",
+)
 
 plt.show()
diff --git a/examples/mixture/plot_gmm_covariances.py b/examples/mixture/plot_gmm_covariances.py
index f59807a971a08..123c9846156a1 100644
--- a/examples/mixture/plot_gmm_covariances.py
+++ b/examples/mixture/plot_gmm_covariances.py
@@ -41,30 +41,31 @@
 
 print(__doc__)
 
-colors = ['navy', 'turquoise', 'darkorange']
+colors = ["navy", "turquoise", "darkorange"]
 
 
 def make_ellipses(gmm, ax):
     for n, color in enumerate(colors):
-        if gmm.covariance_type == 'full':
+        if gmm.covariance_type == "full":
             covariances = gmm.covariances_[n][:2, :2]
-        elif gmm.covariance_type == 'tied':
+        elif gmm.covariance_type == "tied":
             covariances = gmm.covariances_[:2, :2]
-        elif gmm.covariance_type == 'diag':
+        elif gmm.covariance_type == "diag":
             covariances = np.diag(gmm.covariances_[n][:2])
-        elif gmm.covariance_type == 'spherical':
+        elif gmm.covariance_type == "spherical":
             covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
         v, w = np.linalg.eigh(covariances)
         u = w[0] / np.linalg.norm(w[0])
         angle = np.arctan2(u[1], u[0])
         angle = 180 * angle / np.pi  # convert to degrees
-        v = 2. * np.sqrt(2.) * np.sqrt(v)
-        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
-                                  180 + angle, color=color)
+        v = 2.0 * np.sqrt(2.0) * np.sqrt(v)
+        ell = mpl.patches.Ellipse(
+            gmm.means_[n, :2], v[0], v[1], 180 + angle, color=color
+        )
         ell.set_clip_box(ax.bbox)
         ell.set_alpha(0.5)
         ax.add_artist(ell)
-        ax.set_aspect('equal', 'datalim')
+        ax.set_aspect("equal", "datalim")
 
 
 iris = datasets.load_iris()
@@ -84,22 +85,27 @@ def make_ellipses(gmm, ax):
 n_classes = len(np.unique(y_train))
 
 # Try GMMs using different types of covariances.
-estimators = {cov_type: GaussianMixture(n_components=n_classes,
-              covariance_type=cov_type, max_iter=20, random_state=0)
-              for cov_type in ['spherical', 'diag', 'tied', 'full']}
+estimators = {
+    cov_type: GaussianMixture(
+        n_components=n_classes, covariance_type=cov_type, max_iter=20, random_state=0
+    )
+    for cov_type in ["spherical", "diag", "tied", "full"]
+}
 
 n_estimators = len(estimators)
 
 plt.figure(figsize=(3 * n_estimators // 2, 6))
-plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
-                    left=.01, right=.99)
+plt.subplots_adjust(
+    bottom=0.01, top=0.95, hspace=0.15, wspace=0.05, left=0.01, right=0.99
+)
 
 
 for index, (name, estimator) in enumerate(estimators.items()):
     # Since we have class labels for the training data, we can
     # initialize the GMM parameters in a supervised manner.
-    estimator.means_init = np.array([X_train[y_train == i].mean(axis=0)
-                                    for i in range(n_classes)])
+    estimator.means_init = np.array(
+        [X_train[y_train == i].mean(axis=0) for i in range(n_classes)]
+    )
 
     # Train the other parameters using the EM algorithm.
     estimator.fit(X_train)
@@ -109,28 +115,27 @@ def make_ellipses(gmm, ax):
 
     for n, color in enumerate(colors):
         data = iris.data[iris.target == n]
-        plt.scatter(data[:, 0], data[:, 1], s=0.8, color=color,
-                    label=iris.target_names[n])
+        plt.scatter(
+            data[:, 0], data[:, 1], s=0.8, color=color, label=iris.target_names[n]
+        )
     # Plot the test data with crosses
     for n, color in enumerate(colors):
         data = X_test[y_test == n]
-        plt.scatter(data[:, 0], data[:, 1], marker='x', color=color)
+        plt.scatter(data[:, 0], data[:, 1], marker="x", color=color)
 
     y_train_pred = estimator.predict(X_train)
     train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
-    plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy,
-             transform=h.transAxes)
+    plt.text(0.05, 0.9, "Train accuracy: %.1f" % train_accuracy, transform=h.transAxes)
 
     y_test_pred = estimator.predict(X_test)
     test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
-    plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy,
-             transform=h.transAxes)
+    plt.text(0.05, 0.8, "Test accuracy: %.1f" % test_accuracy, transform=h.transAxes)
 
     plt.xticks(())
     plt.yticks(())
     plt.title(name)
 
-plt.legend(scatterpoints=1, loc='lower right', prop=dict(size=12))
+plt.legend(scatterpoints=1, loc="lower right", prop=dict(size=12))
 
 
 plt.show()
diff --git a/examples/mixture/plot_gmm_pdf.py b/examples/mixture/plot_gmm_pdf.py
index 4469c36a89625..73c46cd5e7fb1 100644
--- a/examples/mixture/plot_gmm_pdf.py
+++ b/examples/mixture/plot_gmm_pdf.py
@@ -22,29 +22,30 @@
 shifted_gaussian = np.random.randn(n_samples, 2) + np.array([20, 20])
 
 # generate zero centered stretched Gaussian data
-C = np.array([[0., -0.7], [3.5, .7]])
+C = np.array([[0.0, -0.7], [3.5, 0.7]])
 stretched_gaussian = np.dot(np.random.randn(n_samples, 2), C)
 
 # concatenate the two datasets into the final training set
 X_train = np.vstack([shifted_gaussian, stretched_gaussian])
 
 # fit a Gaussian Mixture Model with two components
-clf = mixture.GaussianMixture(n_components=2, covariance_type='full')
+clf = mixture.GaussianMixture(n_components=2, covariance_type="full")
 clf.fit(X_train)
 
 # display predicted scores by the model as a contour plot
-x = np.linspace(-20., 30.)
-y = np.linspace(-20., 40.)
+x = np.linspace(-20.0, 30.0)
+y = np.linspace(-20.0, 40.0)
 X, Y = np.meshgrid(x, y)
 XX = np.array([X.ravel(), Y.ravel()]).T
 Z = -clf.score_samples(XX)
 Z = Z.reshape(X.shape)
 
-CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0),
-                 levels=np.logspace(0, 3, 10))
-CB = plt.colorbar(CS, shrink=0.8, extend='both')
-plt.scatter(X_train[:, 0], X_train[:, 1], .8)
+CS = plt.contour(
+    X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0), levels=np.logspace(0, 3, 10)
+)
+CB = plt.colorbar(CS, shrink=0.8, extend="both")
+plt.scatter(X_train[:, 0], X_train[:, 1], 0.8)
 
-plt.title('Negative log-likelihood predicted by a GMM')
-plt.axis('tight')
+plt.title("Negative log-likelihood predicted by a GMM")
+plt.axis("tight")
 plt.show()
diff --git a/examples/mixture/plot_gmm_selection.py b/examples/mixture/plot_gmm_selection.py
index 3340ea93ea965..1d2aebe7b077a 100644
--- a/examples/mixture/plot_gmm_selection.py
+++ b/examples/mixture/plot_gmm_selection.py
@@ -31,19 +31,22 @@
 
 # Generate random sample, two components
 np.random.seed(0)
-C = np.array([[0., -0.1], [1.7, .4]])
-X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
-          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]
+C = np.array([[0.0, -0.1], [1.7, 0.4]])
+X = np.r_[
+    np.dot(np.random.randn(n_samples, 2), C),
+    0.7 * np.random.randn(n_samples, 2) + np.array([-6, 3]),
+]
 
 lowest_bic = np.infty
 bic = []
 n_components_range = range(1, 7)
-cv_types = ['spherical', 'tied', 'diag', 'full']
+cv_types = ["spherical", "tied", "diag", "full"]
 for cv_type in cv_types:
     for n_components in n_components_range:
         # Fit a Gaussian mixture with EM
-        gmm = mixture.GaussianMixture(n_components=n_components,
-                                      covariance_type=cv_type)
+        gmm = mixture.GaussianMixture(
+            n_components=n_components, covariance_type=cv_type
+        )
         gmm.fit(X)
         bic.append(gmm.bic(X))
         if bic[-1] < lowest_bic:
@@ -51,8 +54,7 @@
             best_gmm = gmm
 
 bic = np.array(bic)
-color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
-                              'darkorange'])
+color_iter = itertools.cycle(["navy", "turquoise", "cornflowerblue", "darkorange"])
 clf = best_gmm
 bars = []
 
@@ -60,41 +62,50 @@
 plt.figure(figsize=(8, 6))
 spl = plt.subplot(2, 1, 1)
 for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
-    xpos = np.array(n_components_range) + .2 * (i - 2)
-    bars.append(plt.bar(xpos, bic[i * len(n_components_range):
-                                  (i + 1) * len(n_components_range)],
-                        width=.2, color=color))
+    xpos = np.array(n_components_range) + 0.2 * (i - 2)
+    bars.append(
+        plt.bar(
+            xpos,
+            bic[i * len(n_components_range) : (i + 1) * len(n_components_range)],
+            width=0.2,
+            color=color,
+        )
+    )
 plt.xticks(n_components_range)
-plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
-plt.title('BIC score per model')
-xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
-    .2 * np.floor(bic.argmin() / len(n_components_range))
-plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
-spl.set_xlabel('Number of components')
+plt.ylim([bic.min() * 1.01 - 0.01 * bic.max(), bic.max()])
+plt.title("BIC score per model")
+xpos = (
+    np.mod(bic.argmin(), len(n_components_range))
+    + 0.65
+    + 0.2 * np.floor(bic.argmin() / len(n_components_range))
+)
+plt.text(xpos, bic.min() * 0.97 + 0.03 * bic.max(), "*", fontsize=14)
+spl.set_xlabel("Number of components")
 spl.legend([b[0] for b in bars], cv_types)
 
 # Plot the winner
 splot = plt.subplot(2, 1, 2)
 Y_ = clf.predict(X)
-for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
-                                           color_iter)):
+for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_, color_iter)):
     v, w = linalg.eigh(cov)
     if not np.any(Y_ == i):
         continue
-    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
+    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], 0.8, color=color)
 
     # Plot an ellipse to show the Gaussian component
     angle = np.arctan2(w[0][1], w[0][0])
-    angle = 180. * angle / np.pi  # convert to degrees
-    v = 2. * np.sqrt(2.) * np.sqrt(v)
-    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
+    angle = 180.0 * angle / np.pi  # convert to degrees
+    v = 2.0 * np.sqrt(2.0) * np.sqrt(v)
+    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180.0 + angle, color=color)
     ell.set_clip_box(splot.bbox)
-    ell.set_alpha(.5)
+    ell.set_alpha(0.5)
     splot.add_artist(ell)
 
 plt.xticks(())
 plt.yticks(())
-plt.title(f'Selected GMM: {best_gmm.covariance_type} model, '
-          f'{best_gmm.n_components} components')
-plt.subplots_adjust(hspace=.35, bottom=.02)
+plt.title(
+    f"Selected GMM: {best_gmm.covariance_type} model, "
+    f"{best_gmm.n_components} components"
+)
+plt.subplots_adjust(hspace=0.35, bottom=0.02)
 plt.show()
diff --git a/examples/mixture/plot_gmm_sin.py b/examples/mixture/plot_gmm_sin.py
index 1d436b93d15cc..1b6ac48d436a7 100644
--- a/examples/mixture/plot_gmm_sin.py
+++ b/examples/mixture/plot_gmm_sin.py
@@ -50,34 +50,32 @@
 
 print(__doc__)
 
-color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold',
-                              'darkorange'])
+color_iter = itertools.cycle(["navy", "c", "cornflowerblue", "gold", "darkorange"])
 
 
 def plot_results(X, Y, means, covariances, index, title):
     splot = plt.subplot(5, 1, 1 + index)
-    for i, (mean, covar, color) in enumerate(zip(
-            means, covariances, color_iter)):
+    for i, (mean, covar, color) in enumerate(zip(means, covariances, color_iter)):
         v, w = linalg.eigh(covar)
-        v = 2. * np.sqrt(2.) * np.sqrt(v)
+        v = 2.0 * np.sqrt(2.0) * np.sqrt(v)
         u = w[0] / linalg.norm(w[0])
         # as the DP will not use every component it has access to
         # unless it needs it, we shouldn't plot the redundant
         # components.
         if not np.any(Y == i):
             continue
-        plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color)
+        plt.scatter(X[Y == i, 0], X[Y == i, 1], 0.8, color=color)
 
         # Plot an ellipse to show the Gaussian component
         angle = np.arctan(u[1] / u[0])
-        angle = 180. * angle / np.pi  # convert to degrees
-        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
+        angle = 180.0 * angle / np.pi  # convert to degrees
+        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180.0 + angle, color=color)
         ell.set_clip_box(splot.bbox)
         ell.set_alpha(0.5)
         splot.add_artist(ell)
 
-    plt.xlim(-6., 4. * np.pi - 6.)
-    plt.ylim(-5., 5.)
+    plt.xlim(-6.0, 4.0 * np.pi - 6.0)
+    plt.ylim(-5.0, 5.0)
     plt.title(title)
     plt.xticks(())
     plt.yticks(())
@@ -91,10 +89,10 @@ def plot_samples(X, Y, n_components, index, title):
         # components.
         if not np.any(Y == i):
             continue
-        plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color)
+        plt.scatter(X[Y == i, 0], X[Y == i, 1], 0.8, color=color)
 
-    plt.xlim(-6., 4. * np.pi - 6.)
-    plt.ylim(-5., 5.)
+    plt.xlim(-6.0, 4.0 * np.pi - 6.0)
+    plt.ylim(-5.0, 5.0)
     plt.title(title)
     plt.xticks(())
     plt.yticks(())
@@ -106,49 +104,86 @@ def plot_samples(X, Y, n_components, index, title):
 # Generate random sample following a sine curve
 np.random.seed(0)
 X = np.zeros((n_samples, 2))
-step = 4. * np.pi / n_samples
+step = 4.0 * np.pi / n_samples
 
 for i in range(X.shape[0]):
-    x = i * step - 6.
+    x = i * step - 6.0
     X[i, 0] = x + np.random.normal(0, 0.1)
-    X[i, 1] = 3. * (np.sin(x) + np.random.normal(0, .2))
+    X[i, 1] = 3.0 * (np.sin(x) + np.random.normal(0, 0.2))
 
 plt.figure(figsize=(10, 10))
-plt.subplots_adjust(bottom=.04, top=0.95, hspace=.2, wspace=.05,
-                    left=.03, right=.97)
+plt.subplots_adjust(
+    bottom=0.04, top=0.95, hspace=0.2, wspace=0.05, left=0.03, right=0.97
+)
 
 # Fit a Gaussian mixture with EM using ten components
-gmm = mixture.GaussianMixture(n_components=10, covariance_type='full',
-                              max_iter=100).fit(X)
-plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0,
-             'Expectation-maximization')
+gmm = mixture.GaussianMixture(
+    n_components=10, covariance_type="full", max_iter=100
+).fit(X)
+plot_results(
+    X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, "Expectation-maximization"
+)
 
 dpgmm = mixture.BayesianGaussianMixture(
-    n_components=10, covariance_type='full', weight_concentration_prior=1e-2,
-    weight_concentration_prior_type='dirichlet_process',
-    mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2),
-    init_params="random", max_iter=100, random_state=2).fit(X)
-plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
-             "Bayesian Gaussian mixture models with a Dirichlet process prior "
-             r"for $\gamma_0=0.01$.")
+    n_components=10,
+    covariance_type="full",
+    weight_concentration_prior=1e-2,
+    weight_concentration_prior_type="dirichlet_process",
+    mean_precision_prior=1e-2,
+    covariance_prior=1e0 * np.eye(2),
+    init_params="random",
+    max_iter=100,
+    random_state=2,
+).fit(X)
+plot_results(
+    X,
+    dpgmm.predict(X),
+    dpgmm.means_,
+    dpgmm.covariances_,
+    1,
+    "Bayesian Gaussian mixture models with a Dirichlet process prior "
+    r"for $\gamma_0=0.01$.",
+)
 
 X_s, y_s = dpgmm.sample(n_samples=2000)
-plot_samples(X_s, y_s, dpgmm.n_components, 0,
-             "Gaussian mixture with a Dirichlet process prior "
-             r"for $\gamma_0=0.01$ sampled with $2000$ samples.")
+plot_samples(
+    X_s,
+    y_s,
+    dpgmm.n_components,
+    0,
+    "Gaussian mixture with a Dirichlet process prior "
+    r"for $\gamma_0=0.01$ sampled with $2000$ samples.",
+)
 
 dpgmm = mixture.BayesianGaussianMixture(
-    n_components=10, covariance_type='full', weight_concentration_prior=1e+2,
-    weight_concentration_prior_type='dirichlet_process',
-    mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2),
-    init_params="kmeans", max_iter=100, random_state=2).fit(X)
-plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 2,
-             "Bayesian Gaussian mixture models with a Dirichlet process prior "
-             r"for $\gamma_0=100$")
+    n_components=10,
+    covariance_type="full",
+    weight_concentration_prior=1e2,
+    weight_concentration_prior_type="dirichlet_process",
+    mean_precision_prior=1e-2,
+    covariance_prior=1e0 * np.eye(2),
+    init_params="kmeans",
+    max_iter=100,
+    random_state=2,
+).fit(X)
+plot_results(
+    X,
+    dpgmm.predict(X),
+    dpgmm.means_,
+    dpgmm.covariances_,
+    2,
+    "Bayesian Gaussian mixture models with a Dirichlet process prior "
+    r"for $\gamma_0=100$",
+)
 
 X_s, y_s = dpgmm.sample(n_samples=2000)
-plot_samples(X_s, y_s, dpgmm.n_components, 1,
-             "Gaussian mixture with a Dirichlet process prior "
-             r"for $\gamma_0=100$ sampled with $2000$ samples.")
+plot_samples(
+    X_s,
+    y_s,
+    dpgmm.n_components,
+    1,
+    "Gaussian mixture with a Dirichlet process prior "
+    r"for $\gamma_0=100$ sampled with $2000$ samples.",
+)
 
 plt.show()
diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py
index a1bd12581768c..a28593eb90866 100644
--- a/examples/model_selection/grid_search_text_feature_extraction.py
+++ b/examples/model_selection/grid_search_text_feature_extraction.py
@@ -1,4 +1,3 @@
-
 """
 ==========================================================
 Sample pipeline for text feature extraction and evaluation
@@ -60,15 +59,14 @@
 print(__doc__)
 
 # Display progress logs on stdout
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s %(levelname)s %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 
 
 # #############################################################################
 # Load some categories from the training set
 categories = [
-    'alt.atheism',
-    'talk.religion.misc',
+    "alt.atheism",
+    "talk.religion.misc",
 ]
 # Uncomment the following to do the analysis on all the categories
 # categories = None
@@ -76,7 +74,7 @@
 print("Loading 20 newsgroups dataset for categories:")
 print(categories)
 
-data = fetch_20newsgroups(subset='train', categories=categories)
+data = fetch_20newsgroups(subset="train", categories=categories)
 print("%d documents" % len(data.filenames))
 print("%d categories" % len(data.target_names))
 print()
@@ -84,23 +82,25 @@
 # #############################################################################
 # Define a pipeline combining a text feature extractor with a simple
 # classifier
-pipeline = Pipeline([
-    ('vect', CountVectorizer()),
-    ('tfidf', TfidfTransformer()),
-    ('clf', SGDClassifier()),
-])
+pipeline = Pipeline(
+    [
+        ("vect", CountVectorizer()),
+        ("tfidf", TfidfTransformer()),
+        ("clf", SGDClassifier()),
+    ]
+)
 
 # uncommenting more parameters will give better exploring power but will
 # increase processing time in a combinatorial way
 parameters = {
-    'vect__max_df': (0.5, 0.75, 1.0),
+    "vect__max_df": (0.5, 0.75, 1.0),
     # 'vect__max_features': (None, 5000, 10000, 50000),
-    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
+    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
     # 'tfidf__use_idf': (True, False),
     # 'tfidf__norm': ('l1', 'l2'),
-    'clf__max_iter': (20,),
-    'clf__alpha': (0.00001, 0.000001),
-    'clf__penalty': ('l2', 'elasticnet'),
+    "clf__max_iter": (20,),
+    "clf__alpha": (0.00001, 0.000001),
+    "clf__penalty": ("l2", "elasticnet"),
     # 'clf__max_iter': (10, 50, 80),
 }
 
diff --git a/examples/model_selection/plot_confusion_matrix.py b/examples/model_selection/plot_confusion_matrix.py
index d54d9747a8cf3..251a4e175eb90 100644
--- a/examples/model_selection/plot_confusion_matrix.py
+++ b/examples/model_selection/plot_confusion_matrix.py
@@ -44,17 +44,23 @@
 
 # Run classifier, using a model that is too regularized (C too low) to see
 # the impact on the results
-classifier = svm.SVC(kernel='linear', C=0.01).fit(X_train, y_train)
+classifier = svm.SVC(kernel="linear", C=0.01).fit(X_train, y_train)
 
 np.set_printoptions(precision=2)
 
 # Plot non-normalized confusion matrix
-titles_options = [("Confusion matrix, without normalization", None),
-                  ("Normalized confusion matrix", 'true')]
+titles_options = [
+    ("Confusion matrix, without normalization", None),
+    ("Normalized confusion matrix", "true"),
+]
 for title, normalize in titles_options:
     disp = ConfusionMatrixDisplay.from_estimator(
-        classifier, X_test, y_test, display_labels=class_names,
-        cmap=plt.cm.Blues, normalize=normalize
+        classifier,
+        X_test,
+        y_test,
+        display_labels=class_names,
+        cmap=plt.cm.Blues,
+        normalize=normalize,
     )
     disp.ax_.set_title(title)
 
diff --git a/examples/model_selection/plot_cv_indices.py b/examples/model_selection/plot_cv_indices.py
index f07fa1595e860..24deba92f11e5 100644
--- a/examples/model_selection/plot_cv_indices.py
+++ b/examples/model_selection/plot_cv_indices.py
@@ -11,13 +11,20 @@
 for comparison.
 """
 
-from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit,
-                                     StratifiedKFold, GroupShuffleSplit,
-                                     GroupKFold, StratifiedShuffleSplit,
-                                     StratifiedGroupKFold)
+from sklearn.model_selection import (
+    TimeSeriesSplit,
+    KFold,
+    ShuffleSplit,
+    StratifiedKFold,
+    GroupShuffleSplit,
+    GroupKFold,
+    StratifiedShuffleSplit,
+    StratifiedGroupKFold,
+)
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib.patches import Patch
+
 np.random.seed(1338)
 cmap_data = plt.cm.Paired
 cmap_cv = plt.cm.coolwarm
@@ -41,9 +48,8 @@
 n_points = 100
 X = np.random.randn(100, 10)
 
-percentiles_classes = [.1, .3, .6]
-y = np.hstack([[ii] * int(100 * perc)
-               for ii, perc in enumerate(percentiles_classes)])
+percentiles_classes = [0.1, 0.3, 0.6]
+y = np.hstack([[ii] * int(100 * perc) for ii, perc in enumerate(percentiles_classes)])
 
 # Evenly spaced groups repeated once
 groups = np.hstack([[ii] * 10 for ii in range(10)])
@@ -52,15 +58,31 @@
 def visualize_groups(classes, groups, name):
     # Visualize dataset groups
     fig, ax = plt.subplots()
-    ax.scatter(range(len(groups)),  [.5] * len(groups), c=groups, marker='_',
-               lw=50, cmap=cmap_data)
-    ax.scatter(range(len(groups)),  [3.5] * len(groups), c=classes, marker='_',
-               lw=50, cmap=cmap_data)
-    ax.set(ylim=[-1, 5], yticks=[.5, 3.5],
-           yticklabels=['Data\ngroup', 'Data\nclass'], xlabel="Sample index")
-
-
-visualize_groups(y, groups, 'no groups')
+    ax.scatter(
+        range(len(groups)),
+        [0.5] * len(groups),
+        c=groups,
+        marker="_",
+        lw=50,
+        cmap=cmap_data,
+    )
+    ax.scatter(
+        range(len(groups)),
+        [3.5] * len(groups),
+        c=classes,
+        marker="_",
+        lw=50,
+        cmap=cmap_data,
+    )
+    ax.set(
+        ylim=[-1, 5],
+        yticks=[0.5, 3.5],
+        yticklabels=["Data\ngroup", "Data\nclass"],
+        xlabel="Sample index",
+    )
+
+
+visualize_groups(y, groups, "no groups")
 
 # %%
 # Define a function to visualize cross-validation behavior
@@ -83,23 +105,37 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
         indices[tr] = 0
 
         # Visualize the results
-        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
-                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
-                   vmin=-.2, vmax=1.2)
+        ax.scatter(
+            range(len(indices)),
+            [ii + 0.5] * len(indices),
+            c=indices,
+            marker="_",
+            lw=lw,
+            cmap=cmap_cv,
+            vmin=-0.2,
+            vmax=1.2,
+        )
 
     # Plot the data classes and groups at the end
-    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
-               c=y, marker='_', lw=lw, cmap=cmap_data)
+    ax.scatter(
+        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data
+    )
 
-    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
-               c=group, marker='_', lw=lw, cmap=cmap_data)
+    ax.scatter(
+        range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data
+    )
 
     # Formatting
-    yticklabels = list(range(n_splits)) + ['class', 'group']
-    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
-           xlabel='Sample index', ylabel="CV iteration",
-           ylim=[n_splits+2.2, -.2], xlim=[0, 100])
-    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
+    yticklabels = list(range(n_splits)) + ["class", "group"]
+    ax.set(
+        yticks=np.arange(n_splits + 2) + 0.5,
+        yticklabels=yticklabels,
+        xlabel="Sample index",
+        ylabel="CV iteration",
+        ylim=[n_splits + 2.2, -0.2],
+        xlim=[0, 100],
+    )
+    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
     return ax
 
 
@@ -132,11 +168,14 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
 for cv in cvs:
     fig, ax = plt.subplots(figsize=(6, 3))
     plot_cv_indices(cv(n_splits), X, y, uneven_groups, ax, n_splits)
-    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
-              ['Testing set', 'Training set'], loc=(1.02, .8))
+    ax.legend(
+        [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
+        ["Testing set", "Training set"],
+        loc=(1.02, 0.8),
+    )
     # Make the legend fit
     plt.tight_layout()
-    fig.subplots_adjust(right=.7)
+    fig.subplots_adjust(right=0.7)
 
 # %%
 # Next we'll visualize this behavior for a number of CV iterators.
@@ -150,8 +189,16 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
 #
 # Note how some use the group/class information while others do not.
 
-cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold, StratifiedGroupKFold,
-       GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit]
+cvs = [
+    KFold,
+    GroupKFold,
+    ShuffleSplit,
+    StratifiedKFold,
+    StratifiedGroupKFold,
+    GroupShuffleSplit,
+    StratifiedShuffleSplit,
+    TimeSeriesSplit,
+]
 
 
 for cv in cvs:
@@ -159,9 +206,12 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
     fig, ax = plt.subplots(figsize=(6, 3))
     plot_cv_indices(this_cv, X, y, groups, ax, n_splits)
 
-    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
-              ['Testing set', 'Training set'], loc=(1.02, .8))
+    ax.legend(
+        [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
+        ["Testing set", "Training set"],
+        loc=(1.02, 0.8),
+    )
     # Make the legend fit
     plt.tight_layout()
-    fig.subplots_adjust(right=.7)
+    fig.subplots_adjust(right=0.7)
 plt.show()
diff --git a/examples/model_selection/plot_cv_predict.py b/examples/model_selection/plot_cv_predict.py
index ee3e82f42cba1..8d31da4ff2fec 100644
--- a/examples/model_selection/plot_cv_predict.py
+++ b/examples/model_selection/plot_cv_predict.py
@@ -22,7 +22,7 @@
 
 fig, ax = plt.subplots()
 ax.scatter(y, predicted, edgecolors=(0, 0, 0))
-ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
-ax.set_xlabel('Measured')
-ax.set_ylabel('Predicted')
+ax.plot([y.min(), y.max()], [y.min(), y.max()], "k--", lw=4)
+ax.set_xlabel("Measured")
+ax.set_ylabel("Predicted")
 plt.show()
diff --git a/examples/model_selection/plot_grid_search_digits.py b/examples/model_selection/plot_grid_search_digits.py
index 498b00082b7c1..b6100489d4a53 100644
--- a/examples/model_selection/plot_grid_search_digits.py
+++ b/examples/model_selection/plot_grid_search_digits.py
@@ -33,23 +33,21 @@
 y = digits.target
 
 # Split the dataset in two equal parts
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=0.5, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
 
 # Set the parameters by cross-validation
-tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
-                     'C': [1, 10, 100, 1000]},
-                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
+tuned_parameters = [
+    {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
+    {"kernel": ["linear"], "C": [1, 10, 100, 1000]},
+]
 
-scores = ['precision', 'recall']
+scores = ["precision", "recall"]
 
 for score in scores:
     print("# Tuning hyper-parameters for %s" % score)
     print()
 
-    clf = GridSearchCV(
-        SVC(), tuned_parameters, scoring='%s_macro' % score
-    )
+    clf = GridSearchCV(SVC(), tuned_parameters, scoring="%s_macro" % score)
     clf.fit(X_train, y_train)
 
     print("Best parameters set found on development set:")
@@ -58,11 +56,10 @@
     print()
     print("Grid scores on development set:")
     print()
-    means = clf.cv_results_['mean_test_score']
-    stds = clf.cv_results_['std_test_score']
-    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
-        print("%0.3f (+/-%0.03f) for %r"
-              % (mean, std * 2, params))
+    means = clf.cv_results_["mean_test_score"]
+    stds = clf.cv_results_["std_test_score"]
+    for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
+        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
     print()
 
     print("Detailed classification report:")
diff --git a/examples/model_selection/plot_grid_search_refit_callable.py b/examples/model_selection/plot_grid_search_refit_callable.py
index b2e0cf9d8dcc7..3d363d6bb7384 100644
--- a/examples/model_selection/plot_grid_search_refit_callable.py
+++ b/examples/model_selection/plot_grid_search_refit_callable.py
@@ -46,10 +46,12 @@ def lower_bound(cv_results):
         Lower bound within 1 standard deviation of the
         best `mean_test_score`.
     """
-    best_score_idx = np.argmax(cv_results['mean_test_score'])
+    best_score_idx = np.argmax(cv_results["mean_test_score"])
 
-    return (cv_results['mean_test_score'][best_score_idx]
-            - cv_results['std_test_score'][best_score_idx])
+    return (
+        cv_results["mean_test_score"][best_score_idx]
+        - cv_results["std_test_score"][best_score_idx]
+    )
 
 
 def best_low_complexity(cv_results):
@@ -69,48 +71,56 @@ def best_low_complexity(cv_results):
         `mean_test_score`.
     """
     threshold = lower_bound(cv_results)
-    candidate_idx = np.flatnonzero(cv_results['mean_test_score'] >= threshold)
-    best_idx = candidate_idx[cv_results['param_reduce_dim__n_components']
-                             [candidate_idx].argmin()]
+    candidate_idx = np.flatnonzero(cv_results["mean_test_score"] >= threshold)
+    best_idx = candidate_idx[
+        cv_results["param_reduce_dim__n_components"][candidate_idx].argmin()
+    ]
     return best_idx
 
 
-pipe = Pipeline([
-        ('reduce_dim', PCA(random_state=42)),
-        ('classify', LinearSVC(random_state=42, C=0.01)),
-])
+pipe = Pipeline(
+    [
+        ("reduce_dim", PCA(random_state=42)),
+        ("classify", LinearSVC(random_state=42, C=0.01)),
+    ]
+)
 
-param_grid = {
-    'reduce_dim__n_components': [6, 8, 10, 12, 14]
-}
+param_grid = {"reduce_dim__n_components": [6, 8, 10, 12, 14]}
 
-grid = GridSearchCV(pipe, cv=10, n_jobs=1, param_grid=param_grid,
-                    scoring='accuracy', refit=best_low_complexity)
+grid = GridSearchCV(
+    pipe,
+    cv=10,
+    n_jobs=1,
+    param_grid=param_grid,
+    scoring="accuracy",
+    refit=best_low_complexity,
+)
 X, y = load_digits(return_X_y=True)
 grid.fit(X, y)
 
-n_components = grid.cv_results_['param_reduce_dim__n_components']
-test_scores = grid.cv_results_['mean_test_score']
+n_components = grid.cv_results_["param_reduce_dim__n_components"]
+test_scores = grid.cv_results_["mean_test_score"]
 
 plt.figure()
-plt.bar(n_components, test_scores, width=1.3, color='b')
+plt.bar(n_components, test_scores, width=1.3, color="b")
 
 lower = lower_bound(grid.cv_results_)
-plt.axhline(np.max(test_scores), linestyle='--', color='y',
-            label='Best score')
-plt.axhline(lower, linestyle='--', color='.5', label='Best score - 1 std')
+plt.axhline(np.max(test_scores), linestyle="--", color="y", label="Best score")
+plt.axhline(lower, linestyle="--", color=".5", label="Best score - 1 std")
 
 plt.title("Balance model complexity and cross-validated score")
-plt.xlabel('Number of PCA components used')
-plt.ylabel('Digit classification accuracy')
+plt.xlabel("Number of PCA components used")
+plt.ylabel("Digit classification accuracy")
 plt.xticks(n_components.tolist())
 plt.ylim((0, 1.0))
-plt.legend(loc='upper left')
+plt.legend(loc="upper left")
 
 best_index_ = grid.best_index_
 
 print("The best_index_ is %d" % best_index_)
 print("The n_components selected is %d" % n_components[best_index_])
-print("The corresponding accuracy score is %.2f"
-      % grid.cv_results_['mean_test_score'][best_index_])
+print(
+    "The corresponding accuracy score is %.2f"
+    % grid.cv_results_["mean_test_score"][best_index_]
+)
 plt.show()
diff --git a/examples/model_selection/plot_grid_search_stats.py b/examples/model_selection/plot_grid_search_stats.py
index 1b434ec0e8b6e..70dd204390491 100644
--- a/examples/model_selection/plot_grid_search_stats.py
+++ b/examples/model_selection/plot_grid_search_stats.py
@@ -21,8 +21,7 @@
 X, y = make_moons(noise=0.352, random_state=1, n_samples=100)
 
 sns.scatterplot(
-    x=X[:, 0], y=X[:, 1], hue=y,
-    marker='o', s=25, edgecolor='k', legend=False
+    x=X[:, 0], y=X[:, 1], hue=y, marker="o", s=25, edgecolor="k", legend=False
 ).set_title("Data")
 plt.show()
 
@@ -40,21 +39,16 @@
 from sklearn.svm import SVC
 
 param_grid = [
-    {'kernel': ['linear']},
-    {'kernel': ['poly'], 'degree': [2, 3]},
-    {'kernel': ['rbf']}
+    {"kernel": ["linear"]},
+    {"kernel": ["poly"], "degree": [2, 3]},
+    {"kernel": ["rbf"]},
 ]
 
 svc = SVC(random_state=0)
 
-cv = RepeatedStratifiedKFold(
-    n_splits=10, n_repeats=10, random_state=0
-)
+cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)
 
-search = GridSearchCV(
-    estimator=svc, param_grid=param_grid,
-    scoring='roc_auc', cv=cv
-)
+search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring="roc_auc", cv=cv)
 search.fit(X, y)
 
 # %%
@@ -64,17 +58,11 @@
 import pandas as pd
 
 results_df = pd.DataFrame(search.cv_results_)
-results_df = results_df.sort_values(by=['rank_test_score'])
-results_df = (
-    results_df
-    .set_index(results_df["params"].apply(
-        lambda x: "_".join(str(val) for val in x.values()))
-    )
-    .rename_axis('kernel')
-)
-results_df[
-    ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
-]
+results_df = results_df.sort_values(by=["rank_test_score"])
+results_df = results_df.set_index(
+    results_df["params"].apply(lambda x: "_".join(str(val) for val in x.values()))
+).rename_axis("kernel")
+results_df[["params", "rank_test_score", "mean_test_score", "std_test_score"]]
 
 # %%
 # We can see that the estimator using the `'rbf'` kernel performed best,
@@ -102,13 +90,17 @@
 # in each fold, and calculating the correlation between models across folds:
 
 # create df of model scores ordered by performance
-model_scores = results_df.filter(regex=r'split\d*_test_score')
+model_scores = results_df.filter(regex=r"split\d*_test_score")
 
 # plot 30 examples of dependency between cv fold and AUC scores
 fig, ax = plt.subplots()
 sns.lineplot(
     data=model_scores.transpose().iloc[:30],
-    dashes=False, palette='Set1', marker='o', alpha=.5, ax=ax
+    dashes=False,
+    palette="Set1",
+    marker="o",
+    alpha=0.5,
+    ax=ax,
 )
 ax.set_xlabel("CV test fold", size=12, labelpad=10)
 ax.set_ylabel("Model AUC", size=12)
@@ -193,9 +185,7 @@ def corrected_std(differences, n_train, n_test):
     # kr = k times r, r times repeated k-fold crossvalidation,
     # kr equals the number of times the model was evaluated
     kr = len(differences)
-    corrected_var = (
-        np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
-    )
+    corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
     corrected_std = np.sqrt(corrected_var)
     return corrected_std
 
@@ -240,19 +230,18 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 n_test = len(list(cv.split(X, y))[0][1])
 
 t_stat, p_val = compute_corrected_ttest(differences, df, n_train, n_test)
-print(f"Corrected t-value: {t_stat:.3f}\n"
-      f"Corrected p-value: {p_val:.3f}")
+print(f"Corrected t-value: {t_stat:.3f}\nCorrected p-value: {p_val:.3f}")
 
 # %%
 # We can compare the corrected t- and p-values with the uncorrected ones:
 
-t_stat_uncorrected = (
-    np.mean(differences) / np.sqrt(np.var(differences, ddof=1) / n)
-)
+t_stat_uncorrected = np.mean(differences) / np.sqrt(np.var(differences, ddof=1) / n)
 p_val_uncorrected = t.sf(np.abs(t_stat_uncorrected), df)
 
-print(f"Uncorrected t-value: {t_stat_uncorrected:.3f}\n"
-      f"Uncorrected p-value: {p_val_uncorrected:.3f}")
+print(
+    f"Uncorrected t-value: {t_stat_uncorrected:.3f}\n"
+    f"Uncorrected p-value: {p_val_uncorrected:.3f}"
+)
 
 # %%
 # Using the conventional significance alpha level at `p=0.05`, we observe that
@@ -310,8 +299,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 
 # initialize random variable
 t_post = t(
-    df, loc=np.mean(differences),
-    scale=corrected_std(differences, n_train, n_test)
+    df, loc=np.mean(differences), scale=corrected_std(differences, n_train, n_test)
 )
 
 # %%
@@ -321,7 +309,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 
 plt.plot(x, t_post.pdf(x))
 plt.xticks(np.arange(-0.04, 0.06, 0.01))
-plt.fill_between(x, t_post.pdf(x), 0, facecolor='blue', alpha=.2)
+plt.fill_between(x, t_post.pdf(x), 0, facecolor="blue", alpha=0.2)
 plt.ylabel("Probability density")
 plt.xlabel(r"Mean difference ($\mu$)")
 plt.title("Posterior distribution")
@@ -336,10 +324,14 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 
 better_prob = 1 - t_post.cdf(0)
 
-print(f"Probability of {model_scores.index[0]} being more accurate than "
-      f"{model_scores.index[1]}: {better_prob:.3f}")
-print(f"Probability of {model_scores.index[1]} being more accurate than "
-      f"{model_scores.index[0]}: {1 - better_prob:.3f}")
+print(
+    f"Probability of {model_scores.index[0]} being more accurate than "
+    f"{model_scores.index[1]}: {better_prob:.3f}"
+)
+print(
+    f"Probability of {model_scores.index[1]} being more accurate than "
+    f"{model_scores.index[0]}: {1 - better_prob:.3f}"
+)
 
 # %%
 # In contrast with the frequentist approach, we can compute the probability
@@ -373,8 +365,10 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 rope_interval = [-0.01, 0.01]
 rope_prob = t_post.cdf(rope_interval[1]) - t_post.cdf(rope_interval[0])
 
-print(f"Probability of {model_scores.index[0]} and {model_scores.index[1]} "
-      f"being practically equivalent: {rope_prob:.3f}")
+print(
+    f"Probability of {model_scores.index[0]} and {model_scores.index[1]} "
+    f"being practically equivalent: {rope_prob:.3f}"
+)
 
 # %%
 # We can plot how the posterior is distributed over the ROPE interval:
@@ -384,7 +378,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 plt.plot(x, t_post.pdf(x))
 plt.xticks(np.arange(-0.04, 0.06, 0.01))
 plt.vlines([-0.01, 0.01], ymin=0, ymax=(np.max(t_post.pdf(x)) + 1))
-plt.fill_between(x_rope, t_post.pdf(x_rope), 0, facecolor='blue', alpha=.2)
+plt.fill_between(x_rope, t_post.pdf(x_rope), 0, facecolor="blue", alpha=0.2)
 plt.ylabel("Probability density")
 plt.xlabel(r"Mean difference ($\mu$)")
 plt.title("Posterior distribution under the ROPE")
@@ -416,9 +410,8 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
     cred_intervals.append([interval, cred_interval[0], cred_interval[1]])
 
 cred_int_df = pd.DataFrame(
-    cred_intervals,
-    columns=['interval', 'lower value', 'upper value']
-).set_index('interval')
+    cred_intervals, columns=["interval", "lower value", "upper value"]
+).set_index("interval")
 cred_int_df
 
 # %%
@@ -448,9 +441,8 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 from itertools import combinations
 from math import factorial
 
-n_comparisons = (
-    factorial(len(model_scores))
-    / (factorial(2) * factorial(len(model_scores) - 2))
+n_comparisons = factorial(len(model_scores)) / (
+    factorial(2) * factorial(len(model_scores) - 2)
 )
 pairwise_t_test = []
 
@@ -458,20 +450,16 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
     model_i_scores = model_scores.iloc[model_i].values
     model_k_scores = model_scores.iloc[model_k].values
     differences = model_i_scores - model_k_scores
-    t_stat, p_val = compute_corrected_ttest(
-        differences, df, n_train, n_test
-    )
+    t_stat, p_val = compute_corrected_ttest(differences, df, n_train, n_test)
     p_val *= n_comparisons  # implement Bonferroni correction
     # Bonferroni can output p-values higher than 1
     p_val = 1 if p_val > 1 else p_val
     pairwise_t_test.append(
-        [model_scores.index[model_i], model_scores.index[model_k],
-         t_stat, p_val]
+        [model_scores.index[model_i], model_scores.index[model_k], t_stat, p_val]
     )
 
 pairwise_comp_df = pd.DataFrame(
-    pairwise_t_test,
-    columns=['model_1', 'model_2', 't_stat', 'p_val']
+    pairwise_t_test, columns=["model_1", "model_2", "t_stat", "p_val"]
 ).round(3)
 pairwise_comp_df
 
@@ -499,8 +487,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
     model_k_scores = model_scores.iloc[model_k].values
     differences = model_i_scores - model_k_scores
     t_post = t(
-        df, loc=np.mean(differences),
-        scale=corrected_std(differences, n_train, n_test)
+        df, loc=np.mean(differences), scale=corrected_std(differences, n_train, n_test)
     )
     worse_prob = t_post.cdf(rope_interval[0])
     better_prob = 1 - t_post.cdf(rope_interval[1])
@@ -508,10 +495,9 @@ def compute_corrected_ttest(differences, df, n_train, n_test):
 
     pairwise_bayesian.append([worse_prob, better_prob, rope_prob])
 
-pairwise_bayesian_df = (pd.DataFrame(
-    pairwise_bayesian,
-    columns=['worse_prob', 'better_prob', 'rope_prob']
-).round(3))
+pairwise_bayesian_df = pd.DataFrame(
+    pairwise_bayesian, columns=["worse_prob", "better_prob", "rope_prob"]
+).round(3)
 
 pairwise_comp_df = pairwise_comp_df.join(pairwise_bayesian_df)
 pairwise_comp_df
diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py
index 71cc565c3528c..46b11cad2e7e4 100644
--- a/examples/model_selection/plot_learning_curve.py
+++ b/examples/model_selection/plot_learning_curve.py
@@ -26,8 +26,17 @@
 from sklearn.model_selection import ShuffleSplit
 
 
-def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
-                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
+def plot_learning_curve(
+    estimator,
+    title,
+    X,
+    y,
+    axes=None,
+    ylim=None,
+    cv=None,
+    n_jobs=None,
+    train_sizes=np.linspace(0.1, 1.0, 5),
+):
     """
     Generate 3 plots: the test and training learning curve, the training
     samples vs fit times curve, the fit times vs score curve.
@@ -96,10 +105,15 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
     axes[0].set_xlabel("Training examples")
     axes[0].set_ylabel("Score")
 
-    train_sizes, train_scores, test_scores, fit_times, _ = \
-        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
-                       train_sizes=train_sizes,
-                       return_times=True)
+    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
+        estimator,
+        X,
+        y,
+        cv=cv,
+        n_jobs=n_jobs,
+        train_sizes=train_sizes,
+        return_times=True,
+    )
     train_scores_mean = np.mean(train_scores, axis=1)
     train_scores_std = np.std(train_scores, axis=1)
     test_scores_mean = np.mean(test_scores, axis=1)
@@ -109,32 +123,50 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
 
     # Plot learning curve
     axes[0].grid()
-    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
-                         train_scores_mean + train_scores_std, alpha=0.1,
-                         color="r")
-    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
-                         test_scores_mean + test_scores_std, alpha=0.1,
-                         color="g")
-    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
-                 label="Training score")
-    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
-                 label="Cross-validation score")
+    axes[0].fill_between(
+        train_sizes,
+        train_scores_mean - train_scores_std,
+        train_scores_mean + train_scores_std,
+        alpha=0.1,
+        color="r",
+    )
+    axes[0].fill_between(
+        train_sizes,
+        test_scores_mean - test_scores_std,
+        test_scores_mean + test_scores_std,
+        alpha=0.1,
+        color="g",
+    )
+    axes[0].plot(
+        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
+    )
+    axes[0].plot(
+        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
+    )
     axes[0].legend(loc="best")
 
     # Plot n_samples vs fit_times
     axes[1].grid()
-    axes[1].plot(train_sizes, fit_times_mean, 'o-')
-    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
-                         fit_times_mean + fit_times_std, alpha=0.1)
+    axes[1].plot(train_sizes, fit_times_mean, "o-")
+    axes[1].fill_between(
+        train_sizes,
+        fit_times_mean - fit_times_std,
+        fit_times_mean + fit_times_std,
+        alpha=0.1,
+    )
     axes[1].set_xlabel("Training examples")
     axes[1].set_ylabel("fit_times")
     axes[1].set_title("Scalability of the model")
 
     # Plot fit_time vs score
     axes[2].grid()
-    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
-    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
-                         test_scores_mean + test_scores_std, alpha=0.1)
+    axes[2].plot(fit_times_mean, test_scores_mean, "o-")
+    axes[2].fill_between(
+        fit_times_mean,
+        test_scores_mean - test_scores_std,
+        test_scores_mean + test_scores_std,
+        alpha=0.1,
+    )
     axes[2].set_xlabel("fit_times")
     axes[2].set_ylabel("Score")
     axes[2].set_title("Performance of the model")
@@ -152,14 +184,16 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
 cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
 
 estimator = GaussianNB()
-plot_learning_curve(estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01),
-                    cv=cv, n_jobs=4)
+plot_learning_curve(
+    estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=4
+)
 
 title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
 # SVC is more expensive so we do a lower number of CV iterations:
 cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
 estimator = SVC(gamma=0.001)
-plot_learning_curve(estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01),
-                    cv=cv, n_jobs=4)
+plot_learning_curve(
+    estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), cv=cv, n_jobs=4
+)
 
 plt.show()
diff --git a/examples/model_selection/plot_multi_metric_evaluation.py b/examples/model_selection/plot_multi_metric_evaluation.py
index 4f03f1b19462d..7d6ce84eb5f26 100644
--- a/examples/model_selection/plot_multi_metric_evaluation.py
+++ b/examples/model_selection/plot_multi_metric_evaluation.py
@@ -38,16 +38,20 @@
 
 # The scorers can be either one of the predefined metric strings or a scorer
 # callable, like the one returned by make_scorer
-scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
+scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}
 
 # Setting refit='AUC', refits an estimator on the whole dataset with the
 # parameter setting that has the best cross-validated AUC score.
 # That estimator is made available at ``gs.best_estimator_`` along with
 # parameters like ``gs.best_score_``, ``gs.best_params_`` and
 # ``gs.best_index_``
-gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
-                  param_grid={'min_samples_split': range(2, 403, 10)},
-                  scoring=scoring, refit='AUC', return_train_score=True)
+gs = GridSearchCV(
+    DecisionTreeClassifier(random_state=42),
+    param_grid={"min_samples_split": range(2, 403, 10)},
+    scoring=scoring,
+    refit="AUC",
+    return_train_score=True,
+)
 gs.fit(X, y)
 results = gs.cv_results_
 
@@ -56,8 +60,7 @@
 # -------------------
 
 plt.figure(figsize=(13, 13))
-plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
-          fontsize=16)
+plt.title("GridSearchCV evaluating using multiple scorers simultaneously", fontsize=16)
 
 plt.xlabel("min_samples_split")
 plt.ylabel("Score")
@@ -67,29 +70,47 @@
 ax.set_ylim(0.73, 1)
 
 # Get the regular numpy array from the MaskedArray
-X_axis = np.array(results['param_min_samples_split'].data, dtype=float)
-
-for scorer, color in zip(sorted(scoring), ['g', 'k']):
-    for sample, style in (('train', '--'), ('test', '-')):
-        sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
-        sample_score_std = results['std_%s_%s' % (sample, scorer)]
-        ax.fill_between(X_axis, sample_score_mean - sample_score_std,
-                        sample_score_mean + sample_score_std,
-                        alpha=0.1 if sample == 'test' else 0, color=color)
-        ax.plot(X_axis, sample_score_mean, style, color=color,
-                alpha=1 if sample == 'test' else 0.7,
-                label="%s (%s)" % (scorer, sample))
-
-    best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
-    best_score = results['mean_test_%s' % scorer][best_index]
+X_axis = np.array(results["param_min_samples_split"].data, dtype=float)
+
+for scorer, color in zip(sorted(scoring), ["g", "k"]):
+    for sample, style in (("train", "--"), ("test", "-")):
+        sample_score_mean = results["mean_%s_%s" % (sample, scorer)]
+        sample_score_std = results["std_%s_%s" % (sample, scorer)]
+        ax.fill_between(
+            X_axis,
+            sample_score_mean - sample_score_std,
+            sample_score_mean + sample_score_std,
+            alpha=0.1 if sample == "test" else 0,
+            color=color,
+        )
+        ax.plot(
+            X_axis,
+            sample_score_mean,
+            style,
+            color=color,
+            alpha=1 if sample == "test" else 0.7,
+            label="%s (%s)" % (scorer, sample),
+        )
+
+    best_index = np.nonzero(results["rank_test_%s" % scorer] == 1)[0][0]
+    best_score = results["mean_test_%s" % scorer][best_index]
 
     # Plot a dotted vertical line at the best score for that scorer marked by x
-    ax.plot([X_axis[best_index], ] * 2, [0, best_score],
-            linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)
+    ax.plot(
+        [
+            X_axis[best_index],
+        ]
+        * 2,
+        [0, best_score],
+        linestyle="-.",
+        color=color,
+        marker="x",
+        markeredgewidth=3,
+        ms=8,
+    )
 
     # Annotate the best score for that scorer
-    ax.annotate("%0.2f" % best_score,
-                (X_axis[best_index], best_score + 0.005))
+    ax.annotate("%0.2f" % best_score, (X_axis[best_index], best_score + 0.005))
 
 plt.legend(loc="best")
 plt.grid(False)
diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py
index a2c53841bc4da..030b46a0c748d 100644
--- a/examples/model_selection/plot_nested_cross_validation_iris.py
+++ b/examples/model_selection/plot_nested_cross_validation_iris.py
@@ -60,8 +60,7 @@
 y_iris = iris.target
 
 # Set up possible values of parameters to optimize over
-p_grid = {"C": [1, 10, 100],
-          "gamma": [.01, .1]}
+p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]}
 
 # We will use a Support Vector Classifier with "rbf" kernel
 svm = SVC(kernel="rbf")
@@ -91,28 +90,39 @@
 
 score_difference = non_nested_scores - nested_scores
 
-print("Average difference of {:6f} with std. dev. of {:6f}."
-      .format(score_difference.mean(), score_difference.std()))
+print(
+    "Average difference of {:6f} with std. dev. of {:6f}.".format(
+        score_difference.mean(), score_difference.std()
+    )
+)
 
 # Plot scores on each trial for nested and non-nested CV
 plt.figure()
 plt.subplot(211)
-non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
-nested_line, = plt.plot(nested_scores, color='b')
+(non_nested_scores_line,) = plt.plot(non_nested_scores, color="r")
+(nested_line,) = plt.plot(nested_scores, color="b")
 plt.ylabel("score", fontsize="14")
-plt.legend([non_nested_scores_line, nested_line],
-           ["Non-Nested CV", "Nested CV"],
-           bbox_to_anchor=(0, .4, .5, 0))
-plt.title("Non-Nested and Nested Cross Validation on Iris Dataset",
-          x=.5, y=1.1, fontsize="15")
+plt.legend(
+    [non_nested_scores_line, nested_line],
+    ["Non-Nested CV", "Nested CV"],
+    bbox_to_anchor=(0, 0.4, 0.5, 0),
+)
+plt.title(
+    "Non-Nested and Nested Cross Validation on Iris Dataset",
+    x=0.5,
+    y=1.1,
+    fontsize="15",
+)
 
 # Plot bar chart of the difference.
 plt.subplot(212)
 difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
 plt.xlabel("Individual Trial #")
-plt.legend([difference_plot],
-           ["Non-Nested CV - Nested CV Score"],
-           bbox_to_anchor=(0, 1, .8, 0))
+plt.legend(
+    [difference_plot],
+    ["Non-Nested CV - Nested CV Score"],
+    bbox_to_anchor=(0, 1, 0.8, 0),
+)
 plt.ylabel("score difference", fontsize="14")
 
 plt.show()
diff --git a/examples/model_selection/plot_randomized_search.py b/examples/model_selection/plot_randomized_search.py
index ff5b51837ed2a..7893f704404ac 100644
--- a/examples/model_selection/plot_randomized_search.py
+++ b/examples/model_selection/plot_randomized_search.py
@@ -34,49 +34,60 @@
 X, y = load_digits(return_X_y=True)
 
 # build a classifier
-clf = SGDClassifier(loss='hinge', penalty='elasticnet',
-                    fit_intercept=True)
+clf = SGDClassifier(loss="hinge", penalty="elasticnet", fit_intercept=True)
 
 
 # Utility function to report best scores
 def report(results, n_top=3):
     for i in range(1, n_top + 1):
-        candidates = np.flatnonzero(results['rank_test_score'] == i)
+        candidates = np.flatnonzero(results["rank_test_score"] == i)
         for candidate in candidates:
             print("Model with rank: {0}".format(i))
-            print("Mean validation score: {0:.3f} (std: {1:.3f})"
-                  .format(results['mean_test_score'][candidate],
-                          results['std_test_score'][candidate]))
-            print("Parameters: {0}".format(results['params'][candidate]))
+            print(
+                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
+                    results["mean_test_score"][candidate],
+                    results["std_test_score"][candidate],
+                )
+            )
+            print("Parameters: {0}".format(results["params"][candidate]))
             print("")
 
 
 # specify parameters and distributions to sample from
-param_dist = {'average': [True, False],
-              'l1_ratio': stats.uniform(0, 1),
-              'alpha': loguniform(1e-4, 1e0)}
+param_dist = {
+    "average": [True, False],
+    "l1_ratio": stats.uniform(0, 1),
+    "alpha": loguniform(1e-4, 1e0),
+}
 
 # run randomized search
 n_iter_search = 20
-random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
-                                   n_iter=n_iter_search)
+random_search = RandomizedSearchCV(
+    clf, param_distributions=param_dist, n_iter=n_iter_search
+)
 
 start = time()
 random_search.fit(X, y)
-print("RandomizedSearchCV took %.2f seconds for %d candidates"
-      " parameter settings." % ((time() - start), n_iter_search))
+print(
+    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
+    % ((time() - start), n_iter_search)
+)
 report(random_search.cv_results_)
 
 # use a full grid over all parameters
-param_grid = {'average': [True, False],
-              'l1_ratio': np.linspace(0, 1, num=10),
-              'alpha': np.power(10, np.arange(-4, 1, dtype=float))}
+param_grid = {
+    "average": [True, False],
+    "l1_ratio": np.linspace(0, 1, num=10),
+    "alpha": np.power(10, np.arange(-4, 1, dtype=float)),
+}
 
 # run grid search
 grid_search = GridSearchCV(clf, param_grid=param_grid)
 start = time()
 grid_search.fit(X, y)
 
-print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
-      % (time() - start, len(grid_search.cv_results_['params'])))
+print(
+    "GridSearchCV took %.2f seconds for %d candidate parameter settings."
+    % (time() - start, len(grid_search.cv_results_["params"]))
+)
 report(grid_search.cv_results_)
diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py
index 86ca06eb00445..5cd76faf829d8 100644
--- a/examples/model_selection/plot_roc.py
+++ b/examples/model_selection/plot_roc.py
@@ -61,12 +61,12 @@
 X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
 
 # shuffle and split training and test sets
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
-                                                    random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
 
 # Learn to predict each class against the other
-classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
-                                 random_state=random_state))
+classifier = OneVsRestClassifier(
+    svm.SVC(kernel="linear", probability=True, random_state=random_state)
+)
 y_score = classifier.fit(X_train, y_train).decision_function(X_test)
 
 # Compute ROC curve and ROC area for each class
@@ -86,14 +86,19 @@
 # Plot of a ROC curve for a specific class
 plt.figure()
 lw = 2
-plt.plot(fpr[2], tpr[2], color='darkorange',
-         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
-plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+plt.plot(
+    fpr[2],
+    tpr[2],
+    color="darkorange",
+    lw=lw,
+    label="ROC curve (area = %0.2f)" % roc_auc[2],
+)
+plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
 plt.xlim([0.0, 1.0])
 plt.ylim([0.0, 1.05])
-plt.xlabel('False Positive Rate')
-plt.ylabel('True Positive Rate')
-plt.title('Receiver operating characteristic example')
+plt.xlabel("False Positive Rate")
+plt.ylabel("True Positive Rate")
+plt.title("Receiver operating characteristic example")
 plt.legend(loc="lower right")
 plt.show()
 
@@ -120,28 +125,40 @@
 
 # Plot all ROC curves
 plt.figure()
-plt.plot(fpr["micro"], tpr["micro"],
-         label='micro-average ROC curve (area = {0:0.2f})'
-               ''.format(roc_auc["micro"]),
-         color='deeppink', linestyle=':', linewidth=4)
-
-plt.plot(fpr["macro"], tpr["macro"],
-         label='macro-average ROC curve (area = {0:0.2f})'
-               ''.format(roc_auc["macro"]),
-         color='navy', linestyle=':', linewidth=4)
-
-colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
+plt.plot(
+    fpr["micro"],
+    tpr["micro"],
+    label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
+    color="deeppink",
+    linestyle=":",
+    linewidth=4,
+)
+
+plt.plot(
+    fpr["macro"],
+    tpr["macro"],
+    label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
+    color="navy",
+    linestyle=":",
+    linewidth=4,
+)
+
+colors = cycle(["aqua", "darkorange", "cornflowerblue"])
 for i, color in zip(range(n_classes), colors):
-    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
-             label='ROC curve of class {0} (area = {1:0.2f})'
-             ''.format(i, roc_auc[i]))
-
-plt.plot([0, 1], [0, 1], 'k--', lw=lw)
+    plt.plot(
+        fpr[i],
+        tpr[i],
+        color=color,
+        lw=lw,
+        label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),
+    )
+
+plt.plot([0, 1], [0, 1], "k--", lw=lw)
 plt.xlim([0.0, 1.0])
 plt.ylim([0.0, 1.05])
-plt.xlabel('False Positive Rate')
-plt.ylabel('True Positive Rate')
-plt.title('Some extension of Receiver operating characteristic to multi-class')
+plt.xlabel("False Positive Rate")
+plt.ylabel("True Positive Rate")
+plt.title("Some extension of Receiver operating characteristic to multi-class")
 plt.legend(loc="lower right")
 plt.show()
 
@@ -156,17 +173,19 @@
 # prevalence-weighted average.
 y_prob = classifier.predict_proba(X_test)
 
-macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
-                                  average="macro")
-weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
-                                     average="weighted")
-macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
-                                  average="macro")
-weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
-                                     average="weighted")
-print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
-      "(weighted by prevalence)"
-      .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))
-print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
-      "(weighted by prevalence)"
-      .format(macro_roc_auc_ovr, weighted_roc_auc_ovr))
+macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="macro")
+weighted_roc_auc_ovo = roc_auc_score(
+    y_test, y_prob, multi_class="ovo", average="weighted"
+)
+macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="macro")
+weighted_roc_auc_ovr = roc_auc_score(
+    y_test, y_prob, multi_class="ovr", average="weighted"
+)
+print(
+    "One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
+    "(weighted by prevalence)".format(macro_roc_auc_ovo, weighted_roc_auc_ovo)
+)
+print(
+    "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
+    "(weighted by prevalence)".format(macro_roc_auc_ovr, weighted_roc_auc_ovr)
+)
diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py
index 04a2d19df89f7..c78de42ab9359 100644
--- a/examples/model_selection/plot_successive_halving_heatmap.py
+++ b/examples/model_selection/plot_successive_halving_heatmap.py
@@ -33,13 +33,14 @@
 
 gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
 Cs = [1, 10, 100, 1e3, 1e4, 1e5]
-param_grid = {'gamma': gammas, 'C': Cs}
+param_grid = {"gamma": gammas, "C": Cs}
 
 clf = SVC(random_state=rng)
 
 tic = time()
-gsh = HalvingGridSearchCV(estimator=clf, param_grid=param_grid, factor=2,
-                          random_state=rng)
+gsh = HalvingGridSearchCV(
+    estimator=clf, param_grid=param_grid, factor=2, random_state=rng
+)
 gsh.fit(X, y)
 gsh_time = time() - tic
 
@@ -55,46 +56,54 @@
 def make_heatmap(ax, gs, is_sh=False, make_cbar=False):
     """Helper to make a heatmap."""
     results = pd.DataFrame.from_dict(gs.cv_results_)
-    results['params_str'] = results.params.apply(str)
+    results["params_str"] = results.params.apply(str)
     if is_sh:
         # SH dataframe: get mean_test_score values for the highest iter
-        scores_matrix = results.sort_values('iter').pivot_table(
-                index='param_gamma', columns='param_C',
-                values='mean_test_score', aggfunc='last'
+        scores_matrix = results.sort_values("iter").pivot_table(
+            index="param_gamma",
+            columns="param_C",
+            values="mean_test_score",
+            aggfunc="last",
         )
     else:
-        scores_matrix = results.pivot(index='param_gamma', columns='param_C',
-                                      values='mean_test_score')
+        scores_matrix = results.pivot(
+            index="param_gamma", columns="param_C", values="mean_test_score"
+        )
 
     im = ax.imshow(scores_matrix)
 
     ax.set_xticks(np.arange(len(Cs)))
-    ax.set_xticklabels(['{:.0E}'.format(x) for x in Cs])
-    ax.set_xlabel('C', fontsize=15)
+    ax.set_xticklabels(["{:.0E}".format(x) for x in Cs])
+    ax.set_xlabel("C", fontsize=15)
 
     ax.set_yticks(np.arange(len(gammas)))
-    ax.set_yticklabels(['{:.0E}'.format(x) for x in gammas])
-    ax.set_ylabel('gamma', fontsize=15)
+    ax.set_yticklabels(["{:.0E}".format(x) for x in gammas])
+    ax.set_ylabel("gamma", fontsize=15)
 
     # Rotate the tick labels and set their alignment.
-    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
-             rotation_mode="anchor")
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
 
     if is_sh:
-        iterations = results.pivot_table(index='param_gamma',
-                                         columns='param_C', values='iter',
-                                         aggfunc='max').values
+        iterations = results.pivot_table(
+            index="param_gamma", columns="param_C", values="iter", aggfunc="max"
+        ).values
         for i in range(len(gammas)):
             for j in range(len(Cs)):
-                ax.text(j, i, iterations[i, j],
-                        ha="center", va="center", color="w", fontsize=20)
+                ax.text(
+                    j,
+                    i,
+                    iterations[i, j],
+                    ha="center",
+                    va="center",
+                    color="w",
+                    fontsize=20,
+                )
 
     if make_cbar:
         fig.subplots_adjust(right=0.8)
         cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
         fig.colorbar(im, cax=cbar_ax)
-        cbar_ax.set_ylabel('mean_test_score', rotation=-90, va="bottom",
-                           fontsize=15)
+        cbar_ax.set_ylabel("mean_test_score", rotation=-90, va="bottom", fontsize=15)
 
 
 fig, axes = plt.subplots(ncols=2, sharey=True)
@@ -103,9 +112,8 @@ def make_heatmap(ax, gs, is_sh=False, make_cbar=False):
 make_heatmap(ax1, gsh, is_sh=True)
 make_heatmap(ax2, gs, make_cbar=True)
 
-ax1.set_title('Successive Halving\ntime = {:.3f}s'.format(gsh_time),
-              fontsize=15)
-ax2.set_title('GridSearch\ntime = {:.3f}s'.format(gs_time), fontsize=15)
+ax1.set_title("Successive Halving\ntime = {:.3f}s".format(gsh_time), fontsize=15)
+ax2.set_title("GridSearch\ntime = {:.3f}s".format(gs_time), fontsize=15)
 
 plt.show()
 
diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py
index 53d33849e9801..11c204ef939d5 100644
--- a/examples/model_selection/plot_successive_halving_iterations.py
+++ b/examples/model_selection/plot_successive_halving_iterations.py
@@ -32,17 +32,17 @@
 
 clf = RandomForestClassifier(n_estimators=20, random_state=rng)
 
-param_dist = {"max_depth": [3, None],
-              "max_features": randint(1, 11),
-              "min_samples_split": randint(2, 11),
-              "bootstrap": [True, False],
-              "criterion": ["gini", "entropy"]}
+param_dist = {
+    "max_depth": [3, None],
+    "max_features": randint(1, 11),
+    "min_samples_split": randint(2, 11),
+    "bootstrap": [True, False],
+    "criterion": ["gini", "entropy"],
+}
 
 rsh = HalvingRandomSearchCV(
-    estimator=clf,
-    param_distributions=param_dist,
-    factor=2,
-    random_state=rng)
+    estimator=clf, param_distributions=param_dist, factor=2, random_state=rng
+)
 rsh.fit(X, y)
 
 # %%
@@ -50,23 +50,23 @@
 # and plot the evolution of the search.
 
 results = pd.DataFrame(rsh.cv_results_)
-results['params_str'] = results.params.apply(str)
-results.drop_duplicates(subset=('params_str', 'iter'), inplace=True)
-mean_scores = results.pivot(index='iter', columns='params_str',
-                            values='mean_test_score')
-ax = mean_scores.plot(legend=False, alpha=.6)
+results["params_str"] = results.params.apply(str)
+results.drop_duplicates(subset=("params_str", "iter"), inplace=True)
+mean_scores = results.pivot(
+    index="iter", columns="params_str", values="mean_test_score"
+)
+ax = mean_scores.plot(legend=False, alpha=0.6)
 
 labels = [
-    f'iter={i}\nn_samples={rsh.n_resources_[i]}\n'
-    f'n_candidates={rsh.n_candidates_[i]}'
+    f"iter={i}\nn_samples={rsh.n_resources_[i]}\nn_candidates={rsh.n_candidates_[i]}"
     for i in range(rsh.n_iterations_)
 ]
 
 ax.set_xticks(range(rsh.n_iterations_))
-ax.set_xticklabels(labels, rotation=45, multialignment='left')
-ax.set_title('Scores of candidates over iterations')
-ax.set_ylabel('mean test score', fontsize=15)
-ax.set_xlabel('iterations', fontsize=15)
+ax.set_xticklabels(labels, rotation=45, multialignment="left")
+ax.set_title("Scores of candidates over iterations")
+ax.set_ylabel("mean test score", fontsize=15)
+ax.set_xlabel("iterations", fontsize=15)
 plt.tight_layout()
 plt.show()
 
diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py
index f32d6e46f9933..d9c00805aa390 100644
--- a/examples/model_selection/plot_train_error_vs_test_error.py
+++ b/examples/model_selection/plot_train_error_vs_test_error.py
@@ -56,20 +56,27 @@
 # Plot results functions
 
 import matplotlib.pyplot as plt
+
 plt.subplot(2, 1, 1)
-plt.semilogx(alphas, train_errors, label='Train')
-plt.semilogx(alphas, test_errors, label='Test')
-plt.vlines(alpha_optim, plt.ylim()[0], np.max(test_errors), color='k',
-           linewidth=3, label='Optimum on test')
-plt.legend(loc='lower left')
+plt.semilogx(alphas, train_errors, label="Train")
+plt.semilogx(alphas, test_errors, label="Test")
+plt.vlines(
+    alpha_optim,
+    plt.ylim()[0],
+    np.max(test_errors),
+    color="k",
+    linewidth=3,
+    label="Optimum on test",
+)
+plt.legend(loc="lower left")
 plt.ylim([0, 1.2])
-plt.xlabel('Regularization parameter')
-plt.ylabel('Performance')
+plt.xlabel("Regularization parameter")
+plt.ylabel("Performance")
 
 # Show estimated coef_ vs true coef
 plt.subplot(2, 1, 2)
-plt.plot(coef, label='True coef')
-plt.plot(coef_, label='Estimated coef')
+plt.plot(coef, label="True coef")
+plt.plot(coef_, label="Estimated coef")
 plt.legend()
 plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26)
 plt.show()
diff --git a/examples/model_selection/plot_underfitting_overfitting.py b/examples/model_selection/plot_underfitting_overfitting.py
index fcd799923f625..c93da3f2dc5b5 100644
--- a/examples/model_selection/plot_underfitting_overfitting.py
+++ b/examples/model_selection/plot_underfitting_overfitting.py
@@ -47,26 +47,33 @@ def true_fun(X):
     ax = plt.subplot(1, len(degrees), i + 1)
     plt.setp(ax, xticks=(), yticks=())
 
-    polynomial_features = PolynomialFeatures(degree=degrees[i],
-                                             include_bias=False)
+    polynomial_features = PolynomialFeatures(degree=degrees[i], include_bias=False)
     linear_regression = LinearRegression()
-    pipeline = Pipeline([("polynomial_features", polynomial_features),
-                         ("linear_regression", linear_regression)])
+    pipeline = Pipeline(
+        [
+            ("polynomial_features", polynomial_features),
+            ("linear_regression", linear_regression),
+        ]
+    )
     pipeline.fit(X[:, np.newaxis], y)
 
     # Evaluate the models using crossvalidation
-    scores = cross_val_score(pipeline, X[:, np.newaxis], y,
-                             scoring="neg_mean_squared_error", cv=10)
+    scores = cross_val_score(
+        pipeline, X[:, np.newaxis], y, scoring="neg_mean_squared_error", cv=10
+    )
 
     X_test = np.linspace(0, 1, 100)
     plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
     plt.plot(X_test, true_fun(X_test), label="True function")
-    plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
+    plt.scatter(X, y, edgecolor="b", s=20, label="Samples")
     plt.xlabel("x")
     plt.ylabel("y")
     plt.xlim((0, 1))
     plt.ylim((-2, 2))
     plt.legend(loc="best")
-    plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
-        degrees[i], -scores.mean(), scores.std()))
+    plt.title(
+        "Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
+            degrees[i], -scores.mean(), scores.std()
+        )
+    )
 plt.show()
diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py
index 0eb3850787c53..524dd71eda40f 100644
--- a/examples/model_selection/plot_validation_curve.py
+++ b/examples/model_selection/plot_validation_curve.py
@@ -24,8 +24,14 @@
 
 param_range = np.logspace(-6, -1, 5)
 train_scores, test_scores = validation_curve(
-    SVC(), X, y, param_name="gamma", param_range=param_range,
-    scoring="accuracy", n_jobs=1)
+    SVC(),
+    X,
+    y,
+    param_name="gamma",
+    param_range=param_range,
+    scoring="accuracy",
+    n_jobs=1,
+)
 train_scores_mean = np.mean(train_scores, axis=1)
 train_scores_std = np.std(train_scores, axis=1)
 test_scores_mean = np.mean(test_scores, axis=1)
@@ -36,15 +42,27 @@
 plt.ylabel("Score")
 plt.ylim(0.0, 1.1)
 lw = 2
-plt.semilogx(param_range, train_scores_mean, label="Training score",
-             color="darkorange", lw=lw)
-plt.fill_between(param_range, train_scores_mean - train_scores_std,
-                 train_scores_mean + train_scores_std, alpha=0.2,
-                 color="darkorange", lw=lw)
-plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
-             color="navy", lw=lw)
-plt.fill_between(param_range, test_scores_mean - test_scores_std,
-                 test_scores_mean + test_scores_std, alpha=0.2,
-                 color="navy", lw=lw)
+plt.semilogx(
+    param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw
+)
+plt.fill_between(
+    param_range,
+    train_scores_mean - train_scores_std,
+    train_scores_mean + train_scores_std,
+    alpha=0.2,
+    color="darkorange",
+    lw=lw,
+)
+plt.semilogx(
+    param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw
+)
+plt.fill_between(
+    param_range,
+    test_scores_mean - test_scores_std,
+    test_scores_mean + test_scores_std,
+    alpha=0.2,
+    color="navy",
+    lw=lw,
+)
 plt.legend(loc="best")
 plt.show()
diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py
index 7ae80af3fdab3..a92249b97e4ad 100644
--- a/examples/multioutput/plot_classifier_chain_yeast.py
+++ b/examples/multioutput/plot_classifier_chain_yeast.py
@@ -47,10 +47,9 @@
 print(__doc__)
 
 # Load a multi-label dataset from https://www.openml.org/d/40597
-X, Y = fetch_openml('yeast', version=4, return_X_y=True)
-Y = Y == 'TRUE'
-X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
-                                                    random_state=0)
+X, Y = fetch_openml("yeast", version=4, return_X_y=True)
+Y = Y == "TRUE"
+X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
 
 # Fit an independent logistic regression model for each class using the
 # OneVsRestClassifier wrapper.
@@ -58,41 +57,42 @@
 ovr = OneVsRestClassifier(base_lr)
 ovr.fit(X_train, Y_train)
 Y_pred_ovr = ovr.predict(X_test)
-ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')
+ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average="samples")
 
 # Fit an ensemble of logistic regression classifier chains and take the
 # take the average prediction of all the chains.
-chains = [ClassifierChain(base_lr, order='random', random_state=i)
-          for i in range(10)]
+chains = [ClassifierChain(base_lr, order="random", random_state=i) for i in range(10)]
 for chain in chains:
     chain.fit(X_train, Y_train)
 
-Y_pred_chains = np.array([chain.predict(X_test) for chain in
-                          chains])
-chain_jaccard_scores = [jaccard_score(Y_test, Y_pred_chain >= .5,
-                                      average='samples')
-                        for Y_pred_chain in Y_pred_chains]
+Y_pred_chains = np.array([chain.predict(X_test) for chain in chains])
+chain_jaccard_scores = [
+    jaccard_score(Y_test, Y_pred_chain >= 0.5, average="samples")
+    for Y_pred_chain in Y_pred_chains
+]
 
 Y_pred_ensemble = Y_pred_chains.mean(axis=0)
-ensemble_jaccard_score = jaccard_score(Y_test,
-                                       Y_pred_ensemble >= .5,
-                                       average='samples')
+ensemble_jaccard_score = jaccard_score(
+    Y_test, Y_pred_ensemble >= 0.5, average="samples"
+)
 
 model_scores = [ovr_jaccard_score] + chain_jaccard_scores
 model_scores.append(ensemble_jaccard_score)
 
-model_names = ('Independent',
-               'Chain 1',
-               'Chain 2',
-               'Chain 3',
-               'Chain 4',
-               'Chain 5',
-               'Chain 6',
-               'Chain 7',
-               'Chain 8',
-               'Chain 9',
-               'Chain 10',
-               'Ensemble')
+model_names = (
+    "Independent",
+    "Chain 1",
+    "Chain 2",
+    "Chain 3",
+    "Chain 4",
+    "Chain 5",
+    "Chain 6",
+    "Chain 7",
+    "Chain 8",
+    "Chain 9",
+    "Chain 10",
+    "Ensemble",
+)
 
 x_pos = np.arange(len(model_names))
 
@@ -102,12 +102,12 @@
 
 fig, ax = plt.subplots(figsize=(7, 4))
 ax.grid(True)
-ax.set_title('Classifier Chain Ensemble Performance Comparison')
+ax.set_title("Classifier Chain Ensemble Performance Comparison")
 ax.set_xticks(x_pos)
-ax.set_xticklabels(model_names, rotation='vertical')
-ax.set_ylabel('Jaccard Similarity Score')
-ax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1])
-colors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']
+ax.set_xticklabels(model_names, rotation="vertical")
+ax.set_ylabel("Jaccard Similarity Score")
+ax.set_ylim([min(model_scores) * 0.9, max(model_scores) * 1.1])
+colors = ["r"] + ["b"] * len(chain_jaccard_scores) + ["g"]
 ax.bar(x_pos, model_scores, alpha=0.5, color=colors)
 plt.tight_layout()
 plt.show()
diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py
index 78f5f184a0da7..055796600fd4e 100644
--- a/examples/neighbors/approximate_nearest_neighbors.py
+++ b/examples/neighbors/approximate_nearest_neighbors.py
@@ -73,8 +73,7 @@
 class NMSlibTransformer(TransformerMixin, BaseEstimator):
     """Wrapper for using nmslib as sklearn's KNeighborsTransformer"""
 
-    def __init__(self, n_neighbors=5, metric='euclidean', method='sw-graph',
-                 n_jobs=1):
+    def __init__(self, n_neighbors=5, metric="euclidean", method="sw-graph", n_jobs=1):
         self.n_neighbors = n_neighbors
         self.method = method
         self.metric = metric
@@ -86,10 +85,10 @@ def fit(self, X):
         # see more metric in the manual
         # https://github.com/nmslib/nmslib/tree/master/manual
         space = {
-            'euclidean': 'l2',
-            'cosine': 'cosinesimil',
-            'l1': 'l1',
-            'l2': 'l2',
+            "euclidean": "l2",
+            "cosine": "cosinesimil",
+            "l1": "l1",
+            "l2": "l2",
         }[self.metric]
 
         self.nmslib_ = nmslib.init(method=self.method, space=space)
@@ -104,16 +103,15 @@ def transform(self, X):
         # neighbor, one extra neighbor will be computed.
         n_neighbors = self.n_neighbors + 1
 
-        results = self.nmslib_.knnQueryBatch(X, k=n_neighbors,
-                                             num_threads=self.n_jobs)
+        results = self.nmslib_.knnQueryBatch(X, k=n_neighbors, num_threads=self.n_jobs)
         indices, distances = zip(*results)
         indices, distances = np.vstack(indices), np.vstack(distances)
 
-        indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
-                           n_neighbors)
-        kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
-                                       indptr), shape=(n_samples_transform,
-                                                       self.n_samples_fit_))
+        indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors)
+        kneighbors_graph = csr_matrix(
+            (distances.ravel(), indices.ravel(), indptr),
+            shape=(n_samples_transform, self.n_samples_fit_),
+        )
 
         return kneighbors_graph
 
@@ -121,8 +119,7 @@ def transform(self, X):
 class AnnoyTransformer(TransformerMixin, BaseEstimator):
     """Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer"""
 
-    def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10,
-                 search_k=-1):
+    def __init__(self, n_neighbors=5, metric="euclidean", n_trees=10, search_k=-1):
         self.n_neighbors = n_neighbors
         self.n_trees = n_trees
         self.search_k = search_k
@@ -151,34 +148,33 @@ def _transform(self, X):
         # neighbor, one extra neighbor will be computed.
         n_neighbors = self.n_neighbors + 1
 
-        indices = np.empty((n_samples_transform, n_neighbors),
-                           dtype=int)
+        indices = np.empty((n_samples_transform, n_neighbors), dtype=int)
         distances = np.empty((n_samples_transform, n_neighbors))
 
         if X is None:
             for i in range(self.annoy_.get_n_items()):
                 ind, dist = self.annoy_.get_nns_by_item(
-                    i, n_neighbors, self.search_k, include_distances=True)
+                    i, n_neighbors, self.search_k, include_distances=True
+                )
 
                 indices[i], distances[i] = ind, dist
         else:
             for i, x in enumerate(X):
                 indices[i], distances[i] = self.annoy_.get_nns_by_vector(
-                    x.tolist(), n_neighbors, self.search_k,
-                    include_distances=True)
+                    x.tolist(), n_neighbors, self.search_k, include_distances=True
+                )
 
-        indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
-                           n_neighbors)
-        kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
-                                       indptr), shape=(n_samples_transform,
-                                                       self.n_samples_fit_))
+        indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors)
+        kneighbors_graph = csr_matrix(
+            (distances.ravel(), indices.ravel(), indptr),
+            shape=(n_samples_transform, self.n_samples_fit_),
+        )
 
         return kneighbors_graph
 
 
 def test_transformers():
-    """Test that AnnoyTransformer and KNeighborsTransformer give same results
-    """
+    """Test that AnnoyTransformer and KNeighborsTransformer give same results"""
     X = np.random.RandomState(42).randn(10, 2)
 
     knn = KNeighborsTransformer()
@@ -203,8 +199,8 @@ def load_mnist(n_samples):
 
 def run_benchmark():
     datasets = [
-        ('MNIST_2000', load_mnist(n_samples=2000)),
-        ('MNIST_10000', load_mnist(n_samples=10000)),
+        ("MNIST_2000", load_mnist(n_samples=2000)),
+        ("MNIST_10000", load_mnist(n_samples=10000)),
     ]
 
     n_iter = 500
@@ -213,49 +209,67 @@ def run_benchmark():
     # TSNE requires a certain number of neighbors which depends on the
     # perplexity parameter.
     # Add one since we include each sample as its own neighbor.
-    n_neighbors = int(3. * perplexity + 1) + 1
+    n_neighbors = int(3.0 * perplexity + 1) + 1
 
-    tsne_params = dict(perplexity=perplexity, method="barnes_hut",
-                       random_state=42, n_iter=n_iter,
-                       square_distances=True)
+    tsne_params = dict(
+        perplexity=perplexity,
+        method="barnes_hut",
+        random_state=42,
+        n_iter=n_iter,
+        square_distances=True,
+    )
 
     transformers = [
-        ('AnnoyTransformer',
-         AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)),
-        ('NMSlibTransformer',
-         NMSlibTransformer(n_neighbors=n_neighbors, metric=metric)),
-        ('KNeighborsTransformer',
-         KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
-                               metric=metric)),
-        ('TSNE with AnnoyTransformer',
-         make_pipeline(
-             AnnoyTransformer(n_neighbors=n_neighbors, metric=metric),
-             TSNE(metric='precomputed', **tsne_params))),
-        ('TSNE with NMSlibTransformer',
-         make_pipeline(
-             NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
-             TSNE(metric='precomputed', **tsne_params))),
-        ('TSNE with KNeighborsTransformer',
-         make_pipeline(
-             KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
-                                   metric=metric),
-             TSNE(metric='precomputed', **tsne_params))),
-        ('TSNE with internal NearestNeighbors',
-         TSNE(metric=metric, **tsne_params)),
+        ("AnnoyTransformer", AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)),
+        (
+            "NMSlibTransformer",
+            NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
+        ),
+        (
+            "KNeighborsTransformer",
+            KNeighborsTransformer(
+                n_neighbors=n_neighbors, mode="distance", metric=metric
+            ),
+        ),
+        (
+            "TSNE with AnnoyTransformer",
+            make_pipeline(
+                AnnoyTransformer(n_neighbors=n_neighbors, metric=metric),
+                TSNE(metric="precomputed", **tsne_params),
+            ),
+        ),
+        (
+            "TSNE with NMSlibTransformer",
+            make_pipeline(
+                NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
+                TSNE(metric="precomputed", **tsne_params),
+            ),
+        ),
+        (
+            "TSNE with KNeighborsTransformer",
+            make_pipeline(
+                KNeighborsTransformer(
+                    n_neighbors=n_neighbors, mode="distance", metric=metric
+                ),
+                TSNE(metric="precomputed", **tsne_params),
+            ),
+        ),
+        ("TSNE with internal NearestNeighbors", TSNE(metric=metric, **tsne_params)),
     ]
 
     # init the plot
     nrows = len(datasets)
-    ncols = np.sum([1 for name, model in transformers if 'TSNE' in name])
-    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False,
-                             figsize=(5 * ncols, 4 * nrows))
+    ncols = np.sum([1 for name, model in transformers if "TSNE" in name])
+    fig, axes = plt.subplots(
+        nrows=nrows, ncols=ncols, squeeze=False, figsize=(5 * ncols, 4 * nrows)
+    )
     axes = axes.ravel()
     i_ax = 0
 
     for dataset_name, (X, y) in datasets:
 
-        msg = 'Benchmarking on %s:' % dataset_name
-        print('\n%s\n%s' % (msg, '-' * len(msg)))
+        msg = "Benchmarking on %s:" % dataset_name
+        print("\n%s\n%s" % (msg, "-" * len(msg)))
 
         for transformer_name, transformer in transformers:
             start = time.time()
@@ -264,23 +278,28 @@ def run_benchmark():
 
             # print the duration report
             longest = np.max([len(name) for name, model in transformers])
-            whitespaces = ' ' * (longest - len(transformer_name))
-            print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration))
+            whitespaces = " " * (longest - len(transformer_name))
+            print("%s: %s%.3f sec" % (transformer_name, whitespaces, duration))
 
             # plot TSNE embedding which should be very similar across methods
-            if 'TSNE' in transformer_name:
-                axes[i_ax].set_title(transformer_name + '\non ' + dataset_name)
-                axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y.astype(np.int32),
-                                   alpha=0.2, cmap=plt.cm.viridis)
+            if "TSNE" in transformer_name:
+                axes[i_ax].set_title(transformer_name + "\non " + dataset_name)
+                axes[i_ax].scatter(
+                    Xt[:, 0],
+                    Xt[:, 1],
+                    c=y.astype(np.int32),
+                    alpha=0.2,
+                    cmap=plt.cm.viridis,
+                )
                 axes[i_ax].xaxis.set_major_formatter(NullFormatter())
                 axes[i_ax].yaxis.set_major_formatter(NullFormatter())
-                axes[i_ax].axis('tight')
+                axes[i_ax].axis("tight")
                 i_ax += 1
 
     fig.tight_layout()
     plt.show()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test_transformers()
     run_benchmark()
diff --git a/examples/neighbors/plot_caching_nearest_neighbors.py b/examples/neighbors/plot_caching_nearest_neighbors.py
index a97ed3f2983a4..0ffe6b92d3053 100644
--- a/examples/neighbors/plot_caching_nearest_neighbors.py
+++ b/examples/neighbors/plot_caching_nearest_neighbors.py
@@ -35,30 +35,35 @@
 # The transformer computes the nearest neighbors graph using the maximum number
 # of neighbors necessary in the grid search. The classifier model filters the
 # nearest neighbors graph as required by its own n_neighbors parameter.
-graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list),
-                                    mode='distance')
-classifier_model = KNeighborsClassifier(metric='precomputed')
+graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), mode="distance")
+classifier_model = KNeighborsClassifier(metric="precomputed")
 
 # Note that we give `memory` a directory to cache the graph computation
 # that will be used several times when tuning the hyperparameters of the
 # classifier.
 with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
     full_model = Pipeline(
-        steps=[('graph', graph_model), ('classifier', classifier_model)],
-        memory=tmpdir)
+        steps=[("graph", graph_model), ("classifier", classifier_model)], memory=tmpdir
+    )
 
-    param_grid = {'classifier__n_neighbors': n_neighbors_list}
+    param_grid = {"classifier__n_neighbors": n_neighbors_list}
     grid_model = GridSearchCV(full_model, param_grid)
     grid_model.fit(X, y)
 
 # Plot the results of the grid search.
 fig, axes = plt.subplots(1, 2, figsize=(8, 4))
-axes[0].errorbar(x=n_neighbors_list,
-                 y=grid_model.cv_results_['mean_test_score'],
-                 yerr=grid_model.cv_results_['std_test_score'])
-axes[0].set(xlabel='n_neighbors', title='Classification accuracy')
-axes[1].errorbar(x=n_neighbors_list, y=grid_model.cv_results_['mean_fit_time'],
-                 yerr=grid_model.cv_results_['std_fit_time'], color='r')
-axes[1].set(xlabel='n_neighbors', title='Fit time (with caching)')
+axes[0].errorbar(
+    x=n_neighbors_list,
+    y=grid_model.cv_results_["mean_test_score"],
+    yerr=grid_model.cv_results_["std_test_score"],
+)
+axes[0].set(xlabel="n_neighbors", title="Classification accuracy")
+axes[1].errorbar(
+    x=n_neighbors_list,
+    y=grid_model.cv_results_["mean_fit_time"],
+    yerr=grid_model.cv_results_["std_fit_time"],
+    color="r",
+)
+axes[1].set(xlabel="n_neighbors", title="Fit time (with caching)")
 fig.tight_layout()
 plt.show()
diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py
index 0b8d828225f39..08e22a2874a16 100644
--- a/examples/neighbors/plot_classification.py
+++ b/examples/neighbors/plot_classification.py
@@ -24,13 +24,13 @@
 X = iris.data[:, :2]
 y = iris.target
 
-h = .02  # step size in the mesh
+h = 0.02  # step size in the mesh
 
 # Create color maps
-cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
-cmap_bold = ['darkorange', 'c', 'darkblue']
+cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
+cmap_bold = ["darkorange", "c", "darkblue"]
 
-for weights in ['uniform', 'distance']:
+for weights in ["uniform", "distance"]:
     # we create an instance of Neighbours Classifier and fit the data.
     clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
     clf.fit(X, y)
@@ -39,8 +39,7 @@
     # point in the mesh [x_min, x_max]x[y_min, y_max].
     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                         np.arange(y_min, y_max, h))
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
 
     # Put the result into a color plot
@@ -49,12 +48,19 @@
     plt.contourf(xx, yy, Z, cmap=cmap_light)
 
     # Plot also the training points
-    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=iris.target_names[y],
-                    palette=cmap_bold, alpha=1.0, edgecolor="black")
+    sns.scatterplot(
+        x=X[:, 0],
+        y=X[:, 1],
+        hue=iris.target_names[y],
+        palette=cmap_bold,
+        alpha=1.0,
+        edgecolor="black",
+    )
     plt.xlim(xx.min(), xx.max())
     plt.ylim(yy.min(), yy.max())
-    plt.title("3-Class classification (k = %i, weights = '%s')"
-              % (n_neighbors, weights))
+    plt.title(
+        "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights)
+    )
     plt.xlabel(iris.feature_names[0])
     plt.ylabel(iris.feature_names[1])
 
diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py
index 8367d16b955fe..44e4b1f65c38b 100644
--- a/examples/neighbors/plot_digits_kde_sampling.py
+++ b/examples/neighbors/plot_digits_kde_sampling.py
@@ -26,7 +26,7 @@
 data = pca.fit_transform(digits.data)
 
 # use grid search cross-validation to optimize the bandwidth
-params = {'bandwidth': np.logspace(-1, 1, 20)}
+params = {"bandwidth": np.logspace(-1, 1, 20)}
 grid = GridSearchCV(KernelDensity(), params)
 grid.fit(data)
 
@@ -48,14 +48,16 @@
 for j in range(11):
     ax[4, j].set_visible(False)
     for i in range(4):
-        im = ax[i, j].imshow(real_data[i, j].reshape((8, 8)),
-                             cmap=plt.cm.binary, interpolation='nearest')
+        im = ax[i, j].imshow(
+            real_data[i, j].reshape((8, 8)), cmap=plt.cm.binary, interpolation="nearest"
+        )
         im.set_clim(0, 16)
-        im = ax[i + 5, j].imshow(new_data[i, j].reshape((8, 8)),
-                                 cmap=plt.cm.binary, interpolation='nearest')
+        im = ax[i + 5, j].imshow(
+            new_data[i, j].reshape((8, 8)), cmap=plt.cm.binary, interpolation="nearest"
+        )
         im.set_clim(0, 16)
 
-ax[0, 5].set_title('Selection from the input data')
+ax[0, 5].set_title("Selection from the input data")
 ax[5, 5].set_title('"New" digits drawn from the kernel density model')
 
 plt.show()
diff --git a/examples/neighbors/plot_kde_1d.py b/examples/neighbors/plot_kde_1d.py
index fb2699e548ade..857be4feaa367 100644
--- a/examples/neighbors/plot_kde_1d.py
+++ b/examples/neighbors/plot_kde_1d.py
@@ -36,17 +36,18 @@
 from sklearn.utils.fixes import parse_version
 
 # `normed` is being deprecated in favor of `density` in histograms
-if parse_version(matplotlib.__version__) >= parse_version('2.1'):
-    density_param = {'density': True}
+if parse_version(matplotlib.__version__) >= parse_version("2.1"):
+    density_param = {"density": True}
 else:
-    density_param = {'normed': True}
+    density_param = {"normed": True}
 
 # ----------------------------------------------------------------------
 # Plot the progression of histograms to kernels
 np.random.seed(1)
 N = 20
-X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
-                    np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]
+X = np.concatenate(
+    (np.random.normal(0, 1, int(0.3 * N)), np.random.normal(5, 1, int(0.7 * N)))
+)[:, np.newaxis]
 X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
 bins = np.linspace(-5, 10, 10)
 
@@ -54,35 +55,35 @@
 fig.subplots_adjust(hspace=0.05, wspace=0.05)
 
 # histogram 1
-ax[0, 0].hist(X[:, 0], bins=bins, fc='#AAAAFF', **density_param)
+ax[0, 0].hist(X[:, 0], bins=bins, fc="#AAAAFF", **density_param)
 ax[0, 0].text(-3.5, 0.31, "Histogram")
 
 # histogram 2
-ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc='#AAAAFF', **density_param)
+ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc="#AAAAFF", **density_param)
 ax[0, 1].text(-3.5, 0.31, "Histogram, bins shifted")
 
 # tophat KDE
-kde = KernelDensity(kernel='tophat', bandwidth=0.75).fit(X)
+kde = KernelDensity(kernel="tophat", bandwidth=0.75).fit(X)
 log_dens = kde.score_samples(X_plot)
-ax[1, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
+ax[1, 0].fill(X_plot[:, 0], np.exp(log_dens), fc="#AAAAFF")
 ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density")
 
 # Gaussian KDE
-kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
+kde = KernelDensity(kernel="gaussian", bandwidth=0.75).fit(X)
 log_dens = kde.score_samples(X_plot)
-ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
+ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc="#AAAAFF")
 ax[1, 1].text(-3.5, 0.31, "Gaussian Kernel Density")
 
 for axi in ax.ravel():
-    axi.plot(X[:, 0], np.full(X.shape[0], -0.01), '+k')
+    axi.plot(X[:, 0], np.full(X.shape[0], -0.01), "+k")
     axi.set_xlim(-4, 9)
     axi.set_ylim(-0.02, 0.34)
 
 for axi in ax[:, 0]:
-    axi.set_ylabel('Normalized Density')
+    axi.set_ylabel("Normalized Density")
 
 for axi in ax[1, :]:
-    axi.set_xlabel('x')
+    axi.set_xlabel("x")
 
 # ----------------------------------------------------------------------
 # Plot all available kernels
@@ -95,20 +96,21 @@
 
 def format_func(x, loc):
     if x == 0:
-        return '0'
+        return "0"
     elif x == 1:
-        return 'h'
+        return "h"
     elif x == -1:
-        return '-h'
+        return "-h"
     else:
-        return '%ih' % x
+        return "%ih" % x
 
 
-for i, kernel in enumerate(['gaussian', 'tophat', 'epanechnikov',
-                            'exponential', 'linear', 'cosine']):
+for i, kernel in enumerate(
+    ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
+):
     axi = ax.ravel()[i]
     log_dens = KernelDensity(kernel=kernel).fit(X_src).score_samples(X_plot)
-    axi.fill(X_plot[:, 0], np.exp(log_dens), '-k', fc='#AAAAFF')
+    axi.fill(X_plot[:, 0], np.exp(log_dens), "-k", fc="#AAAAFF")
     axi.text(-2.6, 0.95, kernel)
 
     axi.xaxis.set_major_formatter(plt.FuncFormatter(format_func))
@@ -118,37 +120,42 @@ def format_func(x, loc):
     axi.set_ylim(0, 1.05)
     axi.set_xlim(-2.9, 2.9)
 
-ax[0, 1].set_title('Available Kernels')
+ax[0, 1].set_title("Available Kernels")
 
 # ----------------------------------------------------------------------
 # Plot a 1D density example
 N = 100
 np.random.seed(1)
-X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
-                    np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]
+X = np.concatenate(
+    (np.random.normal(0, 1, int(0.3 * N)), np.random.normal(5, 1, int(0.7 * N)))
+)[:, np.newaxis]
 
 X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
 
-true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0])
-             + 0.7 * norm(5, 1).pdf(X_plot[:, 0]))
+true_dens = 0.3 * norm(0, 1).pdf(X_plot[:, 0]) + 0.7 * norm(5, 1).pdf(X_plot[:, 0])
 
 fig, ax = plt.subplots()
-ax.fill(X_plot[:, 0], true_dens, fc='black', alpha=0.2,
-        label='input distribution')
-colors = ['navy', 'cornflowerblue', 'darkorange']
-kernels = ['gaussian', 'tophat', 'epanechnikov']
+ax.fill(X_plot[:, 0], true_dens, fc="black", alpha=0.2, label="input distribution")
+colors = ["navy", "cornflowerblue", "darkorange"]
+kernels = ["gaussian", "tophat", "epanechnikov"]
 lw = 2
 
 for color, kernel in zip(colors, kernels):
     kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
     log_dens = kde.score_samples(X_plot)
-    ax.plot(X_plot[:, 0], np.exp(log_dens), color=color, lw=lw,
-            linestyle='-', label="kernel = '{0}'".format(kernel))
+    ax.plot(
+        X_plot[:, 0],
+        np.exp(log_dens),
+        color=color,
+        lw=lw,
+        linestyle="-",
+        label="kernel = '{0}'".format(kernel),
+    )
 
 ax.text(6, 0.38, "N={0} points".format(N))
 
-ax.legend(loc='upper left')
-ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')
+ax.legend(loc="upper left")
+ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), "+k")
 
 ax.set_xlim(-4, 9)
 ax.set_ylim(-0.02, 0.4)
diff --git a/examples/neighbors/plot_lof_novelty_detection.py b/examples/neighbors/plot_lof_novelty_detection.py
index f1129d0bd64e6..a37d28bb6d4bb 100644
--- a/examples/neighbors/plot_lof_novelty_detection.py
+++ b/examples/neighbors/plot_lof_novelty_detection.py
@@ -60,24 +60,29 @@
 
 plt.title("Novelty Detection with LOF")
 plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
-a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
-plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
+a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred")
+plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred")
 
 s = 40
-b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
-b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
-                 edgecolors='k')
-c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
-                edgecolors='k')
-plt.axis('tight')
+b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k")
+b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k")
+c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k")
+plt.axis("tight")
 plt.xlim((-5, 5))
 plt.ylim((-5, 5))
-plt.legend([a.collections[0], b1, b2, c],
-           ["learned frontier", "training observations",
-            "new regular observations", "new abnormal observations"],
-           loc="upper left",
-           prop=matplotlib.font_manager.FontProperties(size=11))
+plt.legend(
+    [a.collections[0], b1, b2, c],
+    [
+        "learned frontier",
+        "training observations",
+        "new regular observations",
+        "new abnormal observations",
+    ],
+    loc="upper left",
+    prop=matplotlib.font_manager.FontProperties(size=11),
+)
 plt.xlabel(
     "errors novel regular: %d/40 ; errors novel abnormal: %d/40"
-    % (n_error_test, n_error_outliers))
+    % (n_error_test, n_error_outliers)
+)
 plt.show()
diff --git a/examples/neighbors/plot_lof_outlier_detection.py b/examples/neighbors/plot_lof_outlier_detection.py
index 4bb2949bcdcd7..0c7706acc8b93 100644
--- a/examples/neighbors/plot_lof_outlier_detection.py
+++ b/examples/neighbors/plot_lof_outlier_detection.py
@@ -53,16 +53,22 @@
 X_scores = clf.negative_outlier_factor_
 
 plt.title("Local Outlier Factor (LOF)")
-plt.scatter(X[:, 0], X[:, 1], color='k', s=3., label='Data points')
+plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
 # plot circles with radius proportional to the outlier scores
 radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
-plt.scatter(X[:, 0], X[:, 1], s=1000 * radius, edgecolors='r',
-            facecolors='none', label='Outlier scores')
-plt.axis('tight')
+plt.scatter(
+    X[:, 0],
+    X[:, 1],
+    s=1000 * radius,
+    edgecolors="r",
+    facecolors="none",
+    label="Outlier scores",
+)
+plt.axis("tight")
 plt.xlim((-5, 5))
 plt.ylim((-5, 5))
 plt.xlabel("prediction errors: %d" % (n_errors))
-legend = plt.legend(loc='upper left')
+legend = plt.legend(loc="upper left")
 legend.legendHandles[0]._sizes = [10]
 legend.legendHandles[1]._sizes = [20]
 plt.show()
diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py
index 5536e8eb69e89..79b4a7a370557 100644
--- a/examples/neighbors/plot_nca_classification.py
+++ b/examples/neighbors/plot_nca_classification.py
@@ -22,8 +22,7 @@
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
-from sklearn.neighbors import (KNeighborsClassifier,
-                               NeighborhoodComponentsAnalysis)
+from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
 from sklearn.pipeline import Pipeline
 
 
@@ -38,30 +37,37 @@
 # slicing by using a two-dim dataset
 X = X[:, [0, 2]]
 
-X_train, X_test, y_train, y_test = \
-    train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, stratify=y, test_size=0.7, random_state=42
+)
 
-h = .01  # step size in the mesh
+h = 0.01  # step size in the mesh
 
 # Create color maps
-cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
-cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
-
-names = ['KNN', 'NCA, KNN']
-
-classifiers = [Pipeline([('scaler', StandardScaler()),
-                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
-                         ]),
-               Pipeline([('scaler', StandardScaler()),
-                         ('nca', NeighborhoodComponentsAnalysis()),
-                         ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
-                         ])
-               ]
+cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
+cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])
+
+names = ["KNN", "NCA, KNN"]
+
+classifiers = [
+    Pipeline(
+        [
+            ("scaler", StandardScaler()),
+            ("knn", KNeighborsClassifier(n_neighbors=n_neighbors)),
+        ]
+    ),
+    Pipeline(
+        [
+            ("scaler", StandardScaler()),
+            ("nca", NeighborhoodComponentsAnalysis()),
+            ("knn", KNeighborsClassifier(n_neighbors=n_neighbors)),
+        ]
+    ),
+]
 
 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                     np.arange(y_min, y_max, h))
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
 for name, clf in zip(names, classifiers):
 
@@ -75,14 +81,21 @@
     # Put the result into a color plot
     Z = Z.reshape(xx.shape)
     plt.figure()
-    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8)
+    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.8)
 
     # Plot also the training and testing points
-    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
+    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
     plt.xlim(xx.min(), xx.max())
     plt.ylim(yy.min(), yy.max())
     plt.title("{} (k = {})".format(name, n_neighbors))
-    plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,
-             ha='center', va='center', transform=plt.gca().transAxes)
+    plt.text(
+        0.9,
+        0.1,
+        "{:.2f}".format(score),
+        size=15,
+        ha="center",
+        va="center",
+        transform=plt.gca().transAxes,
+    )
 
 plt.show()
diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py
index 64135f76ee58e..95be0e7ec327d 100644
--- a/examples/neighbors/plot_nca_dim_reduction.py
+++ b/examples/neighbors/plot_nca_dim_reduction.py
@@ -35,8 +35,7 @@
 from sklearn.model_selection import train_test_split
 from sklearn.decomposition import PCA
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-from sklearn.neighbors import (KNeighborsClassifier,
-                               NeighborhoodComponentsAnalysis)
+from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
@@ -49,31 +48,30 @@
 X, y = datasets.load_digits(return_X_y=True)
 
 # Split into train/test
-X_train, X_test, y_train, y_test = \
-    train_test_split(X, y, test_size=0.5, stratify=y,
-                     random_state=random_state)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.5, stratify=y, random_state=random_state
+)
 
 dim = len(X[0])
 n_classes = len(np.unique(y))
 
 # Reduce dimension to 2 with PCA
-pca = make_pipeline(StandardScaler(),
-                    PCA(n_components=2, random_state=random_state))
+pca = make_pipeline(StandardScaler(), PCA(n_components=2, random_state=random_state))
 
 # Reduce dimension to 2 with LinearDiscriminantAnalysis
-lda = make_pipeline(StandardScaler(),
-                    LinearDiscriminantAnalysis(n_components=2))
+lda = make_pipeline(StandardScaler(), LinearDiscriminantAnalysis(n_components=2))
 
 # Reduce dimension to 2 with NeighborhoodComponentAnalysis
-nca = make_pipeline(StandardScaler(),
-                    NeighborhoodComponentsAnalysis(n_components=2,
-                                                   random_state=random_state))
+nca = make_pipeline(
+    StandardScaler(),
+    NeighborhoodComponentsAnalysis(n_components=2, random_state=random_state),
+)
 
 # Use a nearest neighbor classifier to evaluate the methods
 knn = KNeighborsClassifier(n_neighbors=n_neighbors)
 
 # Make a list of the methods to be compared
-dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]
+dim_reduction_methods = [("PCA", pca), ("LDA", lda), ("NCA", nca)]
 
 # plt.figure()
 for i, (name, model) in enumerate(dim_reduction_methods):
@@ -93,8 +91,8 @@
     X_embedded = model.transform(X)
 
     # Plot the projected points and show the evaluation score
-    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')
-    plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name,
-                                                              n_neighbors,
-                                                              acc_knn))
+    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap="Set1")
+    plt.title(
+        "{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, n_neighbors, acc_knn)
+    )
 plt.show()
diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py
index 1e1f435e7e57a..ec158e5ad9824 100644
--- a/examples/neighbors/plot_nca_illustration.py
+++ b/examples/neighbors/plot_nca_illustration.py
@@ -28,32 +28,37 @@
 # point no. 3. The thickness of a link between point no. 3 and another point
 # is proportional to their distance.
 
-X, y = make_classification(n_samples=9, n_features=2, n_informative=2,
-                           n_redundant=0, n_classes=3, n_clusters_per_class=1,
-                           class_sep=1.0, random_state=0)
+X, y = make_classification(
+    n_samples=9,
+    n_features=2,
+    n_informative=2,
+    n_redundant=0,
+    n_classes=3,
+    n_clusters_per_class=1,
+    class_sep=1.0,
+    random_state=0,
+)
 
 plt.figure(1)
 ax = plt.gca()
 for i in range(X.shape[0]):
-    ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center')
+    ax.text(X[i, 0], X[i, 1], str(i), va="center", ha="center")
     ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4)
 
 ax.set_title("Original points")
 ax.axes.get_xaxis().set_visible(False)
 ax.axes.get_yaxis().set_visible(False)
-ax.axis('equal')  # so that boundaries are displayed correctly as circles
+ax.axis("equal")  # so that boundaries are displayed correctly as circles
 
 
 def link_thickness_i(X, i):
     diff_embedded = X[i] - X
-    dist_embedded = np.einsum('ij,ij->i', diff_embedded,
-                              diff_embedded)
+    dist_embedded = np.einsum("ij,ij->i", diff_embedded, diff_embedded)
     dist_embedded[i] = np.inf
 
     # compute exponentiated distances (use the log-sum-exp trick to
     # avoid numerical instabilities
-    exp_dist_embedded = np.exp(-dist_embedded -
-                               logsumexp(-dist_embedded))
+    exp_dist_embedded = np.exp(-dist_embedded - logsumexp(-dist_embedded))
     return exp_dist_embedded
 
 
@@ -63,8 +68,7 @@ def relate_point(X, i, ax):
         thickness = link_thickness_i(X, i)
         if i != j:
             line = ([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]])
-            ax.plot(*line, c=cm.Set1(y[j]),
-                    linewidth=5*thickness[j])
+            ax.plot(*line, c=cm.Set1(y[j]), linewidth=5 * thickness[j])
 
 
 i = 3
@@ -87,13 +91,11 @@ def relate_point(X, i, ax):
 relate_point(X_embedded, i, ax2)
 
 for i in range(len(X)):
-    ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i),
-             va='center', ha='center')
-    ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[[i]]),
-                alpha=0.4)
+    ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i), va="center", ha="center")
+    ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4)
 
 ax2.set_title("NCA embedding")
 ax2.axes.get_xaxis().set_visible(False)
 ax2.axes.get_yaxis().set_visible(False)
-ax2.axis('equal')
+ax2.axis("equal")
 plt.show()
diff --git a/examples/neighbors/plot_nearest_centroid.py b/examples/neighbors/plot_nearest_centroid.py
index 04a105c0e07fd..9e5d21777e718 100644
--- a/examples/neighbors/plot_nearest_centroid.py
+++ b/examples/neighbors/plot_nearest_centroid.py
@@ -23,13 +23,13 @@
 X = iris.data[:, :2]
 y = iris.target
 
-h = .02  # step size in the mesh
+h = 0.02  # step size in the mesh
 
 # Create color maps
-cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
-cmap_bold = ListedColormap(['darkorange', 'c', 'darkblue'])
+cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
+cmap_bold = ListedColormap(["darkorange", "c", "darkblue"])
 
-for shrinkage in [None, .2]:
+for shrinkage in [None, 0.2]:
     # we create an instance of Neighbours Classifier and fit the data.
     clf = NearestCentroid(shrink_threshold=shrinkage)
     clf.fit(X, y)
@@ -39,8 +39,7 @@
     # point in the mesh [x_min, x_max]x[y_min, y_max].
     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                         np.arange(y_min, y_max, h))
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
 
     # Put the result into a color plot
@@ -49,10 +48,8 @@
     plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
 
     # Plot also the training points
-    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
-                edgecolor='k', s=20)
-    plt.title("3-Class classification (shrink_threshold=%r)"
-              % shrinkage)
-    plt.axis('tight')
+    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor="k", s=20)
+    plt.title("3-Class classification (shrink_threshold=%r)" % shrinkage)
+    plt.axis("tight")
 
 plt.show()
diff --git a/examples/neighbors/plot_regression.py b/examples/neighbors/plot_regression.py
index 9625e205009aa..ba91a1a65d2a8 100644
--- a/examples/neighbors/plot_regression.py
+++ b/examples/neighbors/plot_regression.py
@@ -34,17 +34,16 @@
 # Fit regression model
 n_neighbors = 5
 
-for i, weights in enumerate(['uniform', 'distance']):
+for i, weights in enumerate(["uniform", "distance"]):
     knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
     y_ = knn.fit(X, y).predict(T)
 
     plt.subplot(2, 1, i + 1)
-    plt.scatter(X, y, color='darkorange', label='data')
-    plt.plot(T, y_, color='navy', label='prediction')
-    plt.axis('tight')
+    plt.scatter(X, y, color="darkorange", label="data")
+    plt.plot(T, y_, color="navy", label="prediction")
+    plt.axis("tight")
     plt.legend()
-    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors,
-                                                                weights))
+    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors, weights))
 
 plt.tight_layout()
 plt.show()
diff --git a/examples/neighbors/plot_species_kde.py b/examples/neighbors/plot_species_kde.py
index 344c36ed452f1..9607d1a20aae4 100644
--- a/examples/neighbors/plot_species_kde.py
+++ b/examples/neighbors/plot_species_kde.py
@@ -48,6 +48,7 @@
 # otherwise, we'll improvise later...
 try:
     from mpl_toolkits.basemap import Basemap
+
     basemap = True
 except ImportError:
     basemap = False
@@ -82,13 +83,14 @@ def construct_grids(batch):
 
 # Get matrices/arrays of species IDs and locations
 data = fetch_species_distributions()
-species_names = ['Bradypus Variegatus', 'Microryzomys Minutus']
+species_names = ["Bradypus Variegatus", "Microryzomys Minutus"]
 
-Xtrain = np.vstack([data['train']['dd lat'],
-                    data['train']['dd long']]).T
-ytrain = np.array([d.decode('ascii').startswith('micro')
-                  for d in data['train']['species']], dtype='int')
-Xtrain *= np.pi / 180.  # Convert lat/long to radians
+Xtrain = np.vstack([data["train"]["dd lat"], data["train"]["dd long"]]).T
+ytrain = np.array(
+    [d.decode("ascii").startswith("micro") for d in data["train"]["species"]],
+    dtype="int",
+)
+Xtrain *= np.pi / 180.0  # Convert lat/long to radians
 
 # Set up the data grid for the contour plot
 xgrid, ygrid = construct_grids(data)
@@ -98,7 +100,7 @@ def construct_grids(batch):
 
 xy = np.vstack([Y.ravel(), X.ravel()]).T
 xy = xy[land_mask]
-xy *= np.pi / 180.
+xy *= np.pi / 180.0
 
 # Plot map of South America with distributions of each species
 fig = plt.figure()
@@ -109,12 +111,13 @@ def construct_grids(batch):
 
     # construct a kernel density estimate of the distribution
     print(" - computing KDE in spherical coordinates")
-    kde = KernelDensity(bandwidth=0.04, metric='haversine',
-                        kernel='gaussian', algorithm='ball_tree')
+    kde = KernelDensity(
+        bandwidth=0.04, metric="haversine", kernel="gaussian", algorithm="ball_tree"
+    )
     kde.fit(Xtrain[ytrain == i])
 
     # evaluate only on the land: -9999 indicates ocean
-    Z = np.full(land_mask.shape[0], -9999, dtype='int')
+    Z = np.full(land_mask.shape[0], -9999, dtype="int")
     Z[land_mask] = np.exp(kde.score_samples(xy))
     Z = Z.reshape(X.shape)
 
@@ -124,16 +127,21 @@ def construct_grids(batch):
 
     if basemap:
         print(" - plot coastlines using basemap")
-        m = Basemap(projection='cyl', llcrnrlat=Y.min(),
-                    urcrnrlat=Y.max(), llcrnrlon=X.min(),
-                    urcrnrlon=X.max(), resolution='c')
+        m = Basemap(
+            projection="cyl",
+            llcrnrlat=Y.min(),
+            urcrnrlat=Y.max(),
+            llcrnrlon=X.min(),
+            urcrnrlon=X.max(),
+            resolution="c",
+        )
         m.drawcoastlines()
         m.drawcountries()
     else:
         print(" - plot coastlines from coverage")
-        plt.contour(X, Y, land_reference,
-                    levels=[-9998], colors="k",
-                    linestyles="solid")
+        plt.contour(
+            X, Y, land_reference, levels=[-9998], colors="k", linestyles="solid"
+        )
         plt.xticks([])
         plt.yticks([])
 
diff --git a/examples/neural_networks/plot_mlp_alpha.py b/examples/neural_networks/plot_mlp_alpha.py
index 15cd8e490efca..8e35f61fe2494 100644
--- a/examples/neural_networks/plot_mlp_alpha.py
+++ b/examples/neural_networks/plot_mlp_alpha.py
@@ -30,47 +30,55 @@
 from sklearn.neural_network import MLPClassifier
 from sklearn.pipeline import make_pipeline
 
-h = .02  # step size in the mesh
+h = 0.02  # step size in the mesh
 
 alphas = np.logspace(-1, 1, 5)
 
 classifiers = []
 names = []
 for alpha in alphas:
-    classifiers.append(make_pipeline(
-        StandardScaler(),
-        MLPClassifier(
-            solver='lbfgs', alpha=alpha, random_state=1, max_iter=2000,
-            early_stopping=True, hidden_layer_sizes=[100, 100],
+    classifiers.append(
+        make_pipeline(
+            StandardScaler(),
+            MLPClassifier(
+                solver="lbfgs",
+                alpha=alpha,
+                random_state=1,
+                max_iter=2000,
+                early_stopping=True,
+                hidden_layer_sizes=[100, 100],
+            ),
         )
-    ))
+    )
     names.append(f"alpha {alpha:.2f}")
 
-X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
-                           random_state=0, n_clusters_per_class=1)
+X, y = make_classification(
+    n_features=2, n_redundant=0, n_informative=2, random_state=0, n_clusters_per_class=1
+)
 rng = np.random.RandomState(2)
 X += 2 * rng.uniform(size=X.shape)
 linearly_separable = (X, y)
 
-datasets = [make_moons(noise=0.3, random_state=0),
-            make_circles(noise=0.2, factor=0.5, random_state=1),
-            linearly_separable]
+datasets = [
+    make_moons(noise=0.3, random_state=0),
+    make_circles(noise=0.2, factor=0.5, random_state=1),
+    linearly_separable,
+]
 
 figure = plt.figure(figsize=(17, 9))
 i = 1
 # iterate over datasets
 for X, y in datasets:
     # split into training and test part
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
 
-    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
-    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
-    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                         np.arange(y_min, y_max, h))
+    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
+    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
     # just plot the dataset first
     cm = plt.cm.RdBu
-    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+    cm_bright = ListedColormap(["#FF0000", "#0000FF"])
     ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
     # Plot the training points
     ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
@@ -97,23 +105,41 @@
 
         # Put the result into a color plot
         Z = Z.reshape(xx.shape)
-        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
 
         # Plot also the training points
-        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
-                   edgecolors='black', s=25)
+        ax.scatter(
+            X_train[:, 0],
+            X_train[:, 1],
+            c=y_train,
+            cmap=cm_bright,
+            edgecolors="black",
+            s=25,
+        )
         # and testing points
-        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
-                   alpha=0.6, edgecolors='black', s=25)
+        ax.scatter(
+            X_test[:, 0],
+            X_test[:, 1],
+            c=y_test,
+            cmap=cm_bright,
+            alpha=0.6,
+            edgecolors="black",
+            s=25,
+        )
 
         ax.set_xlim(xx.min(), xx.max())
         ax.set_ylim(yy.min(), yy.max())
         ax.set_xticks(())
         ax.set_yticks(())
         ax.set_title(name)
-        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
-                size=15, horizontalalignment='right')
+        ax.text(
+            xx.max() - 0.3,
+            yy.min() + 0.3,
+            ("%.2f" % score).lstrip("0"),
+            size=15,
+            horizontalalignment="right",
+        )
         i += 1
 
-figure.subplots_adjust(left=.02, right=.98)
+figure.subplots_adjust(left=0.02, right=0.98)
 plt.show()
diff --git a/examples/neural_networks/plot_mlp_training_curves.py b/examples/neural_networks/plot_mlp_training_curves.py
index 608db0f75ad98..45c3d3529da80 100644
--- a/examples/neural_networks/plot_mlp_training_curves.py
+++ b/examples/neural_networks/plot_mlp_training_curves.py
@@ -25,32 +25,69 @@
 from sklearn.exceptions import ConvergenceWarning
 
 # different learning rate schedules and momentum parameters
-params = [{'solver': 'sgd', 'learning_rate': 'constant', 'momentum': 0,
-           'learning_rate_init': 0.2},
-          {'solver': 'sgd', 'learning_rate': 'constant', 'momentum': .9,
-           'nesterovs_momentum': False, 'learning_rate_init': 0.2},
-          {'solver': 'sgd', 'learning_rate': 'constant', 'momentum': .9,
-           'nesterovs_momentum': True, 'learning_rate_init': 0.2},
-          {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': 0,
-           'learning_rate_init': 0.2},
-          {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': .9,
-           'nesterovs_momentum': True, 'learning_rate_init': 0.2},
-          {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': .9,
-           'nesterovs_momentum': False, 'learning_rate_init': 0.2},
-          {'solver': 'adam', 'learning_rate_init': 0.01}]
-
-labels = ["constant learning-rate", "constant with momentum",
-          "constant with Nesterov's momentum",
-          "inv-scaling learning-rate", "inv-scaling with momentum",
-          "inv-scaling with Nesterov's momentum", "adam"]
-
-plot_args = [{'c': 'red', 'linestyle': '-'},
-             {'c': 'green', 'linestyle': '-'},
-             {'c': 'blue', 'linestyle': '-'},
-             {'c': 'red', 'linestyle': '--'},
-             {'c': 'green', 'linestyle': '--'},
-             {'c': 'blue', 'linestyle': '--'},
-             {'c': 'black', 'linestyle': '-'}]
+params = [
+    {
+        "solver": "sgd",
+        "learning_rate": "constant",
+        "momentum": 0,
+        "learning_rate_init": 0.2,
+    },
+    {
+        "solver": "sgd",
+        "learning_rate": "constant",
+        "momentum": 0.9,
+        "nesterovs_momentum": False,
+        "learning_rate_init": 0.2,
+    },
+    {
+        "solver": "sgd",
+        "learning_rate": "constant",
+        "momentum": 0.9,
+        "nesterovs_momentum": True,
+        "learning_rate_init": 0.2,
+    },
+    {
+        "solver": "sgd",
+        "learning_rate": "invscaling",
+        "momentum": 0,
+        "learning_rate_init": 0.2,
+    },
+    {
+        "solver": "sgd",
+        "learning_rate": "invscaling",
+        "momentum": 0.9,
+        "nesterovs_momentum": True,
+        "learning_rate_init": 0.2,
+    },
+    {
+        "solver": "sgd",
+        "learning_rate": "invscaling",
+        "momentum": 0.9,
+        "nesterovs_momentum": False,
+        "learning_rate_init": 0.2,
+    },
+    {"solver": "adam", "learning_rate_init": 0.01},
+]
+
+labels = [
+    "constant learning-rate",
+    "constant with momentum",
+    "constant with Nesterov's momentum",
+    "inv-scaling learning-rate",
+    "inv-scaling with momentum",
+    "inv-scaling with Nesterov's momentum",
+    "adam",
+]
+
+plot_args = [
+    {"c": "red", "linestyle": "-"},
+    {"c": "green", "linestyle": "-"},
+    {"c": "blue", "linestyle": "-"},
+    {"c": "red", "linestyle": "--"},
+    {"c": "green", "linestyle": "--"},
+    {"c": "blue", "linestyle": "--"},
+    {"c": "black", "linestyle": "-"},
+]
 
 
 def plot_on_dataset(X, y, ax, name):
@@ -68,14 +105,14 @@ def plot_on_dataset(X, y, ax, name):
 
     for label, param in zip(labels, params):
         print("training: %s" % label)
-        mlp = MLPClassifier(random_state=0,
-                            max_iter=max_iter, **param)
+        mlp = MLPClassifier(random_state=0, max_iter=max_iter, **param)
 
         # some parameter combinations will not converge as can be seen on the
         # plots so they are ignored here
         with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", category=ConvergenceWarning,
-                                    module="sklearn")
+            warnings.filterwarnings(
+                "ignore", category=ConvergenceWarning, module="sklearn"
+            )
             mlp.fit(X, y)
 
         mlps.append(mlp)
@@ -89,13 +126,16 @@ def plot_on_dataset(X, y, ax, name):
 # load / generate some toy datasets
 iris = datasets.load_iris()
 X_digits, y_digits = datasets.load_digits(return_X_y=True)
-data_sets = [(iris.data, iris.target),
-             (X_digits, y_digits),
-             datasets.make_circles(noise=0.2, factor=0.5, random_state=1),
-             datasets.make_moons(noise=0.3, random_state=0)]
-
-for ax, data, name in zip(axes.ravel(), data_sets, ['iris', 'digits',
-                                                    'circles', 'moons']):
+data_sets = [
+    (iris.data, iris.target),
+    (X_digits, y_digits),
+    datasets.make_circles(noise=0.2, factor=0.5, random_state=1),
+    datasets.make_moons(noise=0.3, random_state=0),
+]
+
+for ax, data, name in zip(
+    axes.ravel(), data_sets, ["iris", "digits", "circles", "moons"]
+):
     plot_on_dataset(*data, ax=ax, name=name)
 
 fig.legend(ax.get_lines(), labels, ncol=3, loc="upper center")
diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py
index 33f421a226c33..0fba5412b96d0 100644
--- a/examples/neural_networks/plot_mnist_filters.py
+++ b/examples/neural_networks/plot_mnist_filters.py
@@ -33,22 +33,27 @@
 print(__doc__)
 
 # Load data from https://www.openml.org/d/554
-X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
-X = X / 255.
+X, y = fetch_openml("mnist_784", version=1, return_X_y=True)
+X = X / 255.0
 
 # rescale the data, use the traditional train/test split
 X_train, X_test = X[:60000], X[60000:]
 y_train, y_test = y[:60000], y[60000:]
 
-mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
-                    solver='sgd', verbose=10, random_state=1,
-                    learning_rate_init=.1)
+mlp = MLPClassifier(
+    hidden_layer_sizes=(50,),
+    max_iter=10,
+    alpha=1e-4,
+    solver="sgd",
+    verbose=10,
+    random_state=1,
+    learning_rate_init=0.1,
+)
 
 # this example won't converge because of CI's time constraints, so we catch the
 # warning and are ignore it here
 with warnings.catch_warnings():
-    warnings.filterwarnings("ignore", category=ConvergenceWarning,
-                            module="sklearn")
+    warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
     mlp.fit(X_train, y_train)
 
 print("Training set score: %f" % mlp.score(X_train, y_train))
@@ -58,8 +63,7 @@
 # use global min / max to ensure all weights are shown on the same scale
 vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
 for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
-    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,
-               vmax=.5 * vmax)
+    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=0.5 * vmin, vmax=0.5 * vmax)
     ax.set_xticks(())
     ax.set_yticks(())
 
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index f7ad3513499ca..c17bebfc38cff 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -44,53 +44,42 @@
 # #############################################################################
 # Setting up
 
+
 def nudge_dataset(X, Y):
     """
     This produces a dataset 5 times bigger than the original one,
     by moving the 8x8 images in X around by 1px to left, right, down, up
     """
     direction_vectors = [
-        [[0, 1, 0],
-         [0, 0, 0],
-         [0, 0, 0]],
-
-        [[0, 0, 0],
-         [1, 0, 0],
-         [0, 0, 0]],
-
-        [[0, 0, 0],
-         [0, 0, 1],
-         [0, 0, 0]],
-
-        [[0, 0, 0],
-         [0, 0, 0],
-         [0, 1, 0]]]
+        [[0, 1, 0], [0, 0, 0], [0, 0, 0]],
+        [[0, 0, 0], [1, 0, 0], [0, 0, 0]],
+        [[0, 0, 0], [0, 0, 1], [0, 0, 0]],
+        [[0, 0, 0], [0, 0, 0], [0, 1, 0]],
+    ]
 
     def shift(x, w):
-        return convolve(x.reshape((8, 8)), mode='constant', weights=w).ravel()
+        return convolve(x.reshape((8, 8)), mode="constant", weights=w).ravel()
 
-    X = np.concatenate([X] +
-                       [np.apply_along_axis(shift, 1, X, vector)
-                        for vector in direction_vectors])
+    X = np.concatenate(
+        [X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors]
+    )
     Y = np.concatenate([Y for _ in range(5)], axis=0)
     return X, Y
 
 
 # Load Data
 X, y = datasets.load_digits(return_X_y=True)
-X = np.asarray(X, 'float32')
+X = np.asarray(X, "float32")
 X, Y = nudge_dataset(X, y)
 X = minmax_scale(X, feature_range=(0, 1))  # 0-1 scaling
 
-X_train, X_test, Y_train, Y_test = train_test_split(
-    X, Y, test_size=0.2, random_state=0)
+X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
 
 # Models we will use
-logistic = linear_model.LogisticRegression(solver='newton-cg', tol=1)
+logistic = linear_model.LogisticRegression(solver="newton-cg", tol=1)
 rbm = BernoulliRBM(random_state=0, verbose=True)
 
-rbm_features_classifier = Pipeline(
-    steps=[('rbm', rbm), ('logistic', logistic)])
+rbm_features_classifier = Pipeline(steps=[("rbm", rbm), ("logistic", logistic)])
 
 # #############################################################################
 # Training
@@ -110,19 +99,23 @@ def shift(x, w):
 
 # Training the Logistic regression classifier directly on the pixel
 raw_pixel_classifier = clone(logistic)
-raw_pixel_classifier.C = 100.
+raw_pixel_classifier.C = 100.0
 raw_pixel_classifier.fit(X_train, Y_train)
 
 # #############################################################################
 # Evaluation
 
 Y_pred = rbm_features_classifier.predict(X_test)
-print("Logistic regression using RBM features:\n%s\n" % (
-    metrics.classification_report(Y_test, Y_pred)))
+print(
+    "Logistic regression using RBM features:\n%s\n"
+    % (metrics.classification_report(Y_test, Y_pred))
+)
 
 Y_pred = raw_pixel_classifier.predict(X_test)
-print("Logistic regression using raw pixel features:\n%s\n" % (
-    metrics.classification_report(Y_test, Y_pred)))
+print(
+    "Logistic regression using raw pixel features:\n%s\n"
+    % (metrics.classification_report(Y_test, Y_pred))
+)
 
 # #############################################################################
 # Plotting
@@ -130,11 +123,10 @@ def shift(x, w):
 plt.figure(figsize=(4.2, 4))
 for i, comp in enumerate(rbm.components_):
     plt.subplot(10, 10, i + 1)
-    plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,
-               interpolation='nearest')
+    plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r, interpolation="nearest")
     plt.xticks(())
     plt.yticks(())
-plt.suptitle('100 components extracted by RBM', fontsize=16)
+plt.suptitle("100 components extracted by RBM", fontsize=16)
 plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
 
 plt.show()
diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py
index b52f35986d0f6..4dc93993330b2 100755
--- a/examples/preprocessing/plot_all_scaling.py
+++ b/examples/preprocessing/plot_all_scaling.py
@@ -72,51 +72,55 @@
 feature_names = dataset.feature_names
 
 feature_mapping = {
-    'MedInc': 'Median income in block',
-    'HousAge': 'Median house age in block',
-    'AveRooms': 'Average number of rooms',
-    'AveBedrms': 'Average number of bedrooms',
-    'Population': 'Block population',
-    'AveOccup': 'Average house occupancy',
-    'Latitude': 'House block latitude',
-    'Longitude': 'House block longitude'
+    "MedInc": "Median income in block",
+    "HousAge": "Median house age in block",
+    "AveRooms": "Average number of rooms",
+    "AveBedrms": "Average number of bedrooms",
+    "Population": "Block population",
+    "AveOccup": "Average house occupancy",
+    "Latitude": "House block latitude",
+    "Longitude": "House block longitude",
 }
 
 # Take only 2 features to make visualization easier
 # Feature MedInc has a long tail distribution.
 # Feature AveOccup has a few but very large outliers.
-features = ['MedInc', 'AveOccup']
+features = ["MedInc", "AveOccup"]
 features_idx = [feature_names.index(feature) for feature in features]
 X = X_full[:, features_idx]
 distributions = [
-    ('Unscaled data', X),
-    ('Data after standard scaling',
-        StandardScaler().fit_transform(X)),
-    ('Data after min-max scaling',
-        MinMaxScaler().fit_transform(X)),
-    ('Data after max-abs scaling',
-        MaxAbsScaler().fit_transform(X)),
-    ('Data after robust scaling',
-        RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
-    ('Data after power transformation (Yeo-Johnson)',
-     PowerTransformer(method='yeo-johnson').fit_transform(X)),
-    ('Data after power transformation (Box-Cox)',
-     PowerTransformer(method='box-cox').fit_transform(X)),
-    ('Data after quantile transformation (uniform pdf)',
-        QuantileTransformer(output_distribution='uniform')
-        .fit_transform(X)),
-    ('Data after quantile transformation (gaussian pdf)',
-        QuantileTransformer(output_distribution='normal')
-        .fit_transform(X)),
-    ('Data after sample-wise L2 normalizing',
-        Normalizer().fit_transform(X)),
+    ("Unscaled data", X),
+    ("Data after standard scaling", StandardScaler().fit_transform(X)),
+    ("Data after min-max scaling", MinMaxScaler().fit_transform(X)),
+    ("Data after max-abs scaling", MaxAbsScaler().fit_transform(X)),
+    (
+        "Data after robust scaling",
+        RobustScaler(quantile_range=(25, 75)).fit_transform(X),
+    ),
+    (
+        "Data after power transformation (Yeo-Johnson)",
+        PowerTransformer(method="yeo-johnson").fit_transform(X),
+    ),
+    (
+        "Data after power transformation (Box-Cox)",
+        PowerTransformer(method="box-cox").fit_transform(X),
+    ),
+    (
+        "Data after quantile transformation (uniform pdf)",
+        QuantileTransformer(output_distribution="uniform").fit_transform(X),
+    ),
+    (
+        "Data after quantile transformation (gaussian pdf)",
+        QuantileTransformer(output_distribution="normal").fit_transform(X),
+    ),
+    ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)),
 ]
 
 # scale the output between 0 and 1 for the colorbar
 y = minmax_scale(y_full)
 
 # plasma does not exist in matplotlib < 1.5
-cmap = getattr(cm, 'plasma_r', cm.hot_r)
+cmap = getattr(cm, "plasma_r", cm.hot_r)
 
 
 def create_axes(title, figsize=(16, 6)):
@@ -155,13 +159,14 @@ def create_axes(title, figsize=(16, 6)):
     rect_colorbar = [left, bottom, width, height]
     ax_colorbar = plt.axes(rect_colorbar)
 
-    return ((ax_scatter, ax_histy, ax_histx),
-            (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),
-            ax_colorbar)
+    return (
+        (ax_scatter, ax_histy, ax_histx),
+        (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom),
+        ax_colorbar,
+    )
 
 
-def plot_distribution(axes, X, y, hist_nbins=50, title="",
-                      x0_label="", x1_label=""):
+def plot_distribution(axes, X, y, hist_nbins=50, title="", x0_label="", x1_label=""):
     ax, hist_X1, hist_X0 = axes
 
     ax.set_title(title)
@@ -170,28 +175,31 @@ def plot_distribution(axes, X, y, hist_nbins=50, title="",
 
     # The scatter plot
     colors = cmap(y)
-    ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors)
+    ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker="o", s=5, lw=0, c=colors)
 
     # Removing the top and the right spine for aesthetics
     # make nice axis layout
-    ax.spines['top'].set_visible(False)
-    ax.spines['right'].set_visible(False)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
     ax.get_xaxis().tick_bottom()
     ax.get_yaxis().tick_left()
-    ax.spines['left'].set_position(('outward', 10))
-    ax.spines['bottom'].set_position(('outward', 10))
+    ax.spines["left"].set_position(("outward", 10))
+    ax.spines["bottom"].set_position(("outward", 10))
 
     # Histogram for axis X1 (feature 5)
     hist_X1.set_ylim(ax.get_ylim())
-    hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal',
-                 color='grey', ec='grey')
-    hist_X1.axis('off')
+    hist_X1.hist(
+        X[:, 1], bins=hist_nbins, orientation="horizontal", color="grey", ec="grey"
+    )
+    hist_X1.axis("off")
 
     # Histogram for axis X0 (feature 0)
     hist_X0.set_xlim(ax.get_xlim())
-    hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical',
-                 color='grey', ec='grey')
-    hist_X0.axis('off')
+    hist_X0.hist(
+        X[:, 0], bins=hist_nbins, orientation="vertical", color="grey", ec="grey"
+    )
+    hist_X0.axis("off")
+
 
 # %%
 # Two plots will be shown for each scaler/normalizer/transformer. The left
@@ -205,29 +213,42 @@ def make_plot(item_idx):
     title, X = distributions[item_idx]
     ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes(title)
     axarr = (ax_zoom_out, ax_zoom_in)
-    plot_distribution(axarr[0], X, y, hist_nbins=200,
-                      x0_label=feature_mapping[features[0]],
-                      x1_label=feature_mapping[features[1]],
-                      title="Full data")
+    plot_distribution(
+        axarr[0],
+        X,
+        y,
+        hist_nbins=200,
+        x0_label=feature_mapping[features[0]],
+        x1_label=feature_mapping[features[1]],
+        title="Full data",
+    )
 
     # zoom-in
     zoom_in_percentile_range = (0, 99)
     cutoffs_X0 = np.percentile(X[:, 0], zoom_in_percentile_range)
     cutoffs_X1 = np.percentile(X[:, 1], zoom_in_percentile_range)
 
-    non_outliers_mask = (
-        np.all(X > [cutoffs_X0[0], cutoffs_X1[0]], axis=1) &
-        np.all(X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1))
-    plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask],
-                      hist_nbins=50,
-                      x0_label=feature_mapping[features[0]],
-                      x1_label=feature_mapping[features[1]],
-                      title="Zoom-in")
+    non_outliers_mask = np.all(X > [cutoffs_X0[0], cutoffs_X1[0]], axis=1) & np.all(
+        X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1
+    )
+    plot_distribution(
+        axarr[1],
+        X[non_outliers_mask],
+        y[non_outliers_mask],
+        hist_nbins=50,
+        x0_label=feature_mapping[features[0]],
+        x1_label=feature_mapping[features[1]],
+        title="Zoom-in",
+    )
 
     norm = mpl.colors.Normalize(y_full.min(), y_full.max())
-    mpl.colorbar.ColorbarBase(ax_colorbar, cmap=cmap,
-                              norm=norm, orientation='vertical',
-                              label='Color mapping for values of y')
+    mpl.colorbar.ColorbarBase(
+        ax_colorbar,
+        cmap=cmap,
+        norm=norm,
+        orientation="vertical",
+        label="Color mapping for values of y",
+    )
 
 
 # %%
diff --git a/examples/preprocessing/plot_discretization.py b/examples/preprocessing/plot_discretization.py
index 9cfcb30e6fdd7..2e5be3f4640f4 100644
--- a/examples/preprocessing/plot_discretization.py
+++ b/examples/preprocessing/plot_discretization.py
@@ -49,19 +49,17 @@
 X = X.reshape(-1, 1)
 
 # transform the dataset with KBinsDiscretizer
-enc = KBinsDiscretizer(n_bins=10, encode='onehot')
+enc = KBinsDiscretizer(n_bins=10, encode="onehot")
 X_binned = enc.fit_transform(X)
 
 # predict with original dataset
 fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
 line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
 reg = LinearRegression().fit(X, y)
-ax1.plot(line, reg.predict(line), linewidth=2, color='green',
-         label="linear regression")
+ax1.plot(line, reg.predict(line), linewidth=2, color="green", label="linear regression")
 reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
-ax1.plot(line, reg.predict(line), linewidth=2, color='red',
-         label="decision tree")
-ax1.plot(X[:, 0], y, 'o', c='k')
+ax1.plot(line, reg.predict(line), linewidth=2, color="red", label="decision tree")
+ax1.plot(X[:, 0], y, "o", c="k")
 ax1.legend(loc="best")
 ax1.set_ylabel("Regression output")
 ax1.set_xlabel("Input feature")
@@ -70,14 +68,25 @@
 # predict with transformed dataset
 line_binned = enc.transform(line)
 reg = LinearRegression().fit(X_binned, y)
-ax2.plot(line, reg.predict(line_binned), linewidth=2, color='green',
-         linestyle='-', label='linear regression')
-reg = DecisionTreeRegressor(min_samples_split=3,
-                            random_state=0).fit(X_binned, y)
-ax2.plot(line, reg.predict(line_binned), linewidth=2, color='red',
-         linestyle=':', label='decision tree')
-ax2.plot(X[:, 0], y, 'o', c='k')
-ax2.vlines(enc.bin_edges_[0], *plt.gca().get_ylim(), linewidth=1, alpha=.2)
+ax2.plot(
+    line,
+    reg.predict(line_binned),
+    linewidth=2,
+    color="green",
+    linestyle="-",
+    label="linear regression",
+)
+reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X_binned, y)
+ax2.plot(
+    line,
+    reg.predict(line_binned),
+    linewidth=2,
+    color="red",
+    linestyle=":",
+    label="decision tree",
+)
+ax2.plot(X[:, 0], y, "o", c="k")
+ax2.vlines(enc.bin_edges_[0], *plt.gca().get_ylim(), linewidth=1, alpha=0.2)
 ax2.legend(loc="best")
 ax2.set_xlabel("Input feature")
 ax2.set_title("Result after discretization")
diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py
index e55e7cb500eb1..355bb9253d963 100644
--- a/examples/preprocessing/plot_discretization_classification.py
+++ b/examples/preprocessing/plot_discretization_classification.py
@@ -50,42 +50,42 @@
 
 print(__doc__)
 
-h = .02  # step size in the mesh
+h = 0.02  # step size in the mesh
 
 
 def get_name(estimator):
     name = estimator.__class__.__name__
-    if name == 'Pipeline':
+    if name == "Pipeline":
         name = [get_name(est[1]) for est in estimator.steps]
-        name = ' + '.join(name)
+        name = " + ".join(name)
     return name
 
 
 # list of (estimator, param_grid), where param_grid is used in GridSearchCV
 classifiers = [
-    (LogisticRegression(random_state=0), {
-        'C': np.logspace(-2, 7, 10)
-    }),
-    (LinearSVC(random_state=0), {
-        'C': np.logspace(-2, 7, 10)
-    }),
-    (make_pipeline(
-        KBinsDiscretizer(encode='onehot'),
-        LogisticRegression(random_state=0)), {
-            'kbinsdiscretizer__n_bins': np.arange(2, 10),
-            'logisticregression__C': np.logspace(-2, 7, 10),
-        }),
-    (make_pipeline(
-        KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {
-            'kbinsdiscretizer__n_bins': np.arange(2, 10),
-            'linearsvc__C': np.logspace(-2, 7, 10),
-        }),
-    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
-        'learning_rate': np.logspace(-4, 0, 10)
-    }),
-    (SVC(random_state=0), {
-        'C': np.logspace(-2, 7, 10)
-    }),
+    (LogisticRegression(random_state=0), {"C": np.logspace(-2, 7, 10)}),
+    (LinearSVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
+    (
+        make_pipeline(
+            KBinsDiscretizer(encode="onehot"), LogisticRegression(random_state=0)
+        ),
+        {
+            "kbinsdiscretizer__n_bins": np.arange(2, 10),
+            "logisticregression__C": np.logspace(-2, 7, 10),
+        },
+    ),
+    (
+        make_pipeline(KBinsDiscretizer(encode="onehot"), LinearSVC(random_state=0)),
+        {
+            "kbinsdiscretizer__n_bins": np.arange(2, 10),
+            "linearsvc__C": np.logspace(-2, 7, 10),
+        },
+    ),
+    (
+        GradientBoostingClassifier(n_estimators=50, random_state=0),
+        {"learning_rate": np.logspace(-4, 0, 10)},
+    ),
+    (SVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
 ]
 
 names = [get_name(e) for e, g in classifiers]
@@ -94,57 +94,62 @@ def get_name(estimator):
 datasets = [
     make_moons(n_samples=n_samples, noise=0.2, random_state=0),
     make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
-    make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
-                        n_informative=2, random_state=2,
-                        n_clusters_per_class=1)
+    make_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_redundant=0,
+        n_informative=2,
+        random_state=2,
+        n_clusters_per_class=1,
+    ),
 ]
 
-fig, axes = plt.subplots(nrows=len(datasets), ncols=len(classifiers) + 1,
-                         figsize=(21, 9))
+fig, axes = plt.subplots(
+    nrows=len(datasets), ncols=len(classifiers) + 1, figsize=(21, 9)
+)
 
 cm = plt.cm.PiYG
-cm_bright = ListedColormap(['#b30065', '#178000'])
+cm_bright = ListedColormap(["#b30065", "#178000"])
 
 # iterate over datasets
 for ds_cnt, (X, y) in enumerate(datasets):
-    print('\ndataset %d\n---------' % ds_cnt)
+    print("\ndataset %d\n---------" % ds_cnt)
 
     # preprocess dataset, split into training and test part
     X = StandardScaler().fit_transform(X)
     X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=.5, random_state=42)
+        X, y, test_size=0.5, random_state=42
+    )
 
     # create the grid for background colors
-    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
-    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
-    xx, yy = np.meshgrid(
-        np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
+    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
+    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
     # plot the dataset first
     ax = axes[ds_cnt, 0]
     if ds_cnt == 0:
         ax.set_title("Input data")
     # plot the training points
-    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
-               edgecolors='k')
+    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
     # and testing points
-    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
-               edgecolors='k')
+    ax.scatter(
+        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
+    )
     ax.set_xlim(xx.min(), xx.max())
     ax.set_ylim(yy.min(), yy.max())
     ax.set_xticks(())
     ax.set_yticks(())
 
     # iterate over classifiers
-    for est_idx, (name, (estimator, param_grid)) in \
-            enumerate(zip(names, classifiers)):
+    for est_idx, (name, (estimator, param_grid)) in enumerate(zip(names, classifiers)):
         ax = axes[ds_cnt, est_idx + 1]
 
         clf = GridSearchCV(estimator=estimator, param_grid=param_grid)
         with ignore_warnings(category=ConvergenceWarning):
             clf.fit(X_train, y_train)
         score = clf.score(X_test, y_test)
-        print('%s: %.2f' % (name, score))
+        print("%s: %.2f" % (name, score))
 
         # plot the decision boundary. For that, we will assign a color to each
         # point in the mesh [x_min, x_max]*[y_min, y_max].
@@ -155,24 +160,37 @@ def get_name(estimator):
 
         # put the result into a color plot
         Z = Z.reshape(xx.shape)
-        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
+        ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
 
         # plot the training points
-        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
-                   edgecolors='k')
+        ax.scatter(
+            X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
+        )
         # and testing points
-        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
-                   edgecolors='k', alpha=0.6)
+        ax.scatter(
+            X_test[:, 0],
+            X_test[:, 1],
+            c=y_test,
+            cmap=cm_bright,
+            edgecolors="k",
+            alpha=0.6,
+        )
         ax.set_xlim(xx.min(), xx.max())
         ax.set_ylim(yy.min(), yy.max())
         ax.set_xticks(())
         ax.set_yticks(())
 
         if ds_cnt == 0:
-            ax.set_title(name.replace(' + ', '\n'))
-        ax.text(0.95, 0.06, ('%.2f' % score).lstrip('0'), size=15,
-                bbox=dict(boxstyle='round', alpha=0.8, facecolor='white'),
-                transform=ax.transAxes, horizontalalignment='right')
+            ax.set_title(name.replace(" + ", "\n"))
+        ax.text(
+            0.95,
+            0.06,
+            ("%.2f" % score).lstrip("0"),
+            size=15,
+            bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"),
+            transform=ax.transAxes,
+            horizontalalignment="right",
+        )
 
 
 plt.tight_layout()
@@ -180,12 +198,18 @@ def get_name(estimator):
 # Add suptitles above the figure
 plt.subplots_adjust(top=0.90)
 suptitles = [
-    'Linear classifiers',
-    'Feature discretization and linear classifiers',
-    'Non-linear classifiers',
+    "Linear classifiers",
+    "Feature discretization and linear classifiers",
+    "Non-linear classifiers",
 ]
 for i, suptitle in zip([1, 3, 5], suptitles):
     ax = axes[0, i]
-    ax.text(1.05, 1.25, suptitle, transform=ax.transAxes,
-            horizontalalignment='center', size='x-large')
+    ax.text(
+        1.05,
+        1.25,
+        suptitle,
+        transform=ax.transAxes,
+        horizontalalignment="center",
+        size="x-large",
+    )
 plt.show()
diff --git a/examples/preprocessing/plot_discretization_strategies.py b/examples/preprocessing/plot_discretization_strategies.py
index 9ef211a83ccf3..bee3a6314cd52 100644
--- a/examples/preprocessing/plot_discretization_strategies.py
+++ b/examples/preprocessing/plot_discretization_strategies.py
@@ -27,7 +27,7 @@
 
 print(__doc__)
 
-strategies = ['uniform', 'quantile', 'kmeans']
+strategies = ["uniform", "quantile", "kmeans"]
 
 n_samples = 200
 centers_0 = np.array([[0, 0], [0, 5], [2, 4], [8, 8]])
@@ -37,13 +37,23 @@
 random_state = 42
 X_list = [
     np.random.RandomState(random_state).uniform(-3, 3, size=(n_samples, 2)),
-    make_blobs(n_samples=[n_samples // 10, n_samples * 4 // 10,
-                          n_samples // 10, n_samples * 4 // 10],
-               cluster_std=0.5, centers=centers_0,
-               random_state=random_state)[0],
-    make_blobs(n_samples=[n_samples // 5, n_samples * 4 // 5],
-               cluster_std=0.5, centers=centers_1,
-               random_state=random_state)[0],
+    make_blobs(
+        n_samples=[
+            n_samples // 10,
+            n_samples * 4 // 10,
+            n_samples // 10,
+            n_samples * 4 // 10,
+        ],
+        cluster_std=0.5,
+        centers=centers_0,
+        random_state=random_state,
+    )[0],
+    make_blobs(
+        n_samples=[n_samples // 5, n_samples * 4 // 5],
+        cluster_std=0.5,
+        centers=centers_1,
+        random_state=random_state,
+    )[0],
 ]
 
 figure = plt.figure(figsize=(14, 9))
@@ -51,13 +61,14 @@
 for ds_cnt, X in enumerate(X_list):
 
     ax = plt.subplot(len(X_list), len(strategies) + 1, i)
-    ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
+    ax.scatter(X[:, 0], X[:, 1], edgecolors="k")
     if ds_cnt == 0:
         ax.set_title("Input data", size=14)
 
     xx, yy = np.meshgrid(
         np.linspace(X[:, 0].min(), X[:, 0].max(), 300),
-        np.linspace(X[:, 1].min(), X[:, 1].max(), 300))
+        np.linspace(X[:, 1].min(), X[:, 1].max(), 300),
+    )
     grid = np.c_[xx.ravel(), yy.ravel()]
 
     ax.set_xlim(xx.min(), xx.max())
@@ -68,7 +79,7 @@
     i += 1
     # transform the dataset with KBinsDiscretizer
     for strategy in strategies:
-        enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy)
+        enc = KBinsDiscretizer(n_bins=4, encode="ordinal", strategy=strategy)
         enc.fit(X)
         grid_encoded = enc.transform(grid)
 
@@ -76,18 +87,18 @@
 
         # horizontal stripes
         horizontal = grid_encoded[:, 0].reshape(xx.shape)
-        ax.contourf(xx, yy, horizontal, alpha=.5)
+        ax.contourf(xx, yy, horizontal, alpha=0.5)
         # vertical stripes
         vertical = grid_encoded[:, 1].reshape(xx.shape)
-        ax.contourf(xx, yy, vertical, alpha=.5)
+        ax.contourf(xx, yy, vertical, alpha=0.5)
 
-        ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
+        ax.scatter(X[:, 0], X[:, 1], edgecolors="k")
         ax.set_xlim(xx.min(), xx.max())
         ax.set_ylim(yy.min(), yy.max())
         ax.set_xticks(())
         ax.set_yticks(())
         if ds_cnt == 0:
-            ax.set_title("strategy='%s'" % (strategy, ), size=14)
+            ax.set_title("strategy='%s'" % (strategy,), size=14)
 
         i += 1
 
diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py
index 581ca20a83a42..ef605cedbe5f5 100644
--- a/examples/preprocessing/plot_map_data_to_normal.py
+++ b/examples/preprocessing/plot_map_data_to_normal.py
@@ -53,12 +53,13 @@
 
 
 rng = np.random.RandomState(304)
-bc = PowerTransformer(method='box-cox')
-yj = PowerTransformer(method='yeo-johnson')
+bc = PowerTransformer(method="box-cox")
+yj = PowerTransformer(method="yeo-johnson")
 # n_quantiles is set to the training set size rather than the default value
 # to avoid a warning being raised by this example
-qt = QuantileTransformer(n_quantiles=500, output_distribution='normal',
-                         random_state=rng)
+qt = QuantileTransformer(
+    n_quantiles=500, output_distribution="normal", random_state=rng
+)
 size = (N_SAMPLES, 1)
 
 
@@ -88,28 +89,32 @@
 
 # create plots
 distributions = [
-    ('Lognormal', X_lognormal),
-    ('Chi-squared', X_chisq),
-    ('Weibull', X_weibull),
-    ('Gaussian', X_gaussian),
-    ('Uniform', X_uniform),
-    ('Bimodal', X_bimodal)
+    ("Lognormal", X_lognormal),
+    ("Chi-squared", X_chisq),
+    ("Weibull", X_weibull),
+    ("Gaussian", X_gaussian),
+    ("Uniform", X_uniform),
+    ("Bimodal", X_bimodal),
 ]
 
-colors = ['#D81B60', '#0188FF', '#FFC107',
-          '#B7A2FF', '#000000', '#2EC5AC']
+colors = ["#D81B60", "#0188FF", "#FFC107", "#B7A2FF", "#000000", "#2EC5AC"]
 
 fig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2))
 axes = axes.flatten()
-axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21),
-             (13, 16, 19, 22), (14, 17, 20, 23)]
-axes_list = [(axes[i], axes[j], axes[k], axes[l])
-             for (i, j, k, l) in axes_idxs]
+axes_idxs = [
+    (0, 3, 6, 9),
+    (1, 4, 7, 10),
+    (2, 5, 8, 11),
+    (12, 15, 18, 21),
+    (13, 16, 19, 22),
+    (14, 17, 20, 23),
+]
+axes_list = [(axes[i], axes[j], axes[k], axes[l]) for (i, j, k, l) in axes_idxs]
 
 
 for distribution, color, axes in zip(distributions, colors, axes_list):
     name, X = distribution
-    X_train, X_test = train_test_split(X, test_size=.5)
+    X_train, X_test = train_test_split(X, test_size=0.5)
 
     # perform power transforms and quantile transform
     X_trans_bc = bc.fit(X_train).transform(X_test)
@@ -122,19 +127,20 @@
 
     ax_original.hist(X_train, color=color, bins=BINS)
     ax_original.set_title(name, fontsize=FONT_SIZE)
-    ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
+    ax_original.tick_params(axis="both", which="major", labelsize=FONT_SIZE)
 
     for ax, X_trans, meth_name, lmbda in zip(
-            (ax_bc, ax_yj, ax_qt),
-            (X_trans_bc, X_trans_yj, X_trans_qt),
-            ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
-            (lmbda_bc, lmbda_yj, None)):
+        (ax_bc, ax_yj, ax_qt),
+        (X_trans_bc, X_trans_yj, X_trans_qt),
+        ("Box-Cox", "Yeo-Johnson", "Quantile transform"),
+        (lmbda_bc, lmbda_yj, None),
+    ):
         ax.hist(X_trans, color=color, bins=BINS)
-        title = 'After {}'.format(meth_name)
+        title = "After {}".format(meth_name)
         if lmbda is not None:
-            title += '\n$\\lambda$ = {}'.format(lmbda)
+            title += "\n$\\lambda$ = {}".format(lmbda)
         ax.set_title(title, fontsize=FONT_SIZE)
-        ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
+        ax.tick_params(axis="both", which="major", labelsize=FONT_SIZE)
         ax.set_xlim([-3.5, 3.5])
 
 
diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py
index b24786e1a018d..f80debf306867 100644
--- a/examples/preprocessing/plot_scaling_importance.py
+++ b/examples/preprocessing/plot_scaling_importance.py
@@ -49,6 +49,7 @@
 import matplotlib.pyplot as plt
 from sklearn.datasets import load_wine
 from sklearn.pipeline import make_pipeline
+
 print(__doc__)
 
 # Code source: Tyler Lanigan <tylerlanigan@gmail.com>
@@ -63,9 +64,9 @@
 features, target = load_wine(return_X_y=True)
 
 # Make a train/test split using 30% test size
-X_train, X_test, y_train, y_test = train_test_split(features, target,
-                                                    test_size=0.30,
-                                                    random_state=RANDOM_STATE)
+X_train, X_test, y_train, y_test = train_test_split(
+    features, target, test_size=0.30, random_state=RANDOM_STATE
+)
 
 # Fit to data and predict using pipelined GNB and PCA.
 unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
@@ -78,54 +79,56 @@
 pred_test_std = std_clf.predict(X_test)
 
 # Show prediction accuracies in scaled and unscaled data.
-print('\nPrediction accuracy for the normal test dataset with PCA')
-print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))
+print("\nPrediction accuracy for the normal test dataset with PCA")
+print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test)))
 
-print('\nPrediction accuracy for the standardized test dataset with PCA')
-print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))
+print("\nPrediction accuracy for the standardized test dataset with PCA")
+print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test_std)))
 
 # Extract PCA from pipeline
-pca = unscaled_clf.named_steps['pca']
-pca_std = std_clf.named_steps['pca']
+pca = unscaled_clf.named_steps["pca"]
+pca_std = std_clf.named_steps["pca"]
 
 # Show first principal components
-print('\nPC 1 without scaling:\n', pca.components_[0])
-print('\nPC 1 with scaling:\n', pca_std.components_[0])
+print("\nPC 1 without scaling:\n", pca.components_[0])
+print("\nPC 1 with scaling:\n", pca_std.components_[0])
 
 # Use PCA without and with scale on X_train data for visualization.
 X_train_transformed = pca.transform(X_train)
-scaler = std_clf.named_steps['standardscaler']
+scaler = std_clf.named_steps["standardscaler"]
 X_train_std_transformed = pca_std.transform(scaler.transform(X_train))
 
 # visualize standardized vs. untouched dataset with PCA performed
 fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)
 
 
-for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
-    ax1.scatter(X_train_transformed[y_train == l, 0],
-                X_train_transformed[y_train == l, 1],
-                color=c,
-                label='class %s' % l,
-                alpha=0.5,
-                marker=m
-                )
-
-for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
-    ax2.scatter(X_train_std_transformed[y_train == l, 0],
-                X_train_std_transformed[y_train == l, 1],
-                color=c,
-                label='class %s' % l,
-                alpha=0.5,
-                marker=m
-                )
-
-ax1.set_title('Training dataset after PCA')
-ax2.set_title('Standardized training dataset after PCA')
+for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")):
+    ax1.scatter(
+        X_train_transformed[y_train == l, 0],
+        X_train_transformed[y_train == l, 1],
+        color=c,
+        label="class %s" % l,
+        alpha=0.5,
+        marker=m,
+    )
+
+for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")):
+    ax2.scatter(
+        X_train_std_transformed[y_train == l, 0],
+        X_train_std_transformed[y_train == l, 1],
+        color=c,
+        label="class %s" % l,
+        alpha=0.5,
+        marker=m,
+    )
+
+ax1.set_title("Training dataset after PCA")
+ax2.set_title("Standardized training dataset after PCA")
 
 for ax in (ax1, ax2):
-    ax.set_xlabel('1st principal component')
-    ax.set_ylabel('2nd principal component')
-    ax.legend(loc='upper right')
+    ax.set_xlabel("1st principal component")
+    ax.set_ylabel("2nd principal component")
+    ax.legend(loc="upper right")
     ax.grid()
 
 plt.tight_layout()
diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py
index cc0cfe674c61d..adac61ed9688f 100644
--- a/examples/release_highlights/plot_release_highlights_0_22_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_22_0.py
@@ -81,16 +81,11 @@
 
 X, y = load_iris(return_X_y=True)
 estimators = [
-    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
-    ('svr', make_pipeline(StandardScaler(),
-                          LinearSVC(random_state=42)))
+    ("rf", RandomForestClassifier(n_estimators=10, random_state=42)),
+    ("svr", make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
 ]
-clf = StackingClassifier(
-    estimators=estimators, final_estimator=LogisticRegression()
-)
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, stratify=y, random_state=42
-)
+clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
 clf.fit(X_train, y_train).score(X_test, y_test)
 
 # %%
@@ -107,16 +102,16 @@
 from sklearn.inspection import permutation_importance
 
 X, y = make_classification(random_state=0, n_features=5, n_informative=3)
-feature_names = np.array([f'x_{i}' for i in range(X.shape[1])])
+feature_names = np.array([f"x_{i}" for i in range(X.shape[1])])
 
 rf = RandomForestClassifier(random_state=0).fit(X, y)
-result = permutation_importance(rf, X, y, n_repeats=10, random_state=0,
-                                n_jobs=-1)
+result = permutation_importance(rf, X, y, n_repeats=10, random_state=0, n_jobs=-1)
 
 fig, ax = plt.subplots()
 sorted_idx = result.importances_mean.argsort()
-ax.boxplot(result.importances[sorted_idx].T,
-           vert=False, labels=feature_names[sorted_idx])
+ax.boxplot(
+    result.importances[sorted_idx].T, vert=False, labels=feature_names[sorted_idx]
+)
 ax.set_title("Permutation Importance of each feature")
 ax.set_ylabel("Features")
 fig.tight_layout()
@@ -161,9 +156,10 @@
 
 with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
     estimator = make_pipeline(
-        KNeighborsTransformer(n_neighbors=10, mode='distance'),
-        Isomap(n_neighbors=10, metric='precomputed'),
-        memory=tmpdir)
+        KNeighborsTransformer(n_neighbors=10, mode="distance"),
+        Isomap(n_neighbors=10, metric="precomputed"),
+        memory=tmpdir,
+    )
     estimator.fit(X)
 
     # We can decrease the number of neighbors and the graph will not be
@@ -204,12 +200,18 @@
 X, y = make_classification(random_state=0)
 
 rf = RandomForestClassifier(random_state=0, ccp_alpha=0).fit(X, y)
-print("Average number of nodes without pruning {:.1f}".format(
-    np.mean([e.tree_.node_count for e in rf.estimators_])))
+print(
+    "Average number of nodes without pruning {:.1f}".format(
+        np.mean([e.tree_.node_count for e in rf.estimators_])
+    )
+)
 
 rf = RandomForestClassifier(random_state=0, ccp_alpha=0.05).fit(X, y)
-print("Average number of nodes with pruning {:.1f}".format(
-    np.mean([e.tree_.node_count for e in rf.estimators_])))
+print(
+    "Average number of nodes with pruning {:.1f}".format(
+        np.mean([e.tree_.node_count for e in rf.estimators_])
+    )
+)
 
 # %%
 # Retrieve dataframes from OpenML
@@ -219,8 +221,8 @@
 
 from sklearn.datasets import fetch_openml
 
-titanic = fetch_openml('titanic', version=1, as_frame=True)
-print(titanic.data.head()[['pclass', 'embarked']])
+titanic = fetch_openml("titanic", version=1, as_frame=True)
+print(titanic.data.head()[["pclass", "embarked"]])
 
 # %%
 # Checking scikit-learn compatibility of an estimator
@@ -245,6 +247,7 @@
 def test_sklearn_compatible_estimator(estimator, check):
     check(estimator)
 
+
 # %%
 # ROC AUC now supports multiclass classification
 # ----------------------------------------------
@@ -266,5 +269,5 @@ def test_sklearn_compatible_estimator(estimator, check):
 from sklearn.metrics import roc_auc_score
 
 X, y = make_classification(n_classes=4, n_informative=16)
-clf = SVC(decision_function_shape='ovo', probability=True).fit(X, y)
-print(roc_auc_score(y, clf.predict_proba(X), multi_class='ovo'))
+clf = SVC(decision_function_shape="ovo", probability=True).fit(X, y)
+print(roc_auc_score(y, clf.predict_proba(X), multi_class="ovo"))
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py
index 409c41a035540..d81f5886e6c63 100644
--- a/examples/release_highlights/plot_release_highlights_0_23_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_23_0.py
@@ -45,7 +45,7 @@
 y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
 glm = PoissonRegressor()
-gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
+gbdt = HistGradientBoostingRegressor(loss="poisson", learning_rate=0.01)
 glm.fit(X_train, y_train)
 gbdt.fit(X_train, y_train)
 print(glm.score(X_test, y_test))
@@ -67,16 +67,19 @@
 from sklearn.impute import SimpleImputer
 from sklearn.compose import make_column_transformer
 from sklearn.linear_model import LogisticRegression
-set_config(display='diagram')
 
-num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
+set_config(display="diagram")
+
+num_proc = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
 
 cat_proc = make_pipeline(
-    SimpleImputer(strategy='constant', fill_value='missing'),
-    OneHotEncoder(handle_unknown='ignore'))
+    SimpleImputer(strategy="constant", fill_value="missing"),
+    OneHotEncoder(handle_unknown="ignore"),
+)
 
-preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
-                                       (cat_proc, ('feat0', 'feat2')))
+preprocessor = make_column_transformer(
+    (num_proc, ("feat1", "feat3")), (cat_proc, ("feat0", "feat2"))
+)
 
 clf = make_pipeline(preprocessor, LogisticRegression())
 clf
@@ -101,7 +104,7 @@
 X, y = make_blobs(random_state=rng)
 X = scipy.sparse.csr_matrix(X)
 X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
-kmeans = KMeans(algorithm='elkan').fit(X_train)
+kmeans = KMeans(algorithm="elkan").fit(X_train)
 print(completeness_score(kmeans.predict(X_test), y_test))
 
 ##############################################################################
@@ -129,21 +132,30 @@
 rng = np.random.RandomState(0)
 X = rng.randn(n_samples, 2)
 noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
-y = (5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise)
+y = 5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise
 
 gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
 gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y)
 
 disp = plot_partial_dependence(
-    gbdt_no_cst, X, features=[0], feature_names=['feature 0'],
-    line_kw={'linewidth': 4, 'label': 'unconstrained', "color": "tab:blue"})
-plot_partial_dependence(gbdt_cst, X, features=[0],
-    line_kw={'linewidth': 4, 'label': 'constrained', "color": "tab:orange"},
-    ax=disp.axes_)
+    gbdt_no_cst,
+    X,
+    features=[0],
+    feature_names=["feature 0"],
+    line_kw={"linewidth": 4, "label": "unconstrained", "color": "tab:blue"},
+)
+plot_partial_dependence(
+    gbdt_cst,
+    X,
+    features=[0],
+    line_kw={"linewidth": 4, "label": "constrained", "color": "tab:orange"},
+    ax=disp.axes_,
+)
 disp.axes_[0, 0].plot(
-    X[:, 0], y, 'o', alpha=.5, zorder=-1, label='samples', color="tab:green"
+    X[:, 0], y, "o", alpha=0.5, zorder=-1, label="samples", color="tab:green"
 )
-disp.axes_[0, 0].set_ylim(-3, 3); disp.axes_[0, 0].set_xlim(-1, 1)
+disp.axes_[0, 0].set_ylim(-3, 3)
+disp.axes_[0, 0].set_xlim(-1, 1)
 plt.legend()
 plt.show()
 
@@ -163,7 +175,8 @@
 X, y = make_regression(n_samples, n_features, random_state=rng)
 sample_weight = rng.rand(n_samples)
 X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
-    X, y, sample_weight, random_state=rng)
+    X, y, sample_weight, random_state=rng
+)
 reg = Lasso()
 reg.fit(X_train, y_train, sample_weight=sw_train)
 print(reg.score(X_test, y_test, sw_test))
diff --git a/examples/release_highlights/plot_release_highlights_0_24_0.py b/examples/release_highlights/plot_release_highlights_0_24_0.py
index f5b10dfb21acc..e54e91fe5aafb 100644
--- a/examples/release_highlights/plot_release_highlights_0_24_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_24_0.py
@@ -43,7 +43,7 @@
 # Read more in the :ref:`User Guide <successive_halving_user_guide>` (note:
 # the Successive Halving estimators are still :term:`experimental
 # <experimental>`).
-# 
+#
 # .. figure:: ../model_selection/images/sphx_glr_plot_successive_halving_iterations_001.png
 #   :target: ../model_selection/plot_successive_halving_iterations.html
 #   :align: center
@@ -61,14 +61,17 @@
 
 clf = RandomForestClassifier(n_estimators=10, random_state=rng)
 
-param_dist = {"max_depth": [3, None],
-              "max_features": randint(1, 11),
-              "min_samples_split": randint(2, 11),
-              "bootstrap": [True, False],
-              "criterion": ["gini", "entropy"]}
+param_dist = {
+    "max_depth": [3, None],
+    "max_features": randint(1, 11),
+    "min_samples_split": randint(2, 11),
+    "bootstrap": [True, False],
+    "criterion": ["gini", "entropy"],
+}
 
-rsh = HalvingRandomSearchCV(estimator=clf, param_distributions=param_dist,
-                            factor=2, random_state=rng)
+rsh = HalvingRandomSearchCV(
+    estimator=clf, param_distributions=param_dist, factor=2, random_state=rng
+)
 rsh.fit(X, y)
 rsh.best_params_
 
@@ -145,8 +148,10 @@
 knn = KNeighborsClassifier(n_neighbors=3)
 sfs = SequentialFeatureSelector(knn, n_features_to_select=2)
 sfs.fit(X, y)
-print("Features selected by forward sequential selection: "
-      f"{feature_names[sfs.get_support()].tolist()}")
+print(
+    "Features selected by forward sequential selection: "
+    f"{feature_names[sfs.get_support()].tolist()}"
+)
 
 ##############################################################################
 # New PolynomialCountSketch kernel approximation function
@@ -164,19 +169,20 @@
 from sklearn.linear_model import LogisticRegression
 
 X, y = fetch_covtype(return_X_y=True)
-pipe = make_pipeline(MinMaxScaler(),
-                     PolynomialCountSketch(degree=2, n_components=300),
-                     LogisticRegression(max_iter=1000))
-X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000,
-                                                    test_size=10000,
-                                                    random_state=42)
+pipe = make_pipeline(
+    MinMaxScaler(),
+    PolynomialCountSketch(degree=2, n_components=300),
+    LogisticRegression(max_iter=1000),
+)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, train_size=5000, test_size=10000, random_state=42
+)
 pipe.fit(X_train, y_train).score(X_test, y_test)
 
 ##############################################################################
 # For comparison, here is the score of a linear baseline for the same data:
 
-linear_baseline = make_pipeline(MinMaxScaler(),
-                                LogisticRegression(max_iter=1000))
+linear_baseline = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=1000))
 linear_baseline.fit(X_train, y_train).score(X_test, y_test)
 
 ##############################################################################
@@ -192,16 +198,22 @@
 from sklearn.inspection import plot_partial_dependence
 
 X, y = fetch_california_housing(return_X_y=True, as_frame=True)
-features = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms']
+features = ["MedInc", "AveOccup", "HouseAge", "AveRooms"]
 est = RandomForestRegressor(n_estimators=10)
 est.fit(X, y)
 display = plot_partial_dependence(
-       est, X, features, kind="individual", subsample=50,
-       n_jobs=3, grid_resolution=20, random_state=0
+    est,
+    X,
+    features,
+    kind="individual",
+    subsample=50,
+    n_jobs=3,
+    grid_resolution=20,
+    random_state=0,
 )
 display.figure_.suptitle(
-    'Partial dependence of house value on non-location features\n'
-    'for the California housing dataset, with BayesianRidge'
+    "Partial dependence of house value on non-location features\n"
+    "for the California housing dataset, with BayesianRidge"
 )
 display.figure_.subplots_adjust(hspace=0.3)
 
@@ -223,7 +235,7 @@
 # positive integer target correlated with X[:, 5] with many zeros:
 y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
-regressor = DecisionTreeRegressor(criterion='poisson', random_state=0)
+regressor = DecisionTreeRegressor(criterion="poisson", random_state=0)
 regressor.fit(X_train, y_train)
 
 ##############################################################################
diff --git a/examples/semi_supervised/plot_label_propagation_digits.py b/examples/semi_supervised/plot_label_propagation_digits.py
index 715546f78ab25..d75edca605b99 100644
--- a/examples/semi_supervised/plot_label_propagation_digits.py
+++ b/examples/semi_supervised/plot_label_propagation_digits.py
@@ -52,15 +52,17 @@ class will be very good.
 
 # #############################################################################
 # Learn with LabelSpreading
-lp_model = LabelSpreading(gamma=.25, max_iter=20)
+lp_model = LabelSpreading(gamma=0.25, max_iter=20)
 lp_model.fit(X, y_train)
 predicted_labels = lp_model.transduction_[unlabeled_set]
 true_labels = y[unlabeled_set]
 
 cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
 
-print("Label Spreading model: %d labeled & %d unlabeled points (%d total)" %
-      (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))
+print(
+    "Label Spreading model: %d labeled & %d unlabeled points (%d total)"
+    % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)
+)
 
 print(classification_report(true_labels, predicted_labels))
 
@@ -85,8 +87,9 @@ class will be very good.
     sub.imshow(image, cmap=plt.cm.gray_r)
     plt.xticks([])
     plt.yticks([])
-    sub.set_title('predict: %i\ntrue: %i' % (
-        lp_model.transduction_[image_index], y[image_index]))
+    sub.set_title(
+        "predict: %i\ntrue: %i" % (lp_model.transduction_[image_index], y[image_index])
+    )
 
-f.suptitle('Learning with small amount of labeled data')
+f.suptitle("Learning with small amount of labeled data")
 plt.show()
diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
index a05fa07d755d7..e6c19403aa728 100644
--- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
+++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
@@ -60,13 +60,13 @@
     predicted_labels = lp_model.transduction_[unlabeled_indices]
     true_labels = y[unlabeled_indices]
 
-    cm = confusion_matrix(true_labels, predicted_labels,
-                          labels=lp_model.classes_)
+    cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
 
     print("Iteration %i %s" % (i, 70 * "_"))
-    print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
-          % (n_labeled_points, n_total_samples - n_labeled_points,
-             n_total_samples))
+    print(
+        "Label Spreading model: %d labeled & %d unlabeled (%d total)"
+        % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)
+    )
 
     print(classification_report(true_labels, predicted_labels))
 
@@ -74,42 +74,50 @@
     print(cm)
 
     # compute the entropies of transduced label distributions
-    pred_entropies = stats.distributions.entropy(
-        lp_model.label_distributions_.T)
+    pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
 
     # select up to 5 digit examples that the classifier is most uncertain about
     uncertainty_index = np.argsort(pred_entropies)[::-1]
     uncertainty_index = uncertainty_index[
-        np.in1d(uncertainty_index, unlabeled_indices)][:5]
+        np.in1d(uncertainty_index, unlabeled_indices)
+    ][:5]
 
     # keep track of indices that we get labels for
     delete_indices = np.array([], dtype=int)
 
     # for more than 5 iterations, visualize the gain only on the first 5
     if i < 5:
-        f.text(.05, (1 - (i + 1) * .183),
-               "model %d\n\nfit with\n%d labels" %
-               ((i + 1), i * 5 + 10), size=10)
+        f.text(
+            0.05,
+            (1 - (i + 1) * 0.183),
+            "model %d\n\nfit with\n%d labels" % ((i + 1), i * 5 + 10),
+            size=10,
+        )
     for index, image_index in enumerate(uncertainty_index):
         image = images[image_index]
 
         # for more than 5 iterations, visualize the gain only on the first 5
         if i < 5:
             sub = f.add_subplot(5, 5, index + 1 + (5 * i))
-            sub.imshow(image, cmap=plt.cm.gray_r, interpolation='none')
-            sub.set_title("predict: %i\ntrue: %i" % (
-                lp_model.transduction_[image_index], y[image_index]), size=10)
-            sub.axis('off')
+            sub.imshow(image, cmap=plt.cm.gray_r, interpolation="none")
+            sub.set_title(
+                "predict: %i\ntrue: %i"
+                % (lp_model.transduction_[image_index], y[image_index]),
+                size=10,
+            )
+            sub.axis("off")
 
         # labeling 5 points, remote from labeled set
-        delete_index, = np.where(unlabeled_indices == image_index)
+        (delete_index,) = np.where(unlabeled_indices == image_index)
         delete_indices = np.concatenate((delete_indices, delete_index))
 
     unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
     n_labeled_points += len(uncertainty_index)
 
-f.suptitle("Active learning with Label Propagation.\nRows show 5 most "
-           "uncertain labels to learn with the next model.", y=1.15)
-plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2,
-                    hspace=0.85)
+f.suptitle(
+    "Active learning with Label Propagation.\nRows show 5 most "
+    "uncertain labels to learn with the next model.",
+    y=1.15,
+)
+plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2, hspace=0.85)
 plt.show()
diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py
index f0145bcd53ccb..e94eab6e16dc1 100644
--- a/examples/semi_supervised/plot_label_propagation_structure.py
+++ b/examples/semi_supervised/plot_label_propagation_structure.py
@@ -24,13 +24,13 @@
 n_samples = 200
 X, y = make_circles(n_samples=n_samples, shuffle=False)
 outer, inner = 0, 1
-labels = np.full(n_samples, -1.)
+labels = np.full(n_samples, -1.0)
 labels[0] = outer
 labels[-1] = inner
 
 # #############################################################################
 # Learn with LabelSpreading
-label_spread = LabelSpreading(kernel='knn', alpha=0.8)
+label_spread = LabelSpreading(kernel="knn", alpha=0.8)
 label_spread.fit(X, labels)
 
 # #############################################################################
@@ -38,24 +38,57 @@
 output_labels = label_spread.transduction_
 plt.figure(figsize=(8.5, 4))
 plt.subplot(1, 2, 1)
-plt.scatter(X[labels == outer, 0], X[labels == outer, 1], color='navy',
-            marker='s', lw=0, label="outer labeled", s=10)
-plt.scatter(X[labels == inner, 0], X[labels == inner, 1], color='c',
-            marker='s', lw=0, label='inner labeled', s=10)
-plt.scatter(X[labels == -1, 0], X[labels == -1, 1], color='darkorange',
-            marker='.', label='unlabeled')
-plt.legend(scatterpoints=1, shadow=False, loc='upper right')
+plt.scatter(
+    X[labels == outer, 0],
+    X[labels == outer, 1],
+    color="navy",
+    marker="s",
+    lw=0,
+    label="outer labeled",
+    s=10,
+)
+plt.scatter(
+    X[labels == inner, 0],
+    X[labels == inner, 1],
+    color="c",
+    marker="s",
+    lw=0,
+    label="inner labeled",
+    s=10,
+)
+plt.scatter(
+    X[labels == -1, 0],
+    X[labels == -1, 1],
+    color="darkorange",
+    marker=".",
+    label="unlabeled",
+)
+plt.legend(scatterpoints=1, shadow=False, loc="upper right")
 plt.title("Raw data (2 classes=outer and inner)")
 
 plt.subplot(1, 2, 2)
 output_label_array = np.asarray(output_labels)
 outer_numbers = np.where(output_label_array == outer)[0]
 inner_numbers = np.where(output_label_array == inner)[0]
-plt.scatter(X[outer_numbers, 0], X[outer_numbers, 1], color='navy',
-            marker='s', lw=0, s=10, label="outer learned")
-plt.scatter(X[inner_numbers, 0], X[inner_numbers, 1], color='c',
-            marker='s', lw=0, s=10, label="inner learned")
-plt.legend(scatterpoints=1, shadow=False, loc='upper right')
+plt.scatter(
+    X[outer_numbers, 0],
+    X[outer_numbers, 1],
+    color="navy",
+    marker="s",
+    lw=0,
+    s=10,
+    label="outer learned",
+)
+plt.scatter(
+    X[inner_numbers, 0],
+    X[inner_numbers, 1],
+    color="c",
+    marker="s",
+    lw=0,
+    s=10,
+    label="inner learned",
+)
+plt.legend(scatterpoints=1, shadow=False, loc="upper right")
 plt.title("Labels learned with Label Spreading (KNN)")
 
 plt.subplots_adjust(left=0.07, bottom=0.07, right=0.93, top=0.92)
diff --git a/examples/semi_supervised/plot_self_training_varying_threshold.py b/examples/semi_supervised/plot_self_training_varying_threshold.py
index c2c89a36b5e8c..fa2ac289086d3 100644
--- a/examples/semi_supervised/plot_self_training_varying_threshold.py
+++ b/examples/semi_supervised/plot_self_training_varying_threshold.py
@@ -58,8 +58,7 @@
 amount_iterations = np.empty((x_values.shape[0], n_splits))
 
 for (i, threshold) in enumerate(x_values):
-    self_training_clf = SelfTrainingClassifier(base_classifier,
-                                               threshold=threshold)
+    self_training_clf = SelfTrainingClassifier(base_classifier, threshold=threshold)
 
     # We need manual cross validation so that we don't treat -1 as a separate
     # class when computing accuracy
@@ -74,8 +73,10 @@
         self_training_clf.fit(X_train, y_train)
 
         # The amount of labeled samples that at the end of fitting
-        amount_labeled[i, fold] = total_samples - np.unique(
-            self_training_clf.labeled_iter_, return_counts=True)[1][0]
+        amount_labeled[i, fold] = (
+            total_samples
+            - np.unique(self_training_clf.labeled_iter_, return_counts=True)[1][0]
+        )
         # The last iteration the classifier labeled a sample in
         amount_iterations[i, fold] = np.max(self_training_clf.labeled_iter_)
 
@@ -84,26 +85,34 @@
 
 
 ax1 = plt.subplot(211)
-ax1.errorbar(x_values, scores.mean(axis=1),
-             yerr=scores.std(axis=1),
-             capsize=2, color='b')
-ax1.set_ylabel('Accuracy', color='b')
-ax1.tick_params('y', colors='b')
+ax1.errorbar(
+    x_values, scores.mean(axis=1), yerr=scores.std(axis=1), capsize=2, color="b"
+)
+ax1.set_ylabel("Accuracy", color="b")
+ax1.tick_params("y", colors="b")
 
 ax2 = ax1.twinx()
-ax2.errorbar(x_values, amount_labeled.mean(axis=1),
-             yerr=amount_labeled.std(axis=1),
-             capsize=2, color='g')
+ax2.errorbar(
+    x_values,
+    amount_labeled.mean(axis=1),
+    yerr=amount_labeled.std(axis=1),
+    capsize=2,
+    color="g",
+)
 ax2.set_ylim(bottom=0)
-ax2.set_ylabel('Amount of labeled samples', color='g')
-ax2.tick_params('y', colors='g')
+ax2.set_ylabel("Amount of labeled samples", color="g")
+ax2.tick_params("y", colors="g")
 
 ax3 = plt.subplot(212, sharex=ax1)
-ax3.errorbar(x_values, amount_iterations.mean(axis=1),
-             yerr=amount_iterations.std(axis=1),
-             capsize=2, color='b')
+ax3.errorbar(
+    x_values,
+    amount_iterations.mean(axis=1),
+    yerr=amount_iterations.std(axis=1),
+    capsize=2,
+    color="b",
+)
 ax3.set_ylim(bottom=0)
-ax3.set_ylabel('Amount of iterations')
-ax3.set_xlabel('Threshold')
+ax3.set_ylabel("Amount of iterations")
+ax3.set_xlabel("Threshold")
 
 plt.show()
diff --git a/examples/semi_supervised/plot_semi_supervised_newsgroups.py b/examples/semi_supervised/plot_semi_supervised_newsgroups.py
index 72815471f54b9..7316417e86120 100644
--- a/examples/semi_supervised/plot_semi_supervised_newsgroups.py
+++ b/examples/semi_supervised/plot_semi_supervised_newsgroups.py
@@ -24,45 +24,52 @@
 from sklearn.semi_supervised import LabelSpreading
 from sklearn.metrics import f1_score
 
-data = fetch_20newsgroups(subset='train', categories=None)
+data = fetch_20newsgroups(subset="train", categories=None)
 print("%d documents" % len(data.filenames))
 print("%d categories" % len(data.target_names))
 print()
 
 # Parameters
-sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')
+sdg_params = dict(alpha=1e-5, penalty="l2", loss="log")
 vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)
 
 # Supervised Pipeline
-pipeline = Pipeline([
-    ('vect', CountVectorizer(**vectorizer_params)),
-    ('tfidf', TfidfTransformer()),
-    ('clf', SGDClassifier(**sdg_params)),
-])
+pipeline = Pipeline(
+    [
+        ("vect", CountVectorizer(**vectorizer_params)),
+        ("tfidf", TfidfTransformer()),
+        ("clf", SGDClassifier(**sdg_params)),
+    ]
+)
 # SelfTraining Pipeline
-st_pipeline = Pipeline([
-    ('vect', CountVectorizer(**vectorizer_params)),
-    ('tfidf', TfidfTransformer()),
-    ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
-])
+st_pipeline = Pipeline(
+    [
+        ("vect", CountVectorizer(**vectorizer_params)),
+        ("tfidf", TfidfTransformer()),
+        ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
+    ]
+)
 # LabelSpreading Pipeline
-ls_pipeline = Pipeline([
-    ('vect', CountVectorizer(**vectorizer_params)),
-    ('tfidf', TfidfTransformer()),
-    # LabelSpreading does not support dense matrices
-    ('todense', FunctionTransformer(lambda x: x.todense())),
-    ('clf', LabelSpreading()),
-])
+ls_pipeline = Pipeline(
+    [
+        ("vect", CountVectorizer(**vectorizer_params)),
+        ("tfidf", TfidfTransformer()),
+        # LabelSpreading does not support dense matrices
+        ("todense", FunctionTransformer(lambda x: x.todense())),
+        ("clf", LabelSpreading()),
+    ]
+)
 
 
 def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
     print("Number of training samples:", len(X_train))
-    print("Unlabeled samples in training set:",
-          sum(1 for x in y_train if x == -1))
+    print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1))
     clf.fit(X_train, y_train)
     y_pred = clf.predict(X_test)
-    print("Micro-averaged F1 score on test set: "
-          "%0.3f" % f1_score(y_test, y_pred, average='micro'))
+    print(
+        "Micro-averaged F1 score on test set: %0.3f"
+        % f1_score(y_test, y_pred, average="micro")
+    )
     print("-" * 10)
     print()
 
@@ -78,18 +85,18 @@ def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
     y_mask = np.random.rand(len(y_train)) < 0.2
 
     # X_20 and y_20 are the subset of the train dataset indicated by the mask
-    X_20, y_20 = map(list, zip(*((x, y)
-                     for x, y, m in zip(X_train, y_train, y_mask) if m)))
+    X_20, y_20 = map(
+        list, zip(*((x, y) for x, y, m in zip(X_train, y_train, y_mask) if m))
+    )
     print("Supervised SGDClassifier on 20% of the training data:")
     eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)
 
     # set the non-masked subset to be unlabeled
     y_train[~y_mask] = -1
-    print("SelfTrainingClassifier on 20% of the training data (rest "
-          "is unlabeled):")
+    print("SelfTrainingClassifier on 20% of the training data (rest is unlabeled):")
     eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)
 
-    if 'CI' not in os.environ:
+    if "CI" not in os.environ:
         # LabelSpreading takes too long to run in the online documentation
         print("LabelSpreading on 20% of the data (rest is unlabeled):")
         eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)
diff --git a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
index f93dc2f28370e..f9703fd44a902 100644
--- a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
+++ b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
@@ -33,7 +33,7 @@
 y = iris.target
 
 # step size in the mesh
-h = .02
+h = 0.02
 
 rng = np.random.RandomState(0)
 y_rand = rng.rand(y.shape[0])
@@ -43,26 +43,31 @@
 y_50[y_rand < 0.5] = -1
 # we create an instance of SVM and fit out data. We do not scale our
 # data since we want to plot the support vectors
-ls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data')
-ls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data')
-ls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data')
+ls30 = (LabelSpreading().fit(X, y_30), y_30, "Label Spreading 30% data")
+ls50 = (LabelSpreading().fit(X, y_50), y_50, "Label Spreading 50% data")
+ls100 = (LabelSpreading().fit(X, y), y, "Label Spreading 100% data")
 
 # the base classifier for self-training is identical to the SVC
-base_classifier = SVC(kernel='rbf', gamma=.5, probability=True)
-st30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30),
-        y_30, 'Self-training 30% data')
-st50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50),
-        y_50, 'Self-training 50% data')
-
-rbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel')
+base_classifier = SVC(kernel="rbf", gamma=0.5, probability=True)
+st30 = (
+    SelfTrainingClassifier(base_classifier).fit(X, y_30),
+    y_30,
+    "Self-training 30% data",
+)
+st50 = (
+    SelfTrainingClassifier(base_classifier).fit(X, y_50),
+    y_50,
+    "Self-training 50% data",
+)
+
+rbf_svc = (SVC(kernel="rbf", gamma=0.5).fit(X, y), y, "SVC with rbf kernel")
 
 # create a mesh to plot in
 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                     np.arange(y_min, y_max, h))
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
-color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}
+color_map = {-1: (1, 1, 1), 0: (0, 0, 0.9), 1: (1, 0, 0), 2: (0.8, 0.6, 0)}
 
 classifiers = (ls30, st30, ls50, st50, ls100, rbf_svc)
 for i, (clf, y_train, title) in enumerate(classifiers):
@@ -74,11 +79,11 @@
     # Put the result into a color plot
     Z = Z.reshape(xx.shape)
     plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
-    plt.axis('off')
+    plt.axis("off")
 
     # Plot also the training points
     colors = [color_map[y] for y in y_train]
-    plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors='black')
+    plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors="black")
 
     plt.title(title)
 
diff --git a/examples/svm/plot_custom_kernel.py b/examples/svm/plot_custom_kernel.py
index 5ee70f8cc7801..86fe0aa8e585e 100644
--- a/examples/svm/plot_custom_kernel.py
+++ b/examples/svm/plot_custom_kernel.py
@@ -32,7 +32,7 @@ def my_kernel(X, Y):
     return np.dot(np.dot(X, M), Y.T)
 
 
-h = .02  # step size in the mesh
+h = 0.02  # step size in the mesh
 
 # we create an instance of SVM and fit out data.
 clf = svm.SVC(kernel=my_kernel)
@@ -50,8 +50,7 @@ def my_kernel(X, Y):
 plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
 
 # Plot also the training points
-plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')
-plt.title('3-Class classification using Support Vector Machine with custom'
-          ' kernel')
-plt.axis('tight')
+plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors="k")
+plt.title("3-Class classification using Support Vector Machine with custom kernel")
+plt.axis("tight")
 plt.show()
diff --git a/examples/svm/plot_iris_svc.py b/examples/svm/plot_iris_svc.py
index ab7860296985c..10b64f1c62e09 100644
--- a/examples/svm/plot_iris_svc.py
+++ b/examples/svm/plot_iris_svc.py
@@ -40,7 +40,7 @@
 from sklearn import svm, datasets
 
 
-def make_meshgrid(x, y, h=.02):
+def make_meshgrid(x, y, h=0.02):
     """Create a mesh of points to plot in
 
     Parameters
@@ -55,8 +55,7 @@ def make_meshgrid(x, y, h=.02):
     """
     x_min, x_max = x.min() - 1, x.max() + 1
     y_min, y_max = y.min() - 1, y.max() + 1
-    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
-                         np.arange(y_min, y_max, h))
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
     return xx, yy
 
 
@@ -86,17 +85,21 @@ def plot_contours(ax, clf, xx, yy, **params):
 # we create an instance of SVM and fit out data. We do not scale our
 # data since we want to plot the support vectors
 C = 1.0  # SVM regularization parameter
-models = (svm.SVC(kernel='linear', C=C),
-          svm.LinearSVC(C=C, max_iter=10000),
-          svm.SVC(kernel='rbf', gamma=0.7, C=C),
-          svm.SVC(kernel='poly', degree=3, gamma='auto', C=C))
+models = (
+    svm.SVC(kernel="linear", C=C),
+    svm.LinearSVC(C=C, max_iter=10000),
+    svm.SVC(kernel="rbf", gamma=0.7, C=C),
+    svm.SVC(kernel="poly", degree=3, gamma="auto", C=C),
+)
 models = (clf.fit(X, y) for clf in models)
 
 # title for the plots
-titles = ('SVC with linear kernel',
-          'LinearSVC (linear kernel)',
-          'SVC with RBF kernel',
-          'SVC with polynomial (degree 3) kernel')
+titles = (
+    "SVC with linear kernel",
+    "LinearSVC (linear kernel)",
+    "SVC with RBF kernel",
+    "SVC with polynomial (degree 3) kernel",
+)
 
 # Set-up 2x2 grid for plotting.
 fig, sub = plt.subplots(2, 2)
@@ -106,13 +109,12 @@ def plot_contours(ax, clf, xx, yy, **params):
 xx, yy = make_meshgrid(X0, X1)
 
 for clf, title, ax in zip(models, titles, sub.flatten()):
-    plot_contours(ax, clf, xx, yy,
-                  cmap=plt.cm.coolwarm, alpha=0.8)
-    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
+    plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
+    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
     ax.set_xlim(xx.min(), xx.max())
     ax.set_ylim(yy.min(), yy.max())
-    ax.set_xlabel('Sepal length')
-    ax.set_ylabel('Sepal width')
+    ax.set_xlabel("Sepal length")
+    ax.set_ylabel("Sepal width")
     ax.set_xticks(())
     ax.set_yticks(())
     ax.set_title(title)
diff --git a/examples/svm/plot_linearsvc_support_vectors.py b/examples/svm/plot_linearsvc_support_vectors.py
index cc7e9caa5cda8..298ec5e2419fb 100644
--- a/examples/svm/plot_linearsvc_support_vectors.py
+++ b/examples/svm/plot_linearsvc_support_vectors.py
@@ -26,8 +26,7 @@
     # decision_function = np.dot(X, clf.coef_[0]) + clf.intercept_[0]
     # The support vectors are the samples that lie within the margin
     # boundaries, whose size is conventionally constrained to 1
-    support_vector_indices = np.where(
-        np.abs(decision_function) <= 1 + 1e-15)[0]
+    support_vector_indices = np.where(np.abs(decision_function) <= 1 + 1e-15)[0]
     support_vectors = X[support_vector_indices]
 
     plt.subplot(1, 2, i + 1)
@@ -35,14 +34,28 @@
     ax = plt.gca()
     xlim = ax.get_xlim()
     ylim = ax.get_ylim()
-    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50),
-                         np.linspace(ylim[0], ylim[1], 50))
+    xx, yy = np.meshgrid(
+        np.linspace(xlim[0], xlim[1], 50), np.linspace(ylim[0], ylim[1], 50)
+    )
     Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
     Z = Z.reshape(xx.shape)
-    plt.contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
-                linestyles=['--', '-', '--'])
-    plt.scatter(support_vectors[:, 0], support_vectors[:, 1], s=100,
-                linewidth=1, facecolors='none', edgecolors='k')
+    plt.contour(
+        xx,
+        yy,
+        Z,
+        colors="k",
+        levels=[-1, 0, 1],
+        alpha=0.5,
+        linestyles=["--", "-", "--"],
+    )
+    plt.scatter(
+        support_vectors[:, 0],
+        support_vectors[:, 1],
+        s=100,
+        linewidth=1,
+        facecolors="none",
+        edgecolors="k",
+    )
     plt.title("C=" + str(C))
 plt.tight_layout()
 plt.show()
diff --git a/examples/svm/plot_oneclass.py b/examples/svm/plot_oneclass.py
index 3f04537ca1f00..a3b060a0e67c0 100644
--- a/examples/svm/plot_oneclass.py
+++ b/examples/svm/plot_oneclass.py
@@ -42,25 +42,29 @@
 
 plt.title("Novelty Detection")
 plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
-a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
-plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
+a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred")
+plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred")
 
 s = 40
-b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
-b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
-                 edgecolors='k')
-c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
-                edgecolors='k')
-plt.axis('tight')
+b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k")
+b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k")
+c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k")
+plt.axis("tight")
 plt.xlim((-5, 5))
 plt.ylim((-5, 5))
-plt.legend([a.collections[0], b1, b2, c],
-           ["learned frontier", "training observations",
-            "new regular observations", "new abnormal observations"],
-           loc="upper left",
-           prop=matplotlib.font_manager.FontProperties(size=11))
+plt.legend(
+    [a.collections[0], b1, b2, c],
+    [
+        "learned frontier",
+        "training observations",
+        "new regular observations",
+        "new abnormal observations",
+    ],
+    loc="upper left",
+    prop=matplotlib.font_manager.FontProperties(size=11),
+)
 plt.xlabel(
-    "error train: %d/200 ; errors novel regular: %d/40 ; "
-    "errors novel abnormal: %d/40"
-    % (n_error_train, n_error_test, n_error_outliers))
+    "error train: %d/200 ; errors novel regular: %d/40 ; errors novel abnormal: %d/40"
+    % (n_error_train, n_error_test, n_error_outliers)
+)
 plt.show()
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index eda5a7248c24d..19cae930d93a8 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -1,4 +1,4 @@
-'''
+"""
 ==================
 RBF SVM parameters
 ==================
@@ -73,7 +73,7 @@
 ``gamma_range`` steps will increase the resolution of the hyper-parameter heat
 map.
 
-'''
+"""
 print(__doc__)
 
 import numpy as np
@@ -90,8 +90,8 @@
 # Utility function to move the midpoint of a colormap to be around
 # the values of interest.
 
-class MidpointNormalize(Normalize):
 
+class MidpointNormalize(Normalize):
     def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
         self.midpoint = midpoint
         Normalize.__init__(self, vmin, vmax, clip)
@@ -100,6 +100,7 @@ def __call__(self, value, clip=None):
         x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
         return np.ma.masked_array(np.interp(value, x, y))
 
+
 # #############################################################################
 # Load and prepare data set
 #
@@ -142,8 +143,10 @@ def __call__(self, value, clip=None):
 grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
 grid.fit(X, y)
 
-print("The best parameters are %s with a score of %0.2f"
-      % (grid.best_params_, grid.best_score_))
+print(
+    "The best parameters are %s with a score of %0.2f"
+    % (grid.best_params_, grid.best_score_)
+)
 
 # Now we need to fit a classifier for all parameters in the 2d version
 # (we use a smaller set of parameters here because it takes a while to train)
@@ -171,19 +174,16 @@ def __call__(self, value, clip=None):
 
     # visualize decision function for these parameters
     plt.subplot(len(C_2d_range), len(gamma_2d_range), k + 1)
-    plt.title("gamma=10^%d, C=10^%d" % (np.log10(gamma), np.log10(C)),
-              size='medium')
+    plt.title("gamma=10^%d, C=10^%d" % (np.log10(gamma), np.log10(C)), size="medium")
 
     # visualize parameter's effect on decision function
     plt.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu)
-    plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r,
-                edgecolors='k')
+    plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r, edgecolors="k")
     plt.xticks(())
     plt.yticks(())
-    plt.axis('tight')
+    plt.axis("tight")
 
-scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),
-                                                     len(gamma_range))
+scores = grid.cv_results_["mean_test_score"].reshape(len(C_range), len(gamma_range))
 
 # Draw heatmap of the validation accuracy as a function of gamma and C
 #
@@ -195,13 +195,17 @@ def __call__(self, value, clip=None):
 # the same color.
 
 plt.figure(figsize=(8, 6))
-plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
-plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
-           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
-plt.xlabel('gamma')
-plt.ylabel('C')
+plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95)
+plt.imshow(
+    scores,
+    interpolation="nearest",
+    cmap=plt.cm.hot,
+    norm=MidpointNormalize(vmin=0.2, midpoint=0.92),
+)
+plt.xlabel("gamma")
+plt.ylabel("C")
 plt.colorbar()
 plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
 plt.yticks(np.arange(len(C_range)), C_range)
-plt.title('Validation accuracy')
+plt.title("Validation accuracy")
 plt.show()
diff --git a/examples/svm/plot_separating_hyperplane.py b/examples/svm/plot_separating_hyperplane.py
index cbd61abad53e6..cfb4a195e8a12 100644
--- a/examples/svm/plot_separating_hyperplane.py
+++ b/examples/svm/plot_separating_hyperplane.py
@@ -19,7 +19,7 @@
 X, y = make_blobs(n_samples=40, centers=2, random_state=6)
 
 # fit the model, don't regularize for illustration purposes
-clf = svm.SVC(kernel='linear', C=1000)
+clf = svm.SVC(kernel="linear", C=1000)
 clf.fit(X, y)
 
 plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired)
@@ -37,9 +37,16 @@
 Z = clf.decision_function(xy).reshape(XX.shape)
 
 # plot decision boundary and margins
-ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
-           linestyles=['--', '-', '--'])
+ax.contour(
+    XX, YY, Z, colors="k", levels=[-1, 0, 1], alpha=0.5, linestyles=["--", "-", "--"]
+)
 # plot support vectors
-ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
-           linewidth=1, facecolors='none', edgecolors='k')
+ax.scatter(
+    clf.support_vectors_[:, 0],
+    clf.support_vectors_[:, 1],
+    s=100,
+    linewidth=1,
+    facecolors="none",
+    edgecolors="k",
+)
 plt.show()
diff --git a/examples/svm/plot_separating_hyperplane_unbalanced.py b/examples/svm/plot_separating_hyperplane_unbalanced.py
index 2a0540fead310..a1db23f5f5ca8 100644
--- a/examples/svm/plot_separating_hyperplane_unbalanced.py
+++ b/examples/svm/plot_separating_hyperplane_unbalanced.py
@@ -36,21 +36,24 @@
 n_samples_2 = 100
 centers = [[0.0, 0.0], [2.0, 2.0]]
 clusters_std = [1.5, 0.5]
-X, y = make_blobs(n_samples=[n_samples_1, n_samples_2],
-                  centers=centers,
-                  cluster_std=clusters_std,
-                  random_state=0, shuffle=False)
+X, y = make_blobs(
+    n_samples=[n_samples_1, n_samples_2],
+    centers=centers,
+    cluster_std=clusters_std,
+    random_state=0,
+    shuffle=False,
+)
 
 # fit the model and get the separating hyperplane
-clf = svm.SVC(kernel='linear', C=1.0)
+clf = svm.SVC(kernel="linear", C=1.0)
 clf.fit(X, y)
 
 # fit the model and get the separating hyperplane using weighted classes
-wclf = svm.SVC(kernel='linear', class_weight={1: 10})
+wclf = svm.SVC(kernel="linear", class_weight={1: 10})
 wclf.fit(X, y)
 
 # plot the samples
-plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k')
+plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors="k")
 
 # plot the decision functions for both classifiers
 ax = plt.gca()
@@ -67,14 +70,17 @@
 Z = clf.decision_function(xy).reshape(XX.shape)
 
 # plot decision boundary and margins
-a = ax.contour(XX, YY, Z, colors='k', levels=[0], alpha=0.5, linestyles=['-'])
+a = ax.contour(XX, YY, Z, colors="k", levels=[0], alpha=0.5, linestyles=["-"])
 
 # get the separating hyperplane for weighted classes
 Z = wclf.decision_function(xy).reshape(XX.shape)
 
 # plot decision boundary and margins for weighted classes
-b = ax.contour(XX, YY, Z, colors='r', levels=[0], alpha=0.5, linestyles=['-'])
+b = ax.contour(XX, YY, Z, colors="r", levels=[0], alpha=0.5, linestyles=["-"])
 
-plt.legend([a.collections[0], b.collections[0]], ["non weighted", "weighted"],
-           loc="upper right")
+plt.legend(
+    [a.collections[0], b.collections[0]],
+    ["non weighted", "weighted"],
+    loc="upper right",
+)
 plt.show()
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index b0392b1c00361..3fa7d05240df0 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -30,9 +30,13 @@
 # #############################################################################
 # Create a feature-selection transform, a scaler and an instance of SVM that we
 # combine together to have a full-blown estimator
-clf = Pipeline([('anova', SelectPercentile(chi2)),
-                ('scaler', StandardScaler()),
-                ('svc', SVC(gamma="auto"))])
+clf = Pipeline(
+    [
+        ("anova", SelectPercentile(chi2)),
+        ("scaler", StandardScaler()),
+        ("svc", SVC(gamma="auto")),
+    ]
+)
 
 # #############################################################################
 # Plot the cross-validation score as a function of percentile of features
@@ -47,10 +51,9 @@
     score_stds.append(this_scores.std())
 
 plt.errorbar(percentiles, score_means, np.array(score_stds))
-plt.title(
-    'Performance of the SVM-Anova varying the percentile of features selected')
+plt.title("Performance of the SVM-Anova varying the percentile of features selected")
 plt.xticks(np.linspace(0, 100, 11, endpoint=True))
-plt.xlabel('Percentile')
-plt.ylabel('Accuracy Score')
-plt.axis('tight')
+plt.xlabel("Percentile")
+plt.ylabel("Accuracy Score")
+plt.axis("tight")
 plt.show()
diff --git a/examples/svm/plot_svm_kernels.py b/examples/svm/plot_svm_kernels.py
index dbad4e0b725e2..5d8ef3403797f 100644
--- a/examples/svm/plot_svm_kernels.py
+++ b/examples/svm/plot_svm_kernels.py
@@ -24,30 +24,32 @@
 
 
 # Our dataset and targets
-X = np.c_[(.4, -.7),
-          (-1.5, -1),
-          (-1.4, -.9),
-          (-1.3, -1.2),
-          (-1.1, -.2),
-          (-1.2, -.4),
-          (-.5, 1.2),
-          (-1.5, 2.1),
-          (1, 1),
-          # --
-          (1.3, .8),
-          (1.2, .5),
-          (.2, -2),
-          (.5, -2.4),
-          (.2, -2.3),
-          (0, -2.7),
-          (1.3, 2.1)].T
+X = np.c_[
+    (0.4, -0.7),
+    (-1.5, -1),
+    (-1.4, -0.9),
+    (-1.3, -1.2),
+    (-1.1, -0.2),
+    (-1.2, -0.4),
+    (-0.5, 1.2),
+    (-1.5, 2.1),
+    (1, 1),
+    # --
+    (1.3, 0.8),
+    (1.2, 0.5),
+    (0.2, -2),
+    (0.5, -2.4),
+    (0.2, -2.3),
+    (0, -2.7),
+    (1.3, 2.1),
+].T
 Y = [0] * 8 + [1] * 8
 
 # figure number
 fignum = 1
 
 # fit the model
-for kernel in ('linear', 'poly', 'rbf'):
+for kernel in ("linear", "poly", "rbf"):
     clf = svm.SVC(kernel=kernel, gamma=2)
     clf.fit(X, Y)
 
@@ -55,12 +57,17 @@
     plt.figure(fignum, figsize=(4, 3))
     plt.clf()
 
-    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80,
-                facecolors='none', zorder=10, edgecolors='k')
-    plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired,
-                edgecolors='k')
-
-    plt.axis('tight')
+    plt.scatter(
+        clf.support_vectors_[:, 0],
+        clf.support_vectors_[:, 1],
+        s=80,
+        facecolors="none",
+        zorder=10,
+        edgecolors="k",
+    )
+    plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired, edgecolors="k")
+
+    plt.axis("tight")
     x_min = -3
     x_max = 3
     y_min = -3
@@ -73,8 +80,14 @@
     Z = Z.reshape(XX.shape)
     plt.figure(fignum, figsize=(4, 3))
     plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
-    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
-                levels=[-.5, 0, .5])
+    plt.contour(
+        XX,
+        YY,
+        Z,
+        colors=["k", "k", "k"],
+        linestyles=["--", "-", "--"],
+        levels=[-0.5, 0, 0.5],
+    )
 
     plt.xlim(x_min, x_max)
     plt.ylim(y_min, y_max)
diff --git a/examples/svm/plot_svm_margin.py b/examples/svm/plot_svm_margin.py
index 5b267957677f8..123cfafff68e1 100644
--- a/examples/svm/plot_svm_margin.py
+++ b/examples/svm/plot_svm_margin.py
@@ -36,9 +36,9 @@
 fignum = 1
 
 # fit the model
-for name, penalty in (('unreg', 1), ('reg', 0.05)):
+for name, penalty in (("unreg", 1), ("reg", 0.05)):
 
-    clf = svm.SVC(kernel='linear', C=penalty)
+    clf = svm.SVC(kernel="linear", C=penalty)
     clf.fit(X, Y)
 
     # get the separating hyperplane
@@ -58,17 +58,24 @@
     # plot the line, the points, and the nearest vectors to the plane
     plt.figure(fignum, figsize=(4, 3))
     plt.clf()
-    plt.plot(xx, yy, 'k-')
-    plt.plot(xx, yy_down, 'k--')
-    plt.plot(xx, yy_up, 'k--')
-
-    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80,
-                facecolors='none', zorder=10, edgecolors='k',
-                cmap=cm.get_cmap('RdBu'))
-    plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=cm.get_cmap('RdBu'),
-                edgecolors='k')
-
-    plt.axis('tight')
+    plt.plot(xx, yy, "k-")
+    plt.plot(xx, yy_down, "k--")
+    plt.plot(xx, yy_up, "k--")
+
+    plt.scatter(
+        clf.support_vectors_[:, 0],
+        clf.support_vectors_[:, 1],
+        s=80,
+        facecolors="none",
+        zorder=10,
+        edgecolors="k",
+        cmap=cm.get_cmap("RdBu"),
+    )
+    plt.scatter(
+        X[:, 0], X[:, 1], c=Y, zorder=10, cmap=cm.get_cmap("RdBu"), edgecolors="k"
+    )
+
+    plt.axis("tight")
     x_min = -4.8
     x_max = 4.2
     y_min = -6
@@ -79,8 +86,7 @@
     Z = clf.decision_function(xy).reshape(XX.shape)
 
     # Put the result into a contour plot
-    plt.contourf(XX, YY, Z, cmap=cm.get_cmap('RdBu'),
-                 alpha=0.5, linestyles=['-'])
+    plt.contourf(XX, YY, Z, cmap=cm.get_cmap("RdBu"), alpha=0.5, linestyles=["-"])
 
     plt.xlim(x_min, x_max)
     plt.ylim(y_min, y_max)
diff --git a/examples/svm/plot_svm_nonlinear.py b/examples/svm/plot_svm_nonlinear.py
index 47575d992a63b..09ae1febc8873 100644
--- a/examples/svm/plot_svm_nonlinear.py
+++ b/examples/svm/plot_svm_nonlinear.py
@@ -15,27 +15,29 @@
 import matplotlib.pyplot as plt
 from sklearn import svm
 
-xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
-                     np.linspace(-3, 3, 500))
+xx, yy = np.meshgrid(np.linspace(-3, 3, 500), np.linspace(-3, 3, 500))
 np.random.seed(0)
 X = np.random.randn(300, 2)
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
-clf = svm.NuSVC(gamma='auto')
+clf = svm.NuSVC(gamma="auto")
 clf.fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
 Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
 Z = Z.reshape(xx.shape)
 
-plt.imshow(Z, interpolation='nearest',
-           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
-           origin='lower', cmap=plt.cm.PuOr_r)
-contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
-                       linestyles='dashed')
-plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
-            edgecolors='k')
+plt.imshow(
+    Z,
+    interpolation="nearest",
+    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+    aspect="auto",
+    origin="lower",
+    cmap=plt.cm.PuOr_r,
+)
+contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles="dashed")
+plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors="k")
 plt.xticks(())
 plt.yticks(())
 plt.axis([-3, 3, -3, 3])
diff --git a/examples/svm/plot_svm_regression.py b/examples/svm/plot_svm_regression.py
index a91b588a15f63..ca45951a9f224 100644
--- a/examples/svm/plot_svm_regression.py
+++ b/examples/svm/plot_svm_regression.py
@@ -23,34 +23,52 @@
 
 # #############################################################################
 # Fit regression model
-svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
-svr_lin = SVR(kernel='linear', C=100, gamma='auto')
-svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
-               coef0=1)
+svr_rbf = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)
+svr_lin = SVR(kernel="linear", C=100, gamma="auto")
+svr_poly = SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1)
 
 # #############################################################################
 # Look at the results
 lw = 2
 
 svrs = [svr_rbf, svr_lin, svr_poly]
-kernel_label = ['RBF', 'Linear', 'Polynomial']
-model_color = ['m', 'c', 'g']
+kernel_label = ["RBF", "Linear", "Polynomial"]
+model_color = ["m", "c", "g"]
 
 fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 10), sharey=True)
 for ix, svr in enumerate(svrs):
-    axes[ix].plot(X, svr.fit(X, y).predict(X), color=model_color[ix], lw=lw,
-                  label='{} model'.format(kernel_label[ix]))
-    axes[ix].scatter(X[svr.support_], y[svr.support_], facecolor="none",
-                     edgecolor=model_color[ix], s=50,
-                     label='{} support vectors'.format(kernel_label[ix]))
-    axes[ix].scatter(X[np.setdiff1d(np.arange(len(X)), svr.support_)],
-                     y[np.setdiff1d(np.arange(len(X)), svr.support_)],
-                     facecolor="none", edgecolor="k", s=50,
-                     label='other training data')
-    axes[ix].legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),
-                    ncol=1, fancybox=True, shadow=True)
-
-fig.text(0.5, 0.04, 'data', ha='center', va='center')
-fig.text(0.06, 0.5, 'target', ha='center', va='center', rotation='vertical')
+    axes[ix].plot(
+        X,
+        svr.fit(X, y).predict(X),
+        color=model_color[ix],
+        lw=lw,
+        label="{} model".format(kernel_label[ix]),
+    )
+    axes[ix].scatter(
+        X[svr.support_],
+        y[svr.support_],
+        facecolor="none",
+        edgecolor=model_color[ix],
+        s=50,
+        label="{} support vectors".format(kernel_label[ix]),
+    )
+    axes[ix].scatter(
+        X[np.setdiff1d(np.arange(len(X)), svr.support_)],
+        y[np.setdiff1d(np.arange(len(X)), svr.support_)],
+        facecolor="none",
+        edgecolor="k",
+        s=50,
+        label="other training data",
+    )
+    axes[ix].legend(
+        loc="upper center",
+        bbox_to_anchor=(0.5, 1.1),
+        ncol=1,
+        fancybox=True,
+        shadow=True,
+    )
+
+fig.text(0.5, 0.04, "data", ha="center", va="center")
+fig.text(0.06, 0.5, "target", ha="center", va="center", rotation="vertical")
 fig.suptitle("Support Vector Regression", fontsize=14)
 plt.show()
diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 5208519172824..cc3793fefc7d3 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -100,22 +100,31 @@
 n_features = 300
 
 # l1 data (only 5 informative features)
-X_1, y_1 = datasets.make_classification(n_samples=n_samples,
-                                        n_features=n_features, n_informative=5,
-                                        random_state=1)
+X_1, y_1 = datasets.make_classification(
+    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
+)
 
 # l2 data: non sparse, but less features
-y_2 = np.sign(.5 - rnd.rand(n_samples))
+y_2 = np.sign(0.5 - rnd.rand(n_samples))
 X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
 X_2 += 5 * rnd.randn(n_samples, n_features // 5)
 
-clf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
-                       tol=1e-3),
-             np.logspace(-2.3, -1.3, 10), X_1, y_1),
-            (LinearSVC(penalty='l2', loss='squared_hinge', dual=True),
-             np.logspace(-4.5, -2, 10), X_2, y_2)]
-
-colors = ['navy', 'cyan', 'darkorange']
+clf_sets = [
+    (
+        LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3),
+        np.logspace(-2.3, -1.3, 10),
+        X_1,
+        y_1,
+    ),
+    (
+        LinearSVC(penalty="l2", loss="squared_hinge", dual=True),
+        np.logspace(-4.5, -2, 10),
+        X_2,
+        y_2,
+    ),
+]
+
+colors = ["navy", "cyan", "darkorange"]
 lw = 2
 
 for clf, cs, X, y in clf_sets:
@@ -126,25 +135,36 @@
         param_grid = dict(C=cs)
         # To get nice curve, we need a large number of iterations to
         # reduce the variance
-        grid = GridSearchCV(clf, refit=False, param_grid=param_grid,
-                            cv=ShuffleSplit(train_size=train_size,
-                                            test_size=.3,
-                                            n_splits=250, random_state=1))
+        grid = GridSearchCV(
+            clf,
+            refit=False,
+            param_grid=param_grid,
+            cv=ShuffleSplit(
+                train_size=train_size, test_size=0.3, n_splits=250, random_state=1
+            ),
+        )
         grid.fit(X, y)
-        scores = grid.cv_results_['mean_test_score']
+        scores = grid.cv_results_["mean_test_score"]
 
-        scales = [(1, 'No scaling'),
-                  ((n_samples * train_size), '1/n_samples'),
-                  ]
+        scales = [
+            (1, "No scaling"),
+            ((n_samples * train_size), "1/n_samples"),
+        ]
 
         for ax, (scaler, name) in zip(axes, scales):
-            ax.set_xlabel('C')
-            ax.set_ylabel('CV Score')
+            ax.set_xlabel("C")
+            ax.set_ylabel("CV Score")
             grid_cs = cs * float(scaler)  # scale the C's
-            ax.semilogx(grid_cs, scores, label="fraction %.2f" %
-                        train_size, color=colors[k], lw=lw)
-            ax.set_title('scaling=%s, penalty=%s, loss=%s' %
-                         (name, clf.penalty, clf.loss))
+            ax.semilogx(
+                grid_cs,
+                scores,
+                label="fraction %.2f" % train_size,
+                color=colors[k],
+                lw=lw,
+            )
+            ax.set_title(
+                "scaling=%s, penalty=%s, loss=%s" % (name, clf.penalty, clf.loss)
+            )
 
     plt.legend(loc="best")
 plt.show()
diff --git a/examples/svm/plot_svm_tie_breaking.py b/examples/svm/plot_svm_tie_breaking.py
index 76eabfa1e35be..aea34ac97fd8a 100644
--- a/examples/svm/plot_svm_tie_breaking.py
+++ b/examples/svm/plot_svm_tie_breaking.py
@@ -27,13 +27,13 @@
 X, y = make_blobs(random_state=27)
 
 fig, sub = plt.subplots(2, 1, figsize=(5, 8))
-titles = ("break_ties = False",
-          "break_ties = True")
+titles = ("break_ties = False", "break_ties = True")
 
 for break_ties, title, ax in zip((False, True), titles, sub.flatten()):
 
-    svm = SVC(kernel="linear", C=1, break_ties=break_ties,
-              decision_function_shape='ovr').fit(X, y)
+    svm = SVC(
+        kernel="linear", C=1, break_ties=break_ties, decision_function_shape="ovr"
+    ).fit(X, y)
 
     xlim = [X[:, 0].min(), X[:, 0].max()]
     ylim = [X[:, 1].min(), X[:, 1].max()]
@@ -49,8 +49,12 @@
     points = ax.scatter(X[:, 0], X[:, 1], c=y, cmap="Accent")
     classes = [(0, 1), (0, 2), (1, 2)]
     line = np.linspace(X[:, 1].min() - 5, X[:, 1].max() + 5)
-    ax.imshow(-pred.reshape(xx.shape), cmap="Accent", alpha=.2,
-              extent=(xlim[0], xlim[1], ylim[1], ylim[0]))
+    ax.imshow(
+        -pred.reshape(xx.shape),
+        cmap="Accent",
+        alpha=0.2,
+        extent=(xlim[0], xlim[1], ylim[1], ylim[0]),
+    )
 
     for coef, intercept, col in zip(svm.coef_, svm.intercept_, classes):
         line2 = -(line * coef[1] + intercept) / coef[0]
diff --git a/examples/svm/plot_weighted_samples.py b/examples/svm/plot_weighted_samples.py
index 0549da7a38084..f25390446fc87 100644
--- a/examples/svm/plot_weighted_samples.py
+++ b/examples/svm/plot_weighted_samples.py
@@ -28,10 +28,17 @@ def plot_decision_function(classifier, sample_weight, axis, title):
 
     # plot the line, the points, and the nearest vectors to the plane
     axis.contourf(xx, yy, Z, alpha=0.75, cmap=plt.cm.bone)
-    axis.scatter(X[:, 0], X[:, 1], c=y, s=100 * sample_weight, alpha=0.9,
-                 cmap=plt.cm.bone, edgecolors='black')
+    axis.scatter(
+        X[:, 0],
+        X[:, 1],
+        c=y,
+        s=100 * sample_weight,
+        alpha=0.9,
+        cmap=plt.cm.bone,
+        edgecolors="black",
+    )
 
-    axis.axis('off')
+    axis.axis("off")
     axis.set_title(title)
 
 
@@ -55,9 +62,9 @@ def plot_decision_function(classifier, sample_weight, axis, title):
 clf_no_weights.fit(X, y)
 
 fig, axes = plt.subplots(1, 2, figsize=(14, 6))
-plot_decision_function(clf_no_weights, sample_weight_constant, axes[0],
-                       "Constant weights")
-plot_decision_function(clf_weights, sample_weight_last_ten, axes[1],
-                       "Modified weights")
+plot_decision_function(
+    clf_no_weights, sample_weight_constant, axes[0], "Constant weights"
+)
+plot_decision_function(clf_weights, sample_weight_last_ten, axes[1], "Modified weights")
 
 plt.show()
diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py
index 7f7bc422808dc..5351bb5bef3e3 100644
--- a/examples/text/plot_document_classification_20newsgroups.py
+++ b/examples/text/plot_document_classification_20newsgroups.py
@@ -45,40 +45,60 @@
 
 
 # Display progress logs on stdout
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s %(levelname)s %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 
 op = OptionParser()
-op.add_option("--report",
-              action="store_true", dest="print_report",
-              help="Print a detailed classification report.")
-op.add_option("--chi2_select",
-              action="store", type="int", dest="select_chi2",
-              help="Select some number of features using a chi-squared test")
-op.add_option("--confusion_matrix",
-              action="store_true", dest="print_cm",
-              help="Print the confusion matrix.")
-op.add_option("--top10",
-              action="store_true", dest="print_top10",
-              help="Print ten most discriminative terms per class"
-                   " for every classifier.")
-op.add_option("--all_categories",
-              action="store_true", dest="all_categories",
-              help="Whether to use all categories or not.")
-op.add_option("--use_hashing",
-              action="store_true",
-              help="Use a hashing vectorizer.")
-op.add_option("--n_features",
-              action="store", type=int, default=2 ** 16,
-              help="n_features when using the hashing vectorizer.")
-op.add_option("--filtered",
-              action="store_true",
-              help="Remove newsgroup information that is easily overfit: "
-                   "headers, signatures, and quoting.")
+op.add_option(
+    "--report",
+    action="store_true",
+    dest="print_report",
+    help="Print a detailed classification report.",
+)
+op.add_option(
+    "--chi2_select",
+    action="store",
+    type="int",
+    dest="select_chi2",
+    help="Select some number of features using a chi-squared test",
+)
+op.add_option(
+    "--confusion_matrix",
+    action="store_true",
+    dest="print_cm",
+    help="Print the confusion matrix.",
+)
+op.add_option(
+    "--top10",
+    action="store_true",
+    dest="print_top10",
+    help="Print ten most discriminative terms per class for every classifier.",
+)
+op.add_option(
+    "--all_categories",
+    action="store_true",
+    dest="all_categories",
+    help="Whether to use all categories or not.",
+)
+op.add_option("--use_hashing", action="store_true", help="Use a hashing vectorizer.")
+op.add_option(
+    "--n_features",
+    action="store",
+    type=int,
+    default=2 ** 16,
+    help="n_features when using the hashing vectorizer.",
+)
+op.add_option(
+    "--filtered",
+    action="store_true",
+    help=(
+        "Remove newsgroup information that is easily overfit: "
+        "headers, signatures, and quoting."
+    ),
+)
 
 
 def is_interactive():
-    return not hasattr(sys.modules['__main__'], '__file__')
+    return not hasattr(sys.modules["__main__"], "__file__")
 
 
 # work-around for Jupyter notebook and IPython console
@@ -103,44 +123,44 @@ def is_interactive():
     categories = None
 else:
     categories = [
-        'alt.atheism',
-        'talk.religion.misc',
-        'comp.graphics',
-        'sci.space',
+        "alt.atheism",
+        "talk.religion.misc",
+        "comp.graphics",
+        "sci.space",
     ]
 
 if opts.filtered:
-    remove = ('headers', 'footers', 'quotes')
+    remove = ("headers", "footers", "quotes")
 else:
     remove = ()
 
 print("Loading 20 newsgroups dataset for categories:")
 print(categories if categories else "all")
 
-data_train = fetch_20newsgroups(subset='train', categories=categories,
-                                shuffle=True, random_state=42,
-                                remove=remove)
+data_train = fetch_20newsgroups(
+    subset="train", categories=categories, shuffle=True, random_state=42, remove=remove
+)
 
-data_test = fetch_20newsgroups(subset='test', categories=categories,
-                               shuffle=True, random_state=42,
-                               remove=remove)
-print('data loaded')
+data_test = fetch_20newsgroups(
+    subset="test", categories=categories, shuffle=True, random_state=42, remove=remove
+)
+print("data loaded")
 
 # order of labels in `target_names` can be different from `categories`
 target_names = data_train.target_names
 
 
 def size_mb(docs):
-    return sum(len(s.encode('utf-8')) for s in docs) / 1e6
+    return sum(len(s.encode("utf-8")) for s in docs) / 1e6
 
 
 data_train_size_mb = size_mb(data_train.data)
 data_test_size_mb = size_mb(data_test.data)
 
-print("%d documents - %0.3fMB (training set)" % (
-    len(data_train.data), data_train_size_mb))
-print("%d documents - %0.3fMB (test set)" % (
-    len(data_test.data), data_test_size_mb))
+print(
+    "%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb)
+)
+print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb))
 print("%d categories" % len(target_names))
 print()
 
@@ -150,12 +170,12 @@ def size_mb(docs):
 print("Extracting features from the training data using a sparse vectorizer")
 t0 = time()
 if opts.use_hashing:
-    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
-                                   n_features=opts.n_features)
+    vectorizer = HashingVectorizer(
+        stop_words="english", alternate_sign=False, n_features=opts.n_features
+    )
     X_train = vectorizer.transform(data_train.data)
 else:
-    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
-                                 stop_words='english')
+    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
     X_train = vectorizer.fit_transform(data_train.data)
 duration = time() - t0
 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
@@ -177,8 +197,7 @@ def size_mb(docs):
     feature_names = vectorizer.get_feature_names_out()
 
 if opts.select_chi2:
-    print("Extracting %d best features by a chi-squared test" %
-          opts.select_chi2)
+    print("Extracting %d best features by a chi-squared test" % opts.select_chi2)
     t0 = time()
     ch2 = SelectKBest(chi2, k=opts.select_chi2)
     X_train = ch2.fit_transform(X_train, y_train)
@@ -201,7 +220,7 @@ def trim(s):
 # We train and test the datasets with 15 different classification models
 # and get performance results for each model.
 def benchmark(clf):
-    print('_' * 80)
+    print("_" * 80)
     print("Training: ")
     print(clf)
     t0 = time()
@@ -217,7 +236,7 @@ def benchmark(clf):
     score = metrics.accuracy_score(y_test, pred)
     print("accuracy:   %0.3f" % score)
 
-    if hasattr(clf, 'coef_'):
+    if hasattr(clf, "coef_"):
         print("dimensionality: %d" % clf.coef_.shape[1])
         print("density: %f" % density(clf.coef_))
 
@@ -230,67 +249,74 @@ def benchmark(clf):
 
     if opts.print_report:
         print("classification report:")
-        print(metrics.classification_report(y_test, pred,
-                                            target_names=target_names))
+        print(metrics.classification_report(y_test, pred, target_names=target_names))
 
     if opts.print_cm:
         print("confusion matrix:")
         print(metrics.confusion_matrix(y_test, pred))
 
     print()
-    clf_descr = str(clf).split('(')[0]
+    clf_descr = str(clf).split("(")[0]
     return clf_descr, score, train_time, test_time
 
 
 results = []
 for clf, name in (
-        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
-        (Perceptron(max_iter=50), "Perceptron"),
-        (PassiveAggressiveClassifier(max_iter=50),
-         "Passive-Aggressive"),
-        (KNeighborsClassifier(n_neighbors=10), "kNN"),
-        (RandomForestClassifier(), "Random forest")):
-    print('=' * 80)
+    (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
+    (Perceptron(max_iter=50), "Perceptron"),
+    (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
+    (KNeighborsClassifier(n_neighbors=10), "kNN"),
+    (RandomForestClassifier(), "Random forest"),
+):
+    print("=" * 80)
     print(name)
     results.append(benchmark(clf))
 
 for penalty in ["l2", "l1"]:
-    print('=' * 80)
+    print("=" * 80)
     print("%s penalty" % penalty.upper())
     # Train Liblinear model
-    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
-                                       tol=1e-3)))
+    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
 
     # Train SGD model
-    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
-                                           penalty=penalty)))
+    results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty)))
 
 # Train SGD with Elastic Net penalty
-print('=' * 80)
+print("=" * 80)
 print("Elastic-Net penalty")
-results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
-                                       penalty="elasticnet")))
+results.append(
+    benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet"))
+)
 
 # Train NearestCentroid without threshold
-print('=' * 80)
+print("=" * 80)
 print("NearestCentroid (aka Rocchio classifier)")
 results.append(benchmark(NearestCentroid()))
 
 # Train sparse Naive Bayes classifiers
-print('=' * 80)
+print("=" * 80)
 print("Naive Bayes")
-results.append(benchmark(MultinomialNB(alpha=.01)))
-results.append(benchmark(BernoulliNB(alpha=.01)))
-results.append(benchmark(ComplementNB(alpha=.1)))
+results.append(benchmark(MultinomialNB(alpha=0.01)))
+results.append(benchmark(BernoulliNB(alpha=0.01)))
+results.append(benchmark(ComplementNB(alpha=0.1)))
 
-print('=' * 80)
+print("=" * 80)
 print("LinearSVC with L1-based feature selection")
 # The smaller C, the stronger the regularization.
 # The more regularization, the more sparsity.
-results.append(benchmark(Pipeline([
-  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
-                                                  tol=1e-3))),
-  ('classification', LinearSVC(penalty="l2"))])))
+results.append(
+    benchmark(
+        Pipeline(
+            [
+                (
+                    "feature_selection",
+                    SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
+                ),
+                ("classification", LinearSVC(penalty="l2")),
+            ]
+        )
+    )
+)
 
 
 # %%
@@ -308,17 +334,16 @@ def benchmark(clf):
 
 plt.figure(figsize=(12, 8))
 plt.title("Score")
-plt.barh(indices, score, .2, label="score", color='navy')
-plt.barh(indices + .3, training_time, .2, label="training time",
-         color='c')
-plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
+plt.barh(indices, score, 0.2, label="score", color="navy")
+plt.barh(indices + 0.3, training_time, 0.2, label="training time", color="c")
+plt.barh(indices + 0.6, test_time, 0.2, label="test time", color="darkorange")
 plt.yticks(())
-plt.legend(loc='best')
-plt.subplots_adjust(left=.25)
-plt.subplots_adjust(top=.95)
-plt.subplots_adjust(bottom=.05)
+plt.legend(loc="best")
+plt.subplots_adjust(left=0.25)
+plt.subplots_adjust(top=0.95)
+plt.subplots_adjust(bottom=0.05)
 
 for i, c in zip(indices, clf_names):
-    plt.text(-.3, i, c)
+    plt.text(-0.3, i, c)
 
 plt.show()
diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py
index 128138681bc72..832dabb6ba067 100644
--- a/examples/text/plot_document_clustering.py
+++ b/examples/text/plot_document_clustering.py
@@ -72,36 +72,56 @@
 
 
 # Display progress logs on stdout
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s %(levelname)s %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 
 # parse commandline arguments
 op = OptionParser()
-op.add_option("--lsa",
-              dest="n_components", type="int",
-              help="Preprocess documents with latent semantic analysis.")
-op.add_option("--no-minibatch",
-              action="store_false", dest="minibatch", default=True,
-              help="Use ordinary k-means algorithm (in batch mode).")
-op.add_option("--no-idf",
-              action="store_false", dest="use_idf", default=True,
-              help="Disable Inverse Document Frequency feature weighting.")
-op.add_option("--use-hashing",
-              action="store_true", default=False,
-              help="Use a hashing feature vectorizer")
-op.add_option("--n-features", type=int, default=10000,
-              help="Maximum number of features (dimensions)"
-                   " to extract from text.")
-op.add_option("--verbose",
-              action="store_true", dest="verbose", default=False,
-              help="Print progress reports inside k-means algorithm.")
+op.add_option(
+    "--lsa",
+    dest="n_components",
+    type="int",
+    help="Preprocess documents with latent semantic analysis.",
+)
+op.add_option(
+    "--no-minibatch",
+    action="store_false",
+    dest="minibatch",
+    default=True,
+    help="Use ordinary k-means algorithm (in batch mode).",
+)
+op.add_option(
+    "--no-idf",
+    action="store_false",
+    dest="use_idf",
+    default=True,
+    help="Disable Inverse Document Frequency feature weighting.",
+)
+op.add_option(
+    "--use-hashing",
+    action="store_true",
+    default=False,
+    help="Use a hashing feature vectorizer",
+)
+op.add_option(
+    "--n-features",
+    type=int,
+    default=10000,
+    help="Maximum number of features (dimensions) to extract from text.",
+)
+op.add_option(
+    "--verbose",
+    action="store_true",
+    dest="verbose",
+    default=False,
+    help="Print progress reports inside k-means algorithm.",
+)
 
 print(__doc__)
 op.print_help()
 
 
 def is_interactive():
-    return not hasattr(sys.modules['__main__'], '__file__')
+    return not hasattr(sys.modules["__main__"], "__file__")
 
 
 # work-around for Jupyter notebook and IPython console
@@ -115,10 +135,10 @@ def is_interactive():
 # #############################################################################
 # Load some categories from the training set
 categories = [
-    'alt.atheism',
-    'talk.religion.misc',
-    'comp.graphics',
-    'sci.space',
+    "alt.atheism",
+    "talk.religion.misc",
+    "comp.graphics",
+    "sci.space",
 ]
 # Uncomment the following to do the analysis on all the categories
 # categories = None
@@ -126,8 +146,9 @@ def is_interactive():
 print("Loading 20 newsgroups dataset for categories:")
 print(categories)
 
-dataset = fetch_20newsgroups(subset='all', categories=categories,
-                             shuffle=True, random_state=42)
+dataset = fetch_20newsgroups(
+    subset="all", categories=categories, shuffle=True, random_state=42
+)
 
 print("%d documents" % len(dataset.data))
 print("%d categories" % len(dataset.target_names))
@@ -136,24 +157,33 @@ def is_interactive():
 labels = dataset.target
 true_k = np.unique(labels).shape[0]
 
-print("Extracting features from the training dataset "
-      "using a sparse vectorizer")
+print("Extracting features from the training dataset using a sparse vectorizer")
 t0 = time()
 if opts.use_hashing:
     if opts.use_idf:
         # Perform an IDF normalization on the output of HashingVectorizer
-        hasher = HashingVectorizer(n_features=opts.n_features,
-                                   stop_words='english', alternate_sign=False,
-                                   norm=None)
+        hasher = HashingVectorizer(
+            n_features=opts.n_features,
+            stop_words="english",
+            alternate_sign=False,
+            norm=None,
+        )
         vectorizer = make_pipeline(hasher, TfidfTransformer())
     else:
-        vectorizer = HashingVectorizer(n_features=opts.n_features,
-                                       stop_words='english',
-                                       alternate_sign=False, norm='l2')
+        vectorizer = HashingVectorizer(
+            n_features=opts.n_features,
+            stop_words="english",
+            alternate_sign=False,
+            norm="l2",
+        )
 else:
-    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
-                                 min_df=2, stop_words='english',
-                                 use_idf=opts.use_idf)
+    vectorizer = TfidfVectorizer(
+        max_df=0.5,
+        max_features=opts.n_features,
+        min_df=2,
+        stop_words="english",
+        use_idf=opts.use_idf,
+    )
 X = vectorizer.fit_transform(dataset.data)
 
 print("done in %fs" % (time() - t0))
@@ -175,8 +205,9 @@ def is_interactive():
     print("done in %fs" % (time() - t0))
 
     explained_variance = svd.explained_variance_ratio_.sum()
-    print("Explained variance of the SVD step: {}%".format(
-        int(explained_variance * 100)))
+    print(
+        "Explained variance of the SVD step: {}%".format(int(explained_variance * 100))
+    )
 
     print()
 
@@ -185,11 +216,22 @@ def is_interactive():
 # Do the actual clustering
 
 if opts.minibatch:
-    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
-                         init_size=1000, batch_size=1000, verbose=opts.verbose)
+    km = MiniBatchKMeans(
+        n_clusters=true_k,
+        init="k-means++",
+        n_init=1,
+        init_size=1000,
+        batch_size=1000,
+        verbose=opts.verbose,
+    )
 else:
-    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
-                verbose=opts.verbose)
+    km = KMeans(
+        n_clusters=true_k,
+        init="k-means++",
+        max_iter=100,
+        n_init=1,
+        verbose=opts.verbose,
+    )
 
 print("Clustering sparse data with %s" % km)
 t0 = time()
@@ -200,10 +242,11 @@ def is_interactive():
 print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
 print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
 print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
-print("Adjusted Rand-Index: %.3f"
-      % metrics.adjusted_rand_score(labels, km.labels_))
-print("Silhouette Coefficient: %0.3f"
-      % metrics.silhouette_score(X, km.labels_, sample_size=1000))
+print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
+print(
+    "Silhouette Coefficient: %0.3f"
+    % metrics.silhouette_score(X, km.labels_, sample_size=1000)
+)
 
 print()
 
@@ -219,7 +262,7 @@ def is_interactive():
 
     terms = vectorizer.get_feature_names_out()
     for i in range(true_k):
-        print("Cluster %d:" % i, end='')
+        print("Cluster %d:" % i, end="")
         for ind in order_centroids[i, :10]:
-            print(' %s' % terms[ind], end='')
+            print(" %s" % terms[ind], end="")
         print()
diff --git a/examples/text/plot_hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py
index 1cf5c0aa6a0ce..1284fb7f164cd 100644
--- a/examples/text/plot_hashing_vs_dict_vectorizer.py
+++ b/examples/text/plot_hashing_vs_dict_vectorizer.py
@@ -51,13 +51,13 @@ def token_freqs(doc):
 
 
 categories = [
-    'alt.atheism',
-    'comp.graphics',
-    'comp.sys.ibm.pc.hardware',
-    'misc.forsale',
-    'rec.autos',
-    'sci.space',
-    'talk.religion.misc',
+    "alt.atheism",
+    "comp.graphics",
+    "comp.sys.ibm.pc.hardware",
+    "misc.forsale",
+    "rec.autos",
+    "sci.space",
+    "talk.religion.misc",
 ]
 # Uncomment the following line to use a larger set (11k+ documents)
 # categories = None
@@ -77,9 +77,8 @@ def token_freqs(doc):
 
 
 print("Loading 20 newsgroups training data")
-raw_data, _ = fetch_20newsgroups(subset='train', categories=categories,
-                                 return_X_y=True)
-data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
+raw_data, _ = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True)
+data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
 print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))
 print()
 
diff --git a/examples/tree/plot_cost_complexity_pruning.py b/examples/tree/plot_cost_complexity_pruning.py
index 822d7a206f842..9f317cc96b4fa 100644
--- a/examples/tree/plot_cost_complexity_pruning.py
+++ b/examples/tree/plot_cost_complexity_pruning.py
@@ -45,7 +45,7 @@
 # In the following plot, the maximum effective alpha value is removed, because
 # it is the trivial tree with only one node.
 fig, ax = plt.subplots()
-ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
+ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
 ax.set_xlabel("effective alpha")
 ax.set_ylabel("total impurity of leaves")
 ax.set_title("Total Impurity vs effective alpha for training set")
@@ -59,8 +59,11 @@
     clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
     clf.fit(X_train, y_train)
     clfs.append(clf)
-print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
-      clfs[-1].tree_.node_count, ccp_alphas[-1]))
+print(
+    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
+        clfs[-1].tree_.node_count, ccp_alphas[-1]
+    )
+)
 
 # %%
 # For the remainder of this example, we remove the last element in
@@ -73,11 +76,11 @@
 node_counts = [clf.tree_.node_count for clf in clfs]
 depth = [clf.tree_.max_depth for clf in clfs]
 fig, ax = plt.subplots(2, 1)
-ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
+ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
 ax[0].set_xlabel("alpha")
 ax[0].set_ylabel("number of nodes")
 ax[0].set_title("Number of nodes vs alpha")
-ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
+ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
 ax[1].set_xlabel("alpha")
 ax[1].set_ylabel("depth of tree")
 ax[1].set_title("Depth vs alpha")
@@ -98,9 +101,7 @@
 ax.set_xlabel("alpha")
 ax.set_ylabel("accuracy")
 ax.set_title("Accuracy vs alpha for training and testing sets")
-ax.plot(ccp_alphas, train_scores, marker='o', label="train",
-        drawstyle="steps-post")
-ax.plot(ccp_alphas, test_scores, marker='o', label="test",
-        drawstyle="steps-post")
+ax.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
+ax.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
 ax.legend()
 plt.show()
diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py
index 60328c4f90d4f..7dbe203163de2 100644
--- a/examples/tree/plot_iris_dtc.py
+++ b/examples/tree/plot_iris_dtc.py
@@ -30,8 +30,7 @@
 # Load data
 iris = load_iris()
 
-for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
-                                [1, 2], [1, 3], [2, 3]]):
+for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
     # We only take the two corresponding features
     X = iris.data[:, pair]
     y = iris.target
@@ -44,8 +43,9 @@
 
     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
-    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
-                         np.arange(y_min, y_max, plot_step))
+    xx, yy = np.meshgrid(
+        np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
+    )
     plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
 
     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
@@ -58,11 +58,18 @@
     # Plot the training points
     for i, color in zip(range(n_classes), plot_colors):
         idx = np.where(y == i)
-        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
-                    cmap=plt.cm.RdYlBu, edgecolor='black', s=15)
+        plt.scatter(
+            X[idx, 0],
+            X[idx, 1],
+            c=color,
+            label=iris.target_names[i],
+            cmap=plt.cm.RdYlBu,
+            edgecolor="black",
+            s=15,
+        )
 
 plt.suptitle("Decision surface of a decision tree using paired features")
-plt.legend(loc='lower right', borderpad=0, handletextpad=0)
+plt.legend(loc="lower right", borderpad=0, handletextpad=0)
 plt.axis("tight")
 
 plt.figure()
diff --git a/examples/tree/plot_tree_regression.py b/examples/tree/plot_tree_regression.py
index 717de4ab72e4e..7a71b0450f2b7 100644
--- a/examples/tree/plot_tree_regression.py
+++ b/examples/tree/plot_tree_regression.py
@@ -39,10 +39,8 @@
 
 # Plot the results
 plt.figure()
-plt.scatter(X, y, s=20, edgecolor="black",
-            c="darkorange", label="data")
-plt.plot(X_test, y_1, color="cornflowerblue",
-         label="max_depth=2", linewidth=2)
+plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data")
+plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
 plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
 plt.xlabel("data")
 plt.ylabel("target")
diff --git a/examples/tree/plot_tree_regression_multioutput.py b/examples/tree/plot_tree_regression_multioutput.py
index b47bfcd80e49a..ab9a530b0faee 100644
--- a/examples/tree/plot_tree_regression_multioutput.py
+++ b/examples/tree/plot_tree_regression_multioutput.py
@@ -24,7 +24,7 @@
 rng = np.random.RandomState(1)
 X = np.sort(200 * rng.rand(100, 1) - 100, axis=0)
 y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
-y[::5, :] += (0.5 - rng.rand(20, 2))
+y[::5, :] += 0.5 - rng.rand(20, 2)
 
 # Fit regression model
 regr_1 = DecisionTreeRegressor(max_depth=2)
@@ -43,14 +43,19 @@
 # Plot the results
 plt.figure()
 s = 25
-plt.scatter(y[:, 0], y[:, 1], c="navy", s=s,
-            edgecolor="black", label="data")
-plt.scatter(y_1[:, 0], y_1[:, 1], c="cornflowerblue", s=s,
-            edgecolor="black", label="max_depth=2")
-plt.scatter(y_2[:, 0], y_2[:, 1], c="red", s=s,
-            edgecolor="black", label="max_depth=5")
-plt.scatter(y_3[:, 0], y_3[:, 1], c="orange", s=s,
-            edgecolor="black", label="max_depth=8")
+plt.scatter(y[:, 0], y[:, 1], c="navy", s=s, edgecolor="black", label="data")
+plt.scatter(
+    y_1[:, 0],
+    y_1[:, 1],
+    c="cornflowerblue",
+    s=s,
+    edgecolor="black",
+    label="max_depth=2",
+)
+plt.scatter(y_2[:, 0], y_2[:, 1], c="red", s=s, edgecolor="black", label="max_depth=5")
+plt.scatter(
+    y_3[:, 0], y_3[:, 1], c="orange", s=s, edgecolor="black", label="max_depth=8"
+)
 plt.xlim([-6, 6])
 plt.ylim([-6, 6])
 plt.xlabel("target 1")
diff --git a/examples/tree/plot_unveil_tree_structure.py b/examples/tree/plot_unveil_tree_structure.py
index 81f67c615c94c..65aa685463bac 100644
--- a/examples/tree/plot_unveil_tree_structure.py
+++ b/examples/tree/plot_unveil_tree_structure.py
@@ -92,22 +92,30 @@
     else:
         is_leaves[node_id] = True
 
-print("The binary tree structure has {n} nodes and has "
-      "the following tree structure:\n".format(n=n_nodes))
+print(
+    "The binary tree structure has {n} nodes and has "
+    "the following tree structure:\n".format(n=n_nodes)
+)
 for i in range(n_nodes):
     if is_leaves[i]:
-        print("{space}node={node} is a leaf node.".format(
-            space=node_depth[i] * "\t", node=i))
+        print(
+            "{space}node={node} is a leaf node.".format(
+                space=node_depth[i] * "\t", node=i
+            )
+        )
     else:
-        print("{space}node={node} is a split node: "
-              "go to node {left} if X[:, {feature}] <= {threshold} "
-              "else to node {right}.".format(
-                  space=node_depth[i] * "\t",
-                  node=i,
-                  left=children_left[i],
-                  feature=feature[i],
-                  threshold=threshold[i],
-                  right=children_right[i]))
+        print(
+            "{space}node={node} is a split node: "
+            "go to node {left} if X[:, {feature}] <= {threshold} "
+            "else to node {right}.".format(
+                space=node_depth[i] * "\t",
+                node=i,
+                left=children_left[i],
+                feature=feature[i],
+                threshold=threshold[i],
+                right=children_right[i],
+            )
+        )
 
 ##############################################################################
 # We can compare the above output to the plot of the decision tree.
@@ -139,29 +147,33 @@
 
 sample_id = 0
 # obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id`
-node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
-                                    node_indicator.indptr[sample_id + 1]]
+node_index = node_indicator.indices[
+    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
+]
 
-print('Rules used to predict sample {id}:\n'.format(id=sample_id))
+print("Rules used to predict sample {id}:\n".format(id=sample_id))
 for node_id in node_index:
     # continue to the next node if it is a leaf node
     if leaf_id[sample_id] == node_id:
         continue
 
     # check if value of the split feature for sample 0 is below threshold
-    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
+    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:
         threshold_sign = "<="
     else:
         threshold_sign = ">"
 
-    print("decision node {node} : (X_test[{sample}, {feature}] = {value}) "
-          "{inequality} {threshold})".format(
-              node=node_id,
-              sample=sample_id,
-              feature=feature[node_id],
-              value=X_test[sample_id, feature[node_id]],
-              inequality=threshold_sign,
-              threshold=threshold[node_id]))
+    print(
+        "decision node {node} : (X_test[{sample}, {feature}] = {value}) "
+        "{inequality} {threshold})".format(
+            node=node_id,
+            sample=sample_id,
+            feature=feature[node_id],
+            value=X_test[sample_id, feature[node_id]],
+            inequality=threshold_sign,
+            threshold=threshold[node_id],
+        )
+    )
 
 ##############################################################################
 # For a group of samples, we can determine the common nodes the samples go
@@ -169,12 +181,13 @@
 
 sample_ids = [0, 1]
 # boolean array indicating the nodes both samples go through
-common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
-                len(sample_ids))
+common_nodes = node_indicator.toarray()[sample_ids].sum(axis=0) == len(sample_ids)
 # obtain node ids using position in array
 common_node_id = np.arange(n_nodes)[common_nodes]
 
-print("\nThe following samples {samples} share the node(s) {nodes} in the "
-      "tree.".format(samples=sample_ids, nodes=common_node_id))
-print("This is {prop}% of all nodes.".format(
-    prop=100 * len(common_node_id) / n_nodes))
+print(
+    "\nThe following samples {samples} share the node(s) {nodes} in the tree.".format(
+        samples=sample_ids, nodes=common_node_id
+    )
+)
+print("This is {prop}% of all nodes.".format(prop=100 * len(common_node_id) / n_nodes))
diff --git a/pyproject.toml b/pyproject.toml
index 71da52002cc96..3762d2f229f76 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,6 @@ exclude = '''
   | \.git          # root of the project
   | \.mypy_cache
   | \.vscode
-  | examples
   | build
   | dist
   | doc/tutorial

From 12f46cc51865a7514b2793034e25896f13c10bb7 Mon Sep 17 00:00:00 2001
From: Pinky <pinky00ch@gmail.com>
Date: Thu, 7 Oct 2021 16:02:10 +0530
Subject: [PATCH 11/18] DOC Ensures that SplineTransformer passes numpydoc
 validation (#21248)

* Remove SplineTransformer from DOCSTRING_IGNORE_LIST

* Fix numpydocs from SplineTransformer
---
 maint_tools/test_docstrings.py       | 1 -
 sklearn/preprocessing/_polynomial.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py
index 3388b7635d214..35ed4c515dd81 100644
--- a/maint_tools/test_docstrings.py
+++ b/maint_tools/test_docstrings.py
@@ -28,7 +28,6 @@
     "SpectralBiclustering",
     "SpectralCoclustering",
     "SpectralEmbedding",
-    "SplineTransformer",
     "StackingRegressor",
     "TransformedTargetRegressor",
 ]
diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py
index 66d5a06773077..dbe78c0967582 100644
--- a/sklearn/preprocessing/_polynomial.py
+++ b/sklearn/preprocessing/_polynomial.py
@@ -693,6 +693,7 @@ def get_feature_names(self, input_features=None):
         Returns
         -------
         output_feature_names : list of str of shape (n_output_features,)
+            Transformed feature names.
         """
         n_splines = self.bsplines_[0].c.shape[0]
         if input_features is None:

From e11e8208a977e6498ca7c2aec893808f9119f729 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 7 Oct 2021 10:19:55 -0400
Subject: [PATCH 12/18] BLD Fixes osx build by downgrading to 11.X (#21227)

---
 build_tools/github/build_wheels.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/github/build_wheels.sh b/build_tools/github/build_wheels.sh
index c484513b648e3..2671c50f66bb3 100755
--- a/build_tools/github/build_wheels.sh
+++ b/build_tools/github/build_wheels.sh
@@ -12,7 +12,7 @@ if [[ "$RUNNER_OS" == "macOS" ]]; then
     # supported macos version is: High Sierra / 10.13. When upgrading this, be
     # sure to update the MACOSX_DEPLOYMENT_TARGET environment variable in
     # wheels.yml accordingly. Note that Darwin_17 == High Sierra / 10.13.
-    wget https://packages.macports.org/libomp/libomp-12.0.0_0+universal.darwin_17.i386-x86_64.tbz2 -O libomp.tbz2
+    wget https://packages.macports.org/libomp/libomp-11.0.1_0+universal.darwin_17.i386-x86_64.tbz2 -O libomp.tbz2
     sudo tar -C / -xvjf libomp.tbz2 opt
 
     export CC=/usr/bin/clang

From ff7d9c6f2cdeb33310aff53e9bced27a6d5bea21 Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 7 Oct 2021 17:39:15 +0200
Subject: [PATCH 13/18] DOC Cross-link check_estimator and
 parametrize_with_checks (#21269)

---
 sklearn/utils/estimator_checks.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 37537bc1b0498..ccc6ff23ed8fc 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -472,6 +472,10 @@ def parametrize_with_checks(estimators):
     -------
     decorator : `pytest.mark.parametrize`
 
+    See Also
+    --------
+    check_estimator : Check if estimator adheres to scikit-learn conventions.
+
     Examples
     --------
     >>> from sklearn.utils.estimator_checks import parametrize_with_checks
@@ -547,6 +551,11 @@ def check_estimator(Estimator, generate_only=False):
     checks_generator : generator
         Generator that yields (estimator, check) tuples. Returned when
         `generate_only=True`.
+
+    See Also
+    --------
+    parametrize_with_checks : Pytest-specific decorator for parametrizing estimator
+        checks.
     """
     if isinstance(Estimator, type):
         msg = (

From 39fd93f84ff09e0e3c7d1f3bbee0919d5b4f80a6 Mon Sep 17 00:00:00 2001
From: Shao Yang Hong <hongsy2006@gmail.com>
Date: Thu, 7 Oct 2021 23:42:58 +0800
Subject: [PATCH 14/18] DOC Clarify use_idf in TfidfTransformer/TfidfVectorizer
 docstrings (#21213)

---
 sklearn/feature_extraction/text.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 8dd743813fa27..82582de8a5b60 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1504,7 +1504,7 @@ class TfidfTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
           See :func:`preprocessing.normalize`.
 
     use_idf : bool, default=True
-        Enable inverse-document-frequency reweighting.
+        Enable inverse-document-frequency reweighting. If False, idf(t) = 1.
 
     smooth_idf : bool, default=True
         Smooth idf weights by adding one to document frequencies, as if an
@@ -1842,7 +1842,7 @@ class TfidfVectorizer(CountVectorizer):
           See :func:`preprocessing.normalize`.
 
     use_idf : bool, default=True
-        Enable inverse-document-frequency reweighting.
+        Enable inverse-document-frequency reweighting. If False, idf(t) = 1.
 
     smooth_idf : bool, default=True
         Smooth idf weights by adding one to document frequencies, as if an

From 4abc00bfd34731ee0f538d7299276b38b7ac018f Mon Sep 17 00:00:00 2001
From: Juan Martin Loyola <jmloyola@outlook.com>
Date: Fri, 8 Oct 2021 06:14:11 -0300
Subject: [PATCH 15/18] DOC Ensures that SelfTrainingClassifier passes numpydoc
 validation (#21277)

* Remove SelfTrainingClassifier from DOCSTRING_IGNORE_LIST

* Fix numpydocs from SelfTrainingClassifier

* Change docstrings to maintain consistency
---
 maint_tools/test_docstrings.py            |  1 -
 sklearn/semi_supervised/_self_training.py | 55 ++++++++++++-----------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py
index 35ed4c515dd81..bb73b3ad3e22f 100644
--- a/maint_tools/test_docstrings.py
+++ b/maint_tools/test_docstrings.py
@@ -23,7 +23,6 @@
     "PassiveAggressiveClassifier",
     "PassiveAggressiveRegressor",
     "QuadraticDiscriminantAnalysis",
-    "SelfTrainingClassifier",
     "SparseRandomProjection",
     "SpectralBiclustering",
     "SpectralCoclustering",
diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py
index ad627c6f98574..71d2a7b32236b 100644
--- a/sklearn/semi_supervised/_self_training.py
+++ b/sklearn/semi_supervised/_self_training.py
@@ -37,30 +37,30 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator):
     Parameters
     ----------
     base_estimator : estimator object
-        An estimator object implementing ``fit`` and ``predict_proba``.
-        Invoking the ``fit`` method will fit a clone of the passed estimator,
-        which will be stored in the ``base_estimator_`` attribute.
+        An estimator object implementing `fit` and `predict_proba`.
+        Invoking the `fit` method will fit a clone of the passed estimator,
+        which will be stored in the `base_estimator_` attribute.
 
     threshold : float, default=0.75
         The decision threshold for use with `criterion='threshold'`.
-        Should be in [0, 1). When using the 'threshold' criterion, a
+        Should be in [0, 1). When using the `'threshold'` criterion, a
         :ref:`well calibrated classifier <calibration>` should be used.
 
     criterion : {'threshold', 'k_best'}, default='threshold'
         The selection criterion used to select which labels to add to the
-        training set. If 'threshold', pseudo-labels with prediction
-        probabilities above `threshold` are added to the dataset. If 'k_best',
+        training set. If `'threshold'`, pseudo-labels with prediction
+        probabilities above `threshold` are added to the dataset. If `'k_best'`,
         the `k_best` pseudo-labels with highest prediction probabilities are
         added to the dataset. When using the 'threshold' criterion, a
         :ref:`well calibrated classifier <calibration>` should be used.
 
     k_best : int, default=10
         The amount of samples to add in each iteration. Only used when
-        `criterion` is k_best'.
+        `criterion='k_best'`.
 
     max_iter : int or None, default=10
         Maximum number of iterations allowed. Should be greater than or equal
-        to 0. If it is ``None``, the classifier will continue to predict labels
+        to 0. If it is `None`, the classifier will continue to predict labels
         until no new pseudo-labels are added, or all unlabeled samples have
         been labeled.
 
@@ -74,7 +74,7 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator):
 
     classes_ : ndarray or list of ndarray of shape (n_classes,)
         Class labels for each output. (Taken from the trained
-        ``base_estimator_``).
+        `base_estimator_`).
 
     transduction_ : ndarray of shape (n_samples,)
         The labels used for the final fit of the classifier, including
@@ -104,11 +104,24 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator):
     termination_condition_ : {'max_iter', 'no_change', 'all_labeled'}
         The reason that fitting was stopped.
 
-        - 'max_iter': `n_iter_` reached `max_iter`.
-        - 'no_change': no new labels were predicted.
-        - 'all_labeled': all unlabeled samples were labeled before `max_iter`
+        - `'max_iter'`: `n_iter_` reached `max_iter`.
+        - `'no_change'`: no new labels were predicted.
+        - `'all_labeled'`: all unlabeled samples were labeled before `max_iter`
           was reached.
 
+    See Also
+    --------
+    LabelPropagation : Label propagation classifier.
+    LabelSpreading : Label spreading model for semi-supervised learning.
+
+    References
+    ----------
+    David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling
+    supervised methods. In Proceedings of the 33rd annual meeting on
+    Association for Computational Linguistics (ACL '95). Association for
+    Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI:
+    https://doi.org/10.3115/981658.981684
+
     Examples
     --------
     >>> import numpy as np
@@ -123,14 +136,6 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator):
     >>> self_training_model = SelfTrainingClassifier(svc)
     >>> self_training_model.fit(iris.data, iris.target)
     SelfTrainingClassifier(...)
-
-    References
-    ----------
-    David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling
-    supervised methods. In Proceedings of the 33rd annual meeting on
-    Association for Computational Linguistics (ACL '95). Association for
-    Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI:
-    https://doi.org/10.3115/981658.981684
     """
 
     _estimator_type = "classifier"
@@ -153,7 +158,7 @@ def __init__(
 
     def fit(self, X, y):
         """
-        Fits this ``SelfTrainingClassifier`` to a dataset.
+        Fit self-training classifier using `X`, `y` as training data.
 
         Parameters
         ----------
@@ -167,7 +172,7 @@ def fit(self, X, y):
         Returns
         -------
         self : object
-            Returns an instance of self.
+            Fitted estimator.
         """
         # we need row slicing support for sparce matrices, but costly finiteness check
         # can be delegated to the base estimator.
@@ -281,7 +286,7 @@ def fit(self, X, y):
 
     @if_delegate_has_method(delegate="base_estimator")
     def predict(self, X):
-        """Predict the classes of X.
+        """Predict the classes of `X`.
 
         Parameters
         ----------
@@ -326,7 +331,7 @@ def predict_proba(self, X):
 
     @if_delegate_has_method(delegate="base_estimator")
     def decision_function(self, X):
-        """Calls decision function of the `base_estimator`.
+        """Call decision function of the `base_estimator`.
 
         Parameters
         ----------
@@ -372,7 +377,7 @@ def predict_log_proba(self, X):
 
     @if_delegate_has_method(delegate="base_estimator")
     def score(self, X, y):
-        """Calls score on the `base_estimator`.
+        """Call score on the `base_estimator`.
 
         Parameters
         ----------

From eb2b5fa767a49bf056f6ffdd253b8bea9d4328ff Mon Sep 17 00:00:00 2001
From: Dimitri Papadopoulos Orfanos
 <3234522+DimitriPapadopoulos@users.noreply.github.com>
Date: Fri, 8 Oct 2021 13:55:15 +0200
Subject: [PATCH 16/18] DOC Remove some str/unicode leftovers from Python 2
 (#21270)

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/datasets/_base.py                    | 4 ++--
 sklearn/datasets/_california_housing.py      | 2 +-
 sklearn/datasets/_lfw.py                     | 4 ++--
 sklearn/datasets/_svmlight_format_io.py      | 5 ++---
 sklearn/datasets/_twenty_newsgroups.py       | 2 +-
 sklearn/feature_extraction/text.py           | 8 ++++----
 sklearn/linear_model/_base.py                | 2 +-
 sklearn/linear_model/_stochastic_gradient.py | 2 +-
 sklearn/metrics/_base.py                     | 2 +-
 sklearn/metrics/_classification.py           | 2 +-
 sklearn/metrics/_dist_metrics.pyx            | 2 +-
 sklearn/mixture/_base.py                     | 2 +-
 sklearn/mixture/_gaussian_mixture.py         | 2 +-
 sklearn/preprocessing/tests/test_encoders.py | 4 ++--
 sklearn/tree/_export.py                      | 4 ++--
 sklearn/utils/graph.py                       | 2 +-
 sklearn/utils/metaestimators.py              | 2 +-
 17 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
index b5f6fd22f9c33..dab3c92d654bb 100644
--- a/sklearn/datasets/_base.py
+++ b/sklearn/datasets/_base.py
@@ -141,10 +141,10 @@ def load_files(
 
     Parameters
     ----------
-    container_path : str or unicode
+    container_path : str
         Path to the main folder holding one subfolder per category
 
-    description : str or unicode, default=None
+    description : str, default=None
         A paragraph describing the characteristic of the dataset: its source,
         reference, etc.
 
diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py
index 34a936e51cbb2..59ff356e90838 100644
--- a/sklearn/datasets/_california_housing.py
+++ b/sklearn/datasets/_california_housing.py
@@ -102,7 +102,7 @@ def fetch_california_housing(
             If ``as_frame`` is True, ``target`` is a pandas object.
         feature_names : list of length 8
             Array of ordered feature names used in the dataset.
-        DESCR : string
+        DESCR : str
             Description of the California housing dataset.
         frame : pandas DataFrame
             Only present when `as_frame=True`. DataFrame with ``data`` and
diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py
index fb7d603bfc0ff..0af8c8635bc85 100644
--- a/sklearn/datasets/_lfw.py
+++ b/sklearn/datasets/_lfw.py
@@ -301,7 +301,7 @@ def fetch_lfw_people(
         target : numpy array of shape (13233,)
             Labels associated to each face image.
             Those labels range from 0-5748 and correspond to the person IDs.
-        DESCR : string
+        DESCR : str
             Description of the Labeled Faces in the Wild (LFW) dataset.
 
     (data, target) : tuple if ``return_X_y`` is True
@@ -486,7 +486,7 @@ def fetch_lfw_pairs(
         target : numpy array of shape (2200,). Shape depends on ``subset``.
             Labels associated to each pair of images.
             The two label values being different persons or the same person.
-        DESCR : string
+        DESCR : str
             Description of the Labeled Faces in the Wild (LFW) dataset.
 
     """
diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py
index 48e258b4e8512..6a7d9dcc1936c 100644
--- a/sklearn/datasets/_svmlight_format_io.py
+++ b/sklearn/datasets/_svmlight_format_io.py
@@ -446,7 +446,7 @@ def dump_svmlight_file(
         integer or float, or array-like objects of integer or float for
         multilabel classifications.
 
-    f : string or file-like in binary mode
+    f : str or file-like in binary mode
         If string, specifies the path that will contain the data.
         If file-like, data will be written to f. f should be opened in binary
         mode.
@@ -455,7 +455,7 @@ def dump_svmlight_file(
         Whether column indices should be written zero-based (True) or one-based
         (False).
 
-    comment : string, default=None
+    comment : str, default=None
         Comment to insert at the top of the file. This should be either a
         Unicode string, which will be encoded as UTF-8, or an ASCII byte
         string.
@@ -478,7 +478,6 @@ def dump_svmlight_file(
         # Convert comment string to list of lines in UTF-8.
         # If a byte string is passed, then check whether it's ASCII;
         # if a user wants to get fancy, they'll have to decode themselves.
-        # Avoid mention of str and unicode types for Python 3.x compat.
         if isinstance(comment, bytes):
             comment.decode("ascii")  # just for the exception
         else:
diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py
index 24046367c69c6..ef0ce6b99a25e 100644
--- a/sklearn/datasets/_twenty_newsgroups.py
+++ b/sklearn/datasets/_twenty_newsgroups.py
@@ -184,7 +184,7 @@ def fetch_20newsgroups(
         Select the dataset to load: 'train' for the training set, 'test'
         for the test set, 'all' for both, with shuffled ordering.
 
-    categories : array-like, dtype=str or unicode, default=None
+    categories : array-like, dtype=str, default=None
         If None (default), load all the categories.
         If not None, list of category names to load (other categories
         ignored).
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 82582de8a5b60..02af3ff289ae2 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -156,7 +156,7 @@ def strip_accents_ascii(s):
 
     Parameters
     ----------
-    s : string
+    s : str
         The string to strip
 
     See Also
@@ -175,7 +175,7 @@ def strip_tags(s):
 
     Parameters
     ----------
-    s : string
+    s : str
         The string to strip
     """
     return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s)
@@ -204,7 +204,7 @@ def decode(self, doc):
 
         Parameters
         ----------
-        doc : str
+        doc : bytes or str
             The string to decode.
 
         Returns
@@ -620,7 +620,7 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):
         Remove accents and perform other character normalization
         during the preprocessing step.
         'ascii' is a fast method that only works on characters that have
-        an direct ASCII mapping.
+        a direct ASCII mapping.
         'unicode' is a slightly slower method that works on any characters.
         None (default) does nothing.
 
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index 8b5102ecdd403..841ed6a1c1cc4 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -66,7 +66,7 @@ def _deprecate_normalize(normalize, default, estimator_name):
     default : bool,
         default normalize value used by the estimator
 
-    estimator_name : string,
+    estimator_name : str
         name of the linear estimator which calls this function.
         The name will be used for writing the deprecation warnings
 
diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py
index 23ba7c77d85ac..3ae077f4331cc 100644
--- a/sklearn/linear_model/_stochastic_gradient.py
+++ b/sklearn/linear_model/_stochastic_gradient.py
@@ -392,7 +392,7 @@ def fit_binary(
     C : float
         Maximum step size for passive aggressive
 
-    learning_rate : string
+    learning_rate : str
         The learning rate. Accepted values are 'constant', 'optimal',
         'invscaling', 'pa1' and 'pa2'.
 
diff --git a/sklearn/metrics/_base.py b/sklearn/metrics/_base.py
index 5640848b1a9d4..dd0258f600ccc 100644
--- a/sklearn/metrics/_base.py
+++ b/sklearn/metrics/_base.py
@@ -32,7 +32,7 @@ def _average_binary_score(binary_metric, y_true, y_score, average, sample_weight
         Target scores, can either be probability estimates of the positive
         class, confidence values, or binary decisions.
 
-    average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
+    average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro'
         If ``None``, the scores for each class are returned. Otherwise,
         this determines the type of averaging performed on the data:
 
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 7237fa53fda25..b4316053c0f74 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2009,7 +2009,7 @@ def classification_report(
 
     Returns
     -------
-    report : string / dict
+    report : str or dict
         Text summary of the precision, recall, F1 score for each class.
         Dictionary returned if output_dict is True. Dictionary has the
         following structure::
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index c592c1d8c5d4a..c94cf597c0fac 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -234,7 +234,7 @@ cdef class DistanceMetric:
 
         Parameters
         ----------
-        metric : string or class name
+        metric : str or class name
             The distance metric to use
         **kwargs
             additional arguments will be passed to the requested metric
diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py
index d40903899c187..bbe9699859ded 100644
--- a/sklearn/mixture/_base.py
+++ b/sklearn/mixture/_base.py
@@ -28,7 +28,7 @@ def _check_shape(param, param_shape, name):
 
     param_shape : tuple
 
-    name : string
+    name : str
     """
     param = np.array(param)
     if param.shape != param_shape:
diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py
index 850adfdd6d47f..f4bb194e1e33d 100644
--- a/sklearn/mixture/_gaussian_mixture.py
+++ b/sklearn/mixture/_gaussian_mixture.py
@@ -108,7 +108,7 @@ def _check_precisions(precisions, covariance_type, n_components, n_features):
         'diag' : shape of (n_components, n_features)
         'spherical' : shape of (n_components,)
 
-    covariance_type : string
+    covariance_type : str
 
     n_components : int
         Number of components.
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 0429dc00c2322..dcc07d25af5fd 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -800,8 +800,8 @@ def test_encoder_dtypes():
     for X in [
         np.array([[1, 2], [3, 4]], dtype="int64"),
         np.array([[1, 2], [3, 4]], dtype="float64"),
-        np.array([["a", "b"], ["c", "d"]]),  # unicode dtype
-        np.array([[b"a", b"b"], [b"c", b"d"]]),  # string dtype
+        np.array([["a", "b"], ["c", "d"]]),  # str dtype
+        np.array([[b"a", b"b"], [b"c", b"d"]]),  # bytes dtype
         np.array([[1, "a"], [3, "b"]], dtype="object"),
     ]:
         enc.fit(X)
diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py
index 18f98d36871b9..dc50ee70f05f0 100644
--- a/sklearn/tree/_export.py
+++ b/sklearn/tree/_export.py
@@ -839,7 +839,7 @@ def export_graphviz(
 
     Returns
     -------
-    dot_data : string
+    dot_data : str
         String representation of the input tree in GraphViz dot format.
         Only returned if ``out_file`` is None.
 
@@ -961,7 +961,7 @@ def export_text(
 
     Returns
     -------
-    report : string
+    report : str
         Text summary of all the rules in the decision tree.
 
     Examples
diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py
index 8eacb17e628c3..020227ba001a9 100644
--- a/sklearn/utils/graph.py
+++ b/sklearn/utils/graph.py
@@ -92,7 +92,7 @@ def graph_shortest_path(dist_matrix, directed=True, method="auto"):
         if False, then find the shortest path on an undirected graph: the
         algorithm can progress from a point to its neighbors and vice versa.
 
-    method : string ['auto'|'FW'|'D']
+    method : {'auto', 'FW', 'D'}, default='auto'
         method to use.  Options are
         'auto' : attempt to choose the best method for the current problem
         'FW' : Floyd-Warshall algorithm.  O[N^3]
diff --git a/sklearn/utils/metaestimators.py b/sklearn/utils/metaestimators.py
index bd43eeba2a3dd..5d71d28c5ffab 100644
--- a/sklearn/utils/metaestimators.py
+++ b/sklearn/utils/metaestimators.py
@@ -205,7 +205,7 @@ def if_delegate_has_method(delegate):
 
     Parameters
     ----------
-    delegate : string, list of strings or tuple of strings
+    delegate : str, list of str or tuple of str
         Name of the sub-estimator that can be accessed as an attribute of the
         base object. If a list or a tuple of names are provided, the first
         sub-estimator that is an attribute of the base object will be used.

From 46a6cf29ab019afa7cf3c815cb206fa822f0ee0a Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 8 Oct 2021 10:16:50 -0400
Subject: [PATCH 17/18] Re-introduce 'surrogate' for the wording and adapt
 docstrings accordingly

Co-authored-by: Roman Yurchak <rth.yurchak@pm.me>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
---
 sklearn/metrics/_dist_metrics.pyx | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index c94cf597c0fac..2698d5dea8769 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -296,8 +296,7 @@ cdef class DistanceMetric:
 
         The rank-preserving surrogate distance is any measure that yields the same
         rank as the distance, but is more efficient to compute. For example, for the
-        Euclidean metric, the rank-preserving surrogate distance is the
-        squared-euclidean distance.
+        Euclidean metric, the surrogate distance is the squared-euclidean distance.
         """
         return self.dist(x1, x2, size)
 
@@ -322,25 +321,24 @@ cdef class DistanceMetric:
         return 0
 
     cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
-        """Convert the ranking-preserving distance to the distance"""
+        """Convert the rank-preserving surrogate distance to the distance"""
         return rdist
 
     cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
-        """Convert the distance to the ranking-preserving distance"""
+        """Convert the distance to the rank-preserving surrogate distance"""
         return dist
 
     def rdist_to_dist(self, rdist):
-        """Convert the ranking-preserving distance to the true distance.
+        """Convert the ranking-preserving surrogate distance to the distance.
 
-        The rank-preserving surrogate distance is any measure that yields the same
-        rank as the distance, but is more efficient to compute. For example, for the
-        Euclidean metric, the rank-preserving surrogate distance is the
-        squared-euclidean distance.
+        The surrogate distance is any measure that yields the same rank as the
+        distance, but is more efficient to compute. For example, for the
+        Euclidean metric, the surrogate distance is the squared-euclidean distance.
 
         Parameters
         ----------
         rdist : double
-            Ranking-preserving distance.
+            Surrogate distance.
 
         Returns
         -------
@@ -352,10 +350,9 @@ cdef class DistanceMetric:
     def dist_to_rdist(self, dist):
         """Convert the true distance to the rank-preserving surrogate distance.
 
-        The rank-preserving surrogate distance is any measure that yields the same
-        rank as the distance, but is more efficient to compute. For example, for the
-        Euclidean metric, the rank-preserving surrogate distance is the
-        squared-euclidean distance.
+        The surrogate distance is any measure that yields the same rank as the
+        distance, but is more efficient to compute. For example, for the
+        Euclidean metric, the surrogate distance is the squared-euclidean distance.
 
         Parameters
         ----------
@@ -365,7 +362,7 @@ cdef class DistanceMetric:
         Returns
         -------
         double
-            Ranking-preserving distance.
+            Surrogate distance.
         """
         return dist
 

From f00c134d770584fb79f9ecd5fbc2a78840e8f044 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 8 Oct 2021 10:56:26 -0400
Subject: [PATCH 18/18] Re-word even more for "rank-preserving surrogate"

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Roman Yurchak <rth.yurchak@pm.me>
Co-authored-by: Roman Yurchak <rth.yurchak@gmail.com>
---
 sklearn/metrics/_dist_metrics.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
index 2698d5dea8769..a8fb4c45ddd0c 100644
--- a/sklearn/metrics/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -290,7 +290,7 @@ cdef class DistanceMetric:
 
     cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
                        ITYPE_t size) nogil except -1:
-        """Compute the ranking-preserving distance between vectors x1 and x2.
+        """Compute the rank-preserving surrogate distance between vectors x1 and x2.
 
         This can optionally be overridden in a base class.
 
@@ -329,7 +329,7 @@ cdef class DistanceMetric:
         return dist
 
     def rdist_to_dist(self, rdist):
-        """Convert the ranking-preserving surrogate distance to the distance.
+        """Convert the rank-preserving surrogate distance to the distance.
 
         The surrogate distance is any measure that yields the same rank as the
         distance, but is more efficient to compute. For example, for the